### **Installing the Packages**

In [6]:
!pip install reportlab



In [7]:
!pip install user_agents



### **Importing the Packages**

In [2]:
# Importing modules
import tkinter as tk
from tkinter import filedialog
from tkinter import ttk
from tkinter import *
import advertools as adv
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import pyarrow
from ipywidgets import interact
from user_agents import parse
from ua_parser import user_agent_parser
import os
import os.path
from pathlib import Path
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

### **Defining the Functions**

In [3]:
# Defining a function to ask for a file
def ask_file():
    """Let the user pick a log file and reflect the choice in the UI.

    Opens a file-selection dialog. When a path is chosen it is shown in
    ``file_label`` and ``generate_button`` is enabled; when the dialog is
    dismissed the label is reset and the button is disabled.
    """
    chosen_path = filedialog.askopenfilename(
        title="Select a log file",
        filetypes=(("log files", "*.log"), ("all files", "*.*")),
    )

    # Guard clause: nothing was selected, so conversion must stay unavailable.
    if not chosen_path:
        file_label.config(text="No file selected")
        generate_button.config(state=DISABLED)
        return

    file_label.config(text=chosen_path)
    generate_button.config(state=NORMAL)


In [4]:
# Converting the log file into parquet file
def convert_file(log_file=None):
    """Convert a combined-format access log into 'output_file.parquet'.

    Parameters
    ----------
    log_file : str, optional
        Path of the log file to convert. When omitted (as it is when the
        function is used as a button callback) the path chosen through
        ask_file() is read back from ``file_label``. If that text is not an
        existing file, the historical default 'log_file.log' is used.

    Lines that cannot be parsed are written by advertools to
    'errors_file.txt'.
    """
    if log_file is None:
        # ask_file() stores the selected path as the label text, so the label
        # is the single source of truth for "the selected file".
        selected = file_label.cget("text")
        log_file = selected if os.path.isfile(selected) else 'log_file.log'

    # Remove the results of a previous run so the conversion starts clean.
    for stale_path in ('output_file.parquet', 'errors_file.txt'):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    adv.logs_to_df(
        log_file=log_file,
        output_file='output_file.parquet',
        errors_file='errors_file.txt',
        log_format='combined')

In [5]:
# Defining the function to clear the canvas
def clear_canvas():
    """Remove the chart that is currently displayed, if any.

    The module-level ``w`` (Tk widget of the embedded chart) and ``fig``
    (its matplotlib figure) are released and reset to ``None``. Each of the
    two is handled on its own, so a half-initialised state is cleaned up too.
    Calling the function when nothing is displayed is a no-op.
    """
    global w, fig
    if w is not None:
        # destroy() frees the Tk widget; pack_forget() would only hide it and
        # leave one orphaned widget behind per redraw.
        w.destroy()
        w = None
    if fig is not None:
        # Figures created through pyplot stay registered (and in memory)
        # until they are closed explicitly.
        plt.close(fig)
        fig = None


In [6]:
# Defining the function to plot the top 10 visitors
def plot_top_visitors():
    """Show a bar chart of the ten client addresses with the most requests.

    Reads 'output_file.parquet', replaces whatever chart is currently shown
    and stores the new Tk widget and figure in the module-level ``w`` and
    ``fig`` so that clear_canvas() can remove them later.
    """
    global w, fig
    clear_canvas()

    # Only the 'client' column is used; not loading the others keeps large
    # logs cheap to read.
    clients = pd.read_parquet('output_file.parquet', columns=['client'])['client']
    top_visitors = clients.value_counts().head(10)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(top_visitors.index, top_visitors.values)
    ax.set_xlabel("IP Address")
    ax.set_ylabel("Number of visits")
    ax.set_title("Top 10 Visitors")
    ax.tick_params(axis='x', labelrotation=40)
    fig.tight_layout()

    # Embed the figure in the Tk canvas of the "Generated Analysis" frame.
    mpl_canvas = FigureCanvasTkAgg(fig, master=output_canvas)
    output_canvas.update()
    mpl_canvas.draw()
    w = mpl_canvas.get_tk_widget()
    w.pack(side=tk.TOP, fill=tk.BOTH, expand=True)

In [7]:
# Defining the function to plot the top 10 most frequent requests
def plot_top_requests():
    """Show a horizontal bar chart of the ten most frequently requested URLs.

    Reads 'output_file.parquet', replaces whatever chart is currently shown
    and stores the new Tk widget and figure in the module-level ``w`` and
    ``fig`` so that clear_canvas() can remove them later.
    """
    global w, fig
    clear_canvas()

    # Only the 'request' column is used; not loading the others keeps large
    # logs cheap to read.
    requests = pd.read_parquet('output_file.parquet', columns=['request'])['request']
    top_requests = requests.value_counts().head(10)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(top_requests.index, top_requests.values, color='red')
    ax.set_xlabel("Number of requests")
    ax.set_ylabel("Requested URL")
    ax.set_title("Top 10 Most Frequent Requests")
    fig.tight_layout()

    # Embed the figure in the Tk canvas of the "Generated Analysis" frame.
    mpl_canvas = FigureCanvasTkAgg(fig, master=output_canvas)
    output_canvas.update()
    mpl_canvas.draw()
    w = mpl_canvas.get_tk_widget()
    w.pack(side=tk.TOP, fill=tk.BOTH, expand=True)

In [8]:
# Defining the function to plot the distribution of status codes
def plot_status_codes():
    """Show a pie chart of how requests are distributed over HTTP status codes.

    Wedges are drawn in status-code order; the legend lists every code with
    its percentage share, ordered from most to least frequent. Reads
    'output_file.parquet', replaces whatever chart is currently shown and
    stores the new Tk widget and figure in the module-level ``w`` and
    ``fig`` so that clear_canvas() can remove them later.
    """
    global w, fig
    clear_canvas()

    # Only the 'status' column is used; not loading the others keeps large
    # logs cheap to read.
    statuses = pd.read_parquet('output_file.parquet', columns=['status'])['status']
    status_counts = statuses.groupby(statuses).size()
    shares = 100. * status_counts / status_counts.sum()

    fig, ax = plt.subplots(figsize=(8, 6))
    wedges, _ = ax.pie(status_counts.values, startangle=90, radius=1.2)

    # Pair each wedge with its label and count, then order the legend by
    # count (largest first). Building the list explicitly also copes with an
    # empty log, where unpacking zip(*sorted(...)) would fail.
    legend_entries = [
        (wedge, '{0} - {1:1.2f} %'.format(code, share), count)
        for wedge, code, share, count in zip(
            wedges, status_counts.index, shares, status_counts.values)
    ]
    legend_entries.sort(key=lambda entry: entry[2], reverse=True)
    ax.legend(
        [entry[0] for entry in legend_entries],
        [entry[1] for entry in legend_entries],
        loc='upper center', bbox_to_anchor=(-0.3, 1), fontsize=8)
    ax.set_title('Distribution of Status Codes')
    fig.tight_layout()

    # Embed the figure in the Tk canvas of the "Generated Analysis" frame.
    mpl_canvas = FigureCanvasTkAgg(fig, master=output_canvas)
    output_canvas.update()
    mpl_canvas.draw()
    w = mpl_canvas.get_tk_widget()
    w.pack(side=tk.TOP, fill=tk.BOTH, expand=True)

In [9]:
# Defining the function to plot the hourly and weekly averages
def plot_hourly_weekly():
    """Show the average request volume per hour of day and per weekday.

    Left panel: for each hour of the day (0-23, UTC), the mean number of
    requests received in that hour across the calendar hours present in the
    log. Right panel: for each weekday, the mean number of requests per
    calendar day. Hours and days that do not occur in the log at all are not
    counted as zero.

    Reads 'output_file.parquet', replaces whatever chart is currently shown
    and stores the new Tk widget and figure in the module-level ``w`` and
    ``fig`` so that clear_canvas() can remove them later.
    """
    global w, fig
    clear_canvas()

    # Only the 'datetime' column is used; not loading the others keeps large
    # logs cheap to read.
    df = pd.read_parquet('output_file.parquet', columns=['datetime'])
    # One call both parses the Apache timestamp and normalises it to UTC,
    # which also copes with logs that mix several UTC offsets.
    timestamps = pd.to_datetime(
        df['datetime'], format='%d/%b/%Y:%H:%M:%S %z', utc=True)

    # Count requests per calendar hour, then average those counts by hour of
    # day. Plotting the raw (weekday, hour) counts would draw one line that
    # sweeps 0-23 once per weekday and would show totals, not averages.
    per_calendar_hour = timestamps.dt.floor('h').value_counts()
    hourly_avg = per_calendar_hour.groupby(per_calendar_hour.index.hour).mean()

    # Same idea per calendar day, averaged by weekday (0 = Monday).
    per_calendar_day = timestamps.dt.floor('D').value_counts()
    weekly_avg = per_calendar_day.groupby(per_calendar_day.index.dayofweek).mean()

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))
    ax1.plot(hourly_avg.index, hourly_avg.values)
    ax1.set_xlabel("Hour")
    ax1.set_ylabel("Average Requests")
    ax1.set_title("Hourly Average")
    ax2.plot(weekly_avg.index, weekly_avg.values)
    ax2.set_xticks(range(7))
    ax2.set_xticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
    ax2.set_xlabel("Day")
    ax2.set_ylabel("Average Requests")
    ax2.set_title("Weekly Average")
    fig.tight_layout()

    # Embed the figure in the Tk canvas of the "Generated Analysis" frame.
    mpl_canvas = FigureCanvasTkAgg(fig, master=output_canvas)
    output_canvas.update()
    mpl_canvas.draw()
    w = mpl_canvas.get_tk_widget()
    w.pack(side=tk.TOP, fill=tk.BOTH, expand=True)


In [10]:
# Defining the function to plot the browser hits
def extract_browser(user_agent):
    """Map a raw User-Agent string to a coarse browser name.

    Parameters
    ----------
    user_agent : str
        The User-Agent header value. Anything that is not a string (for
        example a missing value) is reported as 'Other'.

    Returns
    -------
    str
        One of 'Chrome', 'Firefox', 'Safari', 'Edge', 'Opera', 'Brave',
        'Internet Explorer' or 'Other'.
    """
    if not isinstance(user_agent, str):
        return 'Other'

    browser_family = parse(user_agent).browser.family.lower()

    # Checked in order; the first keyword found in the family name wins.
    keyword_to_browser = (
        ('chrome', 'Chrome'),
        ('firefox', 'Firefox'),
        ('safari', 'Safari'),
        ('edge', 'Edge'),
        ('opera', 'Opera'),
        ('brave', 'Brave'),
    )
    for keyword, browser_name in keyword_to_browser:
        if keyword in browser_family:
            return browser_name

    # The parser reports Internet Explorer under the family name "IE" (and
    # variants such as "IE Mobile"), so match that exactly; a plain substring
    # test for "ie" would also hit unrelated names.
    is_internet_explorer = (
        browser_family == 'ie'
        or browser_family.startswith('ie ')
        or 'internet explorer' in browser_family
        or 'msie' in browser_family
    )
    if is_internet_explorer:
        return 'Internet Explorer'
    return 'Other'
def plot_top_browsers():
    """Show a bar chart of the number of requests per browser.

    Browsers are derived from the User-Agent of every request via
    extract_browser(); requests without a User-Agent count as 'Other'.
    Reads 'output_file.parquet', replaces whatever chart is currently shown
    and stores the new Tk widget and figure in the module-level ``w`` and
    ``fig`` so that clear_canvas() can remove them later.
    """
    global w, fig
    clear_canvas()

    # Only the 'user_agent' column is used; not loading the others keeps
    # large logs cheap to read.
    agents = pd.read_parquet('output_file.parquet', columns=['user_agent'])['user_agent']

    # Parsing a User-Agent is expensive and logs repeat the same strings many
    # times, so classify each distinct value once and map the result back.
    browser_by_agent = {
        agent: extract_browser(agent) for agent in agents.dropna().unique()
    }
    browsers = agents.map(browser_by_agent).fillna('Other')

    browser_hits = browsers.groupby(browsers).size().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(browser_hits.index, browser_hits.values)
    ax.set_xlabel("Browser Name")
    ax.set_ylabel("Number of visits")
    ax.set_title("Hits by Different Browsers")
    fig.tight_layout()

    # Embed in output_canvas like every other chart; using data_frame as the
    # master would pack the chart next to the canvas instead of inside it.
    mpl_canvas = FigureCanvasTkAgg(fig, master=output_canvas)
    output_canvas.update()
    mpl_canvas.draw()
    w = mpl_canvas.get_tk_widget()
    w.pack(side=tk.TOP, fill=tk.BOTH, expand=True)

In [11]:
# Defining the function to plot the hourly hit of top 10 sites
def extract_site(referer):
    """Return the site (host part) a request was referred from.

    Parameters
    ----------
    referer : str or missing value
        The Referer field of a log entry.

    Returns
    -------
    str
        'Direct' when there is no referer (missing value, empty string, or
        the "-" placeholder that Apache logs when the header is absent),
        the host part for values of the form ``scheme://host/...`` or
        ``//host/...``, and 'Unknown' for anything else.
    """
    if pd.isna(referer):
        return 'Direct'

    referer = str(referer).strip()
    if referer in ('', '-'):
        return 'Direct'

    if '://' in referer:
        remainder = referer.split('://', 1)[1]
    elif referer.startswith('//'):
        # Protocol-relative URL.
        remainder = referer[2:]
    else:
        # No authority component; splitting on "/" here would return a path
        # segment and mislabel it as a host.
        return 'Unknown'

    host = remainder.split('/', 1)[0]
    return host or 'Unknown'
def plot_hourly_hit():
    """Show a bar chart of the ten hours of the day (UTC) with the most hits.

    Reads 'output_file.parquet', replaces whatever chart is currently shown
    and stores the new Tk widget and figure in the module-level ``w`` and
    ``fig`` so that clear_canvas() can remove them later.
    """
    global w, fig
    clear_canvas()

    # The chart is keyed by hour only, so just the 'datetime' column is
    # loaded and no per-row referer/site extraction is performed.
    # NOTE(review): the button that triggers this is labelled "Hourly Hits of
    # Top Sites", but the chart aggregates all requests by hour - confirm
    # whether a per-site breakdown was intended.
    df = pd.read_parquet('output_file.parquet', columns=['datetime'])
    # One call both parses the Apache timestamp and normalises it to UTC,
    # which also copes with logs that mix several UTC offsets.
    timestamps = pd.to_datetime(
        df['datetime'], format='%d/%b/%Y:%H:%M:%S %z', utc=True)

    # value_counts() orders by frequency, so head(10) is the ten busiest hours.
    busiest_hours = timestamps.dt.hour.value_counts().head(10)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(busiest_hours.index, busiest_hours.values)
    ax.set_ylabel("Number of Hits")
    ax.set_xlabel("Hour")
    ax.set_title("Hourly Hits")
    fig.tight_layout()

    # Embed the figure in the Tk canvas of the "Generated Analysis" frame.
    mpl_canvas = FigureCanvasTkAgg(fig, master=output_canvas)
    output_canvas.update()
    mpl_canvas.draw()
    w = mpl_canvas.get_tk_widget()
    w.pack(side=tk.TOP, fill=tk.BOTH, expand=True)

## Creating the Tkinter Window

In [None]:
# Module-level handles for the chart that is currently displayed: `w` is the
# Tk widget and `fig` the matplotlib figure. The plot functions set them and
# clear_canvas() resets them. (No `global` statement is needed at module level.)
w = None
fig = None

# Creating root window
root = tk.Tk()
root.title("Web Log Analysis Generator")
root.geometry("300x400")
try:
    root.state("zoomed")
except tk.TclError:
    # The "zoomed" state is not accepted on X11; the window-manager attribute
    # is the equivalent there.
    root.attributes("-zoomed", True)
title_label = tk.Label(
    root, text="Apache Web Log Analysis", font=("Calibri", 50, "bold"),
    bd=10, relief=tk.GROOVE, bg="#35455D", fg="white")
title_label.pack(side='top', fill='x')

# Creating a label for instructions
instruction_label = tk.Label(
    root, text="Please select a log file to generate a report:",
    font=("Calibri", 12))
instruction_label.place(x=0, y=110)

# Creating a button for asking file
ask_button = tk.Button(root, text="Select File", command=ask_file)
ask_button.place(x=300, y=110)

# Creating a label for displaying file name
file_label = tk.Label(root, text="No file selected", font=("Calibri", 12))
file_label.place(x=380, y=110)

# Creating the frames
# The font must be a (family, size) tuple; the single string "Calibri,20"
# would be taken as a font family name.
option_frame = tk.LabelFrame(
    root, text="Choose an option", font=("Calibri", 20), bd=12, relief=tk.GROOVE)
option_frame.place(x=2, y=140, width=350, height=580)

data_frame = tk.LabelFrame(
    root, text="Generated Analysis", font=("Calibri", 20), bd=12, relief=tk.GROOVE)
data_frame.place(x=350, y=140, width=930, height=580)

# Creating buttons
# The convert button stays disabled until ask_file() reports a selection.
generate_button = tk.Button(
    option_frame, text="Convert the Selected File", command=convert_file,
    state=tk.DISABLED, font=("arial", 12))
generate_button.grid(row=1, column=2, padx=40, pady=20)

visitor = tk.Button(
    option_frame, text="Top 10 Visitors", font=("arial", 12, "bold"),
    width=25, pady=5, command=plot_top_visitors)
visitor.grid(row=2, column=2, padx=40, pady=10)

frequent_requests = tk.Button(
    option_frame, text="Top 10 Most Frequent Requests", font=("arial", 12, "bold"),
    width=25, pady=5, command=plot_top_requests)
frequent_requests.grid(row=3, column=2, padx=40, pady=20)

average = tk.Button(
    option_frame, text="Hourly and Weekly Average", font=("arial", 12, "bold"),
    width=25, pady=5, command=plot_hourly_weekly)
average.grid(row=4, column=2, padx=40, pady=20)

browser_hit = tk.Button(
    option_frame, text="Browser Hits", font=("arial", 12, "bold"),
    width=25, pady=5, command=plot_top_browsers)
browser_hit.grid(row=5, column=2, padx=40, pady=20)

site_hit = tk.Button(
    option_frame, text="Hourly Hits of Top Sites", font=("arial", 12, "bold"),
    width=25, pady=5, command=plot_hourly_hit)
site_hit.grid(row=6, column=2, padx=40, pady=20)

status_code = tk.Button(
    option_frame, text="Distribution of Status Code", font=("arial", 12, "bold"),
    width=25, pady=5, command=plot_status_codes)
status_code.grid(row=7, column=2, padx=40, pady=20)

# Canvas that hosts the embedded matplotlib charts
output_canvas = tk.Canvas(data_frame)
output_canvas.pack(fill=tk.BOTH, expand=True)

# Starting the Tk main loop (blocks until the window is closed)
root.mainloop()

Parsed       2,425,942 lines.
