In [18]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import re
import json
import ijson

from scipy.stats import shapiro
from scipy import stats
import math
from scipy.stats import ttest_ind
from sklearn import preprocessing

# Load csv dataset

In [10]:
df = pd.read_csv("client_hostname.csv")
df.head(20)

Unnamed: 0,client,hostname,alias_list,address_list
0,5.123.144.95,5.123.144.95,[Errno 1] Unknown host,
1,5.122.76.187,5.122.76.187,[Errno 1] Unknown host,
2,5.215.249.99,5.215.249.99,[Errno 1] Unknown host,
3,31.56.102.211,31-56-102-211.shatel.ir,['211.102.56.31.in-addr.arpa'],['31.56.102.211']
4,5.123.166.223,5.123.166.223,[Errno 1] Unknown host,
5,5.160.26.98,5.160.26.98,[Errno 1] Unknown host,
6,5.127.147.132,5.127.147.132,[Errno 1] Unknown host,
7,158.58.30.218,158.58.30.218,[Errno 1] Unknown host,
8,86.55.230.86,86.55.230.86,[Errno 1] Unknown host,
9,89.35.65.186,89.35.65.186,[Errno 1] Unknown host,


# Descriptive statistics

### Building Summary of train data

In [6]:
df.describe()

Unnamed: 0,client,hostname,alias_list,address_list
count,258445,258441,258445,5819
unique,258445,258273,5820,5819
top,5.123.144.95,int0.client.access.fanaptelecom.net,[Errno 1] Unknown host,['31.56.102.211']
freq,1,101,252626,1


In [14]:
# Reading the access.log file
with open('access.log', 'r') as file:
    log_entries = file.readlines()

# List to store the parsed log entries
parsed_entries = []

# Iterate over each log entry
for log_entry in log_entries:
    # Split the log entry by space
    split_entry = log_entry.split()

    # Ensure the split_entry has enough elements
    if len(split_entry) >= 14:
        # Extracting columns
        ip_address = split_entry[0]        
        timestamp = split_entry[3][1:] + ' ' + split_entry[4][:-1]        
        http_method = split_entry[5][1:]      
        request_url = split_entry[6]  
        http_protocol = split_entry[7][:-1]      
        response_code = split_entry[8]     
        response_size = split_entry[9]      
        user_agent = split_entry[11][1:-1]
        referrer = split_entry[13][1:-1]

        # Creating a dictionary for the log entry
        entry_dict = {
            'IP Address': ip_address,
            'Timestamp': timestamp,
            'HTTP Method': http_method,
            'Request URL': request_url,
            'HTTP Protocol': http_protocol,
            'Response Code': response_code,
            'Response Size': response_size,
            'User-Agent': user_agent,
            'Referrer Information': referrer
        }

        # Appending the dictionary to the list
        parsed_entries.append(entry_dict)

# the parsed entries to access.json file
with open('access.json', 'w') as file:
    json.dump(parsed_entries, file, indent=4)

In [19]:
def read_json_chunks(file_path, chunk_size):
    with open(file_path, 'r') as file:
        objects = ijson.items(file, 'item')
        chunk = []

        for i, obj in enumerate(objects):
            chunk.append(obj)
            if i + 1 == chunk_size:
                # When the number of objects in the chunk list reaches the specified chunk_size, 
                # the yield statement is used to yield the chunk.
                # This allows the caller to process the chunk while the function continues to read and yield subsequent chunks.
                yield chunk
                chunk = []

        if chunk:
            yield chunk

In [20]:
chunk_size = 10000
file_path = 'access.json'

access_df = pd.DataFrame()

for chunk in read_json_chunks(file_path, chunk_size):
    chunk_df = pd.DataFrame(chunk)
    access_df = pd.concat([access_df, chunk_df], ignore_index=True)

access_df

Unnamed: 0,IP Address,Timestamp,HTTP Method,Request URL,HTTP Protocol,Response Code,Response Size,User-Agent,Referrer Information
0,54.36.149.41,22/Jan/2019:03:56:14 +0330,GET,/filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C...,HTTP/1.1,200,30577,Mozilla/5.,hrefsBot/6.1
1,31.56.96.51,22/Jan/2019:03:56:16 +0330,GET,/image/60844/productModel/200x200,HTTP/1.1,200,5667,Mozilla/5.,ndroi
2,31.56.96.51,22/Jan/2019:03:56:16 +0330,GET,/image/61474/productModel/200x200,HTTP/1.1,200,5379,Mozilla/5.,ndroi
3,40.77.167.129,22/Jan/2019:03:56:17 +0330,GET,/image/14925/productModel/100x100,HTTP/1.1,200,1696,Mozilla/5.,ingbot/2.0
4,91.99.72.15,22/Jan/2019:03:56:17 +0330,GET,/product/31893/62100/%D8%B3%D8%B4%D9%88%D8%A7%...,HTTP/1.1,200,41483,Mozilla/5.,
...,...,...,...,...,...,...,...,...,...
10186122,188.229.21.56,26/Jan/2019:20:29:13 +0330,GET,/content/view/shoppingRules,HTTP/1.1,302,0,Mozilla/5.,ndroi
10186123,5.127.220.71,26/Jan/2019:20:29:13 +0330,GET,/apple-touch-icon-120x120.png,HTTP/1.1,404,32420,MobileSafari/604.,arwin/18.2.0
10186124,5.213.7.50,26/Jan/2019:20:29:13 +0330,GET,/m/product/18962/%D8%BA%D8%B0%D8%A7-%D8%B3%D8%...,HTTP/1.1,200,20959,Mozilla/5.,P
10186125,109.125.169.52,26/Jan/2019:20:29:13 +0330,GET,/image/%7B%7BbasketItem.id%7D%7D?type=productM...,HTTP/1.1,200,5,Mozilla/5.,


In [47]:
access_df['IP Address'] = access_df['IP Address'].astype('category')
access_df['Timestamp'] = pd.to_datetime(access_df['Timestamp'], format='%d/%b/%Y:%H:%M:%S %z')
access_df['HTTP Method'] = access_df['HTTP Method'].astype('category')
access_df['Response Code'] = pd.to_numeric(access_df['Response Code'], errors='coerce').astype('Int32')
access_df['Response Size'] = pd.to_numeric(access_df['Response Size'], errors='coerce').astype('Int32')
access_df['User-Agent'] = access_df['User-Agent'].astype('category')

col_names = ['IP Address','Timestamp','HTTP Method','Response Code','Response Size','User-Agent']
logs_df = access_df[col_names]
logs_df

Unnamed: 0,IP Address,Timestamp,HTTP Method,Response Code,Response Size,User-Agent
0,54.36.149.41,2019-01-22 03:56:14+03:30,GET,200,30577,Mozilla/5.
1,31.56.96.51,2019-01-22 03:56:16+03:30,GET,200,5667,Mozilla/5.
2,31.56.96.51,2019-01-22 03:56:16+03:30,GET,200,5379,Mozilla/5.
3,40.77.167.129,2019-01-22 03:56:17+03:30,GET,200,1696,Mozilla/5.
4,91.99.72.15,2019-01-22 03:56:17+03:30,GET,200,41483,Mozilla/5.
...,...,...,...,...,...,...
10186122,188.229.21.56,2019-01-26 20:29:13+03:30,GET,302,0,Mozilla/5.
10186123,5.127.220.71,2019-01-26 20:29:13+03:30,GET,404,32420,MobileSafari/604.
10186124,5.213.7.50,2019-01-26 20:29:13+03:30,GET,200,20959,Mozilla/5.
10186125,109.125.169.52,2019-01-26 20:29:13+03:30,GET,200,5,Mozilla/5.


### Relationships between features

# Exploratory Data Analysis (EDA)

### Identify the variable and their types

In [26]:
df_types = {i: set(map(type, df[i])) for i in df.columns} 
print(df_types)

access_df_types = {i: set(map(type, access_df[i])) for i in access_df.columns} 
access_df_types

{'client': {<class 'str'>}, 'hostname': {<class 'str'>, <class 'float'>}, 'alias_list': {<class 'str'>}, 'address_list': {<class 'str'>, <class 'float'>}}


{'IP Address': {str},
 'Timestamp': {str},
 'HTTP Method': {str},
 'Request URL': {str},
 'HTTP Protocol': {str},
 'Response Code': {str},
 'Response Size': {str},
 'User-Agent': {str},
 'Referrer Information': {str}}

In [36]:
access_df.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10186127 entries, 0 to 10186126
Data columns (total 9 columns):
 #   Column                Non-Null Count     Dtype                                
---  ------                --------------     -----                                
 0   IP Address            10186127 non-null  category                             
 1   Timestamp             10186127 non-null  datetime64[ns, pytz.FixedOffset(210)]
 2   HTTP Method           10186127 non-null  category                             
 3   Request URL           10186127 non-null  object                               
 4   HTTP Protocol         10186127 non-null  object                               
 5   Response Code         10186127 non-null  object                               
 6   Response Size         10186127 non-null  object                               
 7   User-Agent            10186127 non-null  object                               
 8   Referrer Information  10186127 non-null  

### Clean data (error, remove duplicates, missing values, Outliers)


In [40]:
missing_values = access_df.isnull().sum().sort_values(ascending = False)
percentage_missing_values = missing_values/len(access_df)*100
check_missing_values = pd.concat([missing_values, percentage_missing_values], axis = 1, keys= ['Missing values', '% Missing'])
print(check_missing_values)
print()

                      Missing values  % Missing
Response Code                    211   0.002071
Response Size                    211   0.002071
IP Address                         0   0.000000
Timestamp                          0   0.000000
HTTP Method                        0   0.000000
Request URL                        0   0.000000
HTTP Protocol                      0   0.000000
User-Agent                         0   0.000000
Referrer Information               0   0.000000



In [54]:
m = access_df["Response Code"].mean()
m = int(round(m))  # Round the mean value to the nearest integer
m1 = access_df["Response Size"].mean()
m1 = int(round(m1))  # Round the mean value to the nearest integer
access_df["Response Code"].fillna(m, inplace=True)
access_df["Response Size"].fillna(m1, inplace=True)
access_df.isnull().sum()

IP Address              0
Timestamp               0
HTTP Method             0
Request URL             0
HTTP Protocol           0
Response Code           0
Response Size           0
User-Agent              0
Referrer Information    0
dtype: int64

### Transformation (Standardization, Normalization, encoding categorical to numerical)

### Data Visualization