<a href="https://colab.research.google.com/github/Boudia27/Projects/blob/main/project_5_Web_Server_Access_Logs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import collections
from sklearn.preprocessing import StandardScaler
from scipy.stats import chi2_contingency
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [None]:
# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/Data Science Projects project/project_4/weblog.csv')
df.head()

Unnamed: 0,client,hostname,alias_list,address_list
0,5.123.144.95,5.123.144.95,[Errno 1] Unknown host,
1,5.122.76.187,5.122.76.187,[Errno 1] Unknown host,
2,5.215.249.99,5.215.249.99,[Errno 1] Unknown host,
3,31.56.102.211,31-56-102-211.shatel.ir,['211.102.56.31.in-addr.arpa'],['31.56.102.211']
4,5.123.166.223,5.123.166.223,[Errno 1] Unknown host,


In [None]:
print(df.columns)

Index(['client', 'hostname', 'alias_list', 'address_list'], dtype='object')


In [None]:
print(df.head())

          client                 hostname                      alias_list  \
0   5.123.144.95             5.123.144.95          [Errno 1] Unknown host   
1   5.122.76.187             5.122.76.187          [Errno 1] Unknown host   
2   5.215.249.99             5.215.249.99          [Errno 1] Unknown host   
3  31.56.102.211  31-56-102-211.shatel.ir  ['211.102.56.31.in-addr.arpa']   
4  5.123.166.223            5.123.166.223          [Errno 1] Unknown host   

        address_list  
0                NaN  
1                NaN  
2                NaN  
3  ['31.56.102.211']  
4                NaN  


In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27093 entries, 0 to 27092
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   client        27093 non-null  object
 1   hostname      27089 non-null  object
 2   alias_list    27092 non-null  object
 3   address_list  2412 non-null   object
dtypes: object(4)
memory usage: 846.8+ KB
None


In [None]:
print(df.describe())

              client                             hostname  \
count          27093                                27089   
unique         27093                                27029   
top     5.123.144.95  int0.client.access.fanaptelecom.net   
freq               1                                   42   

                    alias_list       address_list  
count                    27092               2412  
unique                    2413               2412  
top     [Errno 1] Unknown host  ['31.56.102.211']  
freq                     24680                  1  


In [None]:
# Check for missing values
print(df.isnull().sum())

client              0
hostname            4
alias_list          1
address_list    24681
dtype: int64


# Exploratory Data Analysis (EDA)

In [None]:
# Count unique values in each column
print("Unique Value Counts:")
for column in df.columns:
    unique_values = df[column].nunique()
    print(f"{column}: {unique_values}")



Unique Value Counts:
client: 27093
hostname: 27029
alias_list: 2413
address_list: 2412


# Cleaning DATA


In [None]:
# Display the first few rows of the DataFrame
print(df.head())

# Check the dimensions of the DataFrame
print("Shape of the DataFrame:", df.shape)

# Check for missing values
print("Missing Values:")
print(df.isnull().sum())

# Remove duplicates
df.drop_duplicates(inplace=True)

# Handle missing values if any
# Example: Fill missing values in 'client' column with a default value
df['client'].fillna('Unknown', inplace=True)

# Preprocess the 'hostname' column
# Example: Remove leading/trailing whitespaces and convert to lowercase
df['hostname'] = df['hostname'].str.strip().str.lower()

# Preprocess the 'alias_list' column if needed
# Example: Replace missing values with an empty string
df['alias_list'].fillna('', inplace=True)

# Preprocess the 'address_list' column if needed
# Example: Remove any parentheses or special characters
df['address_list'] = df['address_list'].str.replace('[^a-zA-Z0-9.-]', '')

# Verify the changes
print("Cleaned and Preprocessed DataFrame:")
print(df.head())

          client                 hostname                      alias_list  \
0   5.123.144.95             5.123.144.95          [Errno 1] Unknown host   
1   5.122.76.187             5.122.76.187          [Errno 1] Unknown host   
2   5.215.249.99             5.215.249.99          [Errno 1] Unknown host   
3  31.56.102.211  31-56-102-211.shatel.ir  ['211.102.56.31.in-addr.arpa']   
4  5.123.166.223            5.123.166.223          [Errno 1] Unknown host   

        address_list  
0                NaN  
1                NaN  
2                NaN  
3  ['31.56.102.211']  
4                NaN  
Shape of the DataFrame: (27093, 4)
Missing Values:
client              0
hostname            4
alias_list          1
address_list    24681
dtype: int64
Cleaned and Preprocessed DataFrame:
          client                 hostname                      alias_list  \
0   5.123.144.95             5.123.144.95          [Errno 1] Unknown host   
1   5.122.76.187             5.122.76.187          [Errno

  df['address_list'] = df['address_list'].str.replace('[^a-zA-Z0-9.-]', '')


# IP Address Analysis

In [None]:

# Extract IP addresses from the 'address_list' column
ip_addresses = df['address_list'].str.extract(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')[0]

# Count the frequency of IP addresses
ip_counts = ip_addresses.value_counts()

# Display the top 10 most frequent IP addresses
print("Top 10 Most Frequent IP Addresses:")
print(ip_counts.head(10))

# Detect any unusual IP addresses
unusual_ips = ip_counts[ip_counts > 1].index

# Display the unusual IP addresses
print("Unusual IP Addresses (appearing more than once):")
print(unusual_ips)



Top 10 Most Frequent IP Addresses:
31.56.102.211     1
151.241.213.36    1
40.77.188.135     1
40.77.252.141     1
91.99.141.150     1
188.158.57.126    1
40.77.188.66      1
188.159.180.37    1
151.239.44.235    1
85.15.42.237      1
Name: 0, dtype: int64
Unusual IP Addresses (appearing more than once):
Index([], dtype='object')


# User Behavior Analysis

In [None]:
# Analyze frequently occurring clients
frequent_clients = df['client'].value_counts()

# Display the top 10 most frequent clients
print("Top 10 Most Frequent Clients:")
print(frequent_clients.head(10))

# Determine the most accessed hosts or services
top_hosts = df['hostname'].value_counts()

# Display the top 10 most accessed hosts or services
print("Top 10 Most Accessed Hosts or Services:")
print(top_hosts.head(10))

# Examine changes or anomalies in user behavior
# Example: Check for unique clients with multiple aliases
multiple_aliases = df[df['alias_list'].str.contains(',', na=False)]

# Display the rows where multiple aliases are present
print("Rows with Multiple Aliases:")
print(multiple_aliases)

# Example: Check for unique clients with unusual IP addresses
unusual_ips = df[df['address_list'].str.contains('^(\d{1,3}\.){3}\d{1,3}$', na=False)]

# Display the rows with unusual IP addresses
print("Rows with Unusual IP Addresses:")
print(unusual_ips)

Top 10 Most Frequent Clients:
5.123.144.95       1
83.123.222.28      1
37.254.228.141     1
5.236.40.88        1
185.145.185.121    1
5.122.30.251       1
91.98.117.130      1
37.148.99.116      1
83.121.57.22       1
93.115.219.150     1
Name: client, dtype: int64
Top 10 Most Accessed Hosts or Services:
int0.client.access.fanaptelecom.net    42
unknown.puregig.net                    14
pcomtelecom.net                         3
host.coloup.com                         2
client.access.myshanet.net              2
server2us.getmyip.co                    2
client.static.myshanet.net              2
83.121.57.22                            1
2.177.91.128                            1
46.51.94.31                             1
Name: hostname, dtype: int64
Rows with Multiple Aliases:
                client                           hostname  \
760    195.154.122.149    ip-195-154-122-149.a.ahrefs.com   
1263     5.230.146.199              lilli-server.konim.ir   
4339     93.158.162.70  proxy-mds

  unusual_ips = df[df['address_list'].str.contains('^(\d{1,3}\.){3}\d{1,3}$', na=False)]


# detect the most active IP addresses and hostnames

In [None]:
# Calculate the count of unique IP addresses
ip_counts = df['address_list'].nunique()

# Calculate the count of unique hostnames
hostname_counts = df['hostname'].nunique()

# Display the most active IP addresses and hostnames
print("Most Active IP Addresses:")
print(ip_counts)
print("Most Active Hostnames:")
print(hostname_counts)


Most Active IP Addresses:
2412
Most Active Hostnames:
27029


# Connections between IP addresses and hosts

In [None]:
# Create a dictionary to store the connections
ip_host_connections = {}

# Iterate over the DataFrame rows
for index, row in df.iterrows():
    ip = row['address_list']
    host = row['hostname']

    # Skip rows with missing IP address or host
    if pd.isnull(ip) or pd.isnull(host):
        continue

    # Add the IP address to the connections dictionary
    if ip not in ip_host_connections:
        ip_host_connections[ip] = set()

    # Add the host to the set of connected hosts for the IP address
    ip_host_connections[ip].add(host)

# Print the list of IP addresses and their connected hosts
for ip, connected_hosts in ip_host_connections.items():
    print(f"IP: {ip}")
    print("Connected Hosts:")
    for host in connected_hosts:
        print(f"- {host}")
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
IP: 91.99.178.132
Connected Hosts:
- 91.99.178.132.parsonline.net

IP: 151.239.113.191
Connected Hosts:
- 151-239-113-191.shatel.ir

IP: 40.77.188.43
Connected Hosts:
- msnbot-40-77-188-43.search.msn.com

IP: 151.239.83.205
Connected Hosts:
- 151-239-83-205.shatel.ir

IP: 45.56.119.17
Connected Hosts:
- lidfitness.com

IP: 116.202.28.151
Connected Hosts:
- static.151.28.202.116.clients.your-server.de

IP: 37.148.120.106
Connected Hosts:
- 37-148-120-106.shatel.ir

IP: 31.56.114.203
Connected Hosts:
- 31-56-114-203.shatel.ir

IP: 151.242.178.194
Connected Hosts:
- 151-242-178-194.shatel.ir

IP: 84.241.13.163
Connected Hosts:
- 84-241-13-163.shatel.ir

IP: 151.238.193.161
Connected Hosts:
- 151-238-193-161.shatel.ir

IP: 91.99.197.238
Connected Hosts:
- 91.99.197.238.parsonline.net

IP: 40.77.189.25
Connected Hosts:
- msnbot-40-77-189-25.search.msn.com

IP: 94.183.241.202
Connected Hosts:
- 94-183-241-202.shatel.ir

IP: 75.

# Calculate the mean and standard deviation of connections for each host based

In [None]:
# Group the data by hostname and count the unique connections for each host
host_connections = df.groupby('hostname')['address_list'].nunique()

# Calculate the mean and standard deviation of connections for each host
mean_connections = host_connections.mean()
std_connections = host_connections.std()

# Print the mean and standard deviation of connections for each host
print("Mean connections for each host:")
print(mean_connections)
print()
print("Standard deviation of connections for each host:")
print(std_connections)

Mean connections for each host:
0.08912649376595509

Standard deviation of connections for each host:
0.3900633421027519


#  Calculate the mean and standard deviation of connections for each IP address

In [None]:
# Group the data by IP address and count the unique connections for each IP
ip_connections = df.groupby('address_list')['hostname'].nunique()

# Calculate the mean and standard deviation of connections for each IP
mean_connections = ip_connections.mean()
std_connections = ip_connections.std()

# Print the mean and standard deviation of connections for each IP
print("Mean connections for each IP:")
print(mean_connections)
print()
print("Standard deviation of connections for each IP:")
print(std_connections)

Mean connections for each IP:
0.9987562189054726

Standard deviation of connections for each IP:
0.035252650101243274


# Outputs the IP address followed by the associated information (client, hostname, and alias list) for each IP address in the dataset.

In [None]:
# Group the data by IP address and collect all the rows for each IP
ip_logs = df.groupby('address_list').apply(lambda x: x[['client', 'hostname', 'alias_list']].values.tolist())

# Print the log for each IP
for ip, logs in ip_logs.items():
    print(f"IP: {ip}")
    print("Log:")
    for log in logs:
        client, hostname, alias_list = log
        print(f"Client: {client}, Hostname: {hostname}, Alias List: {alias_list}")
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
IP: 31.58.172.195
Log:
Client: 31.58.172.195, Hostname: 31-58-172-195.shatel.ir, Alias List: ['195.172.58.31.in-addr.arpa']

IP: 31.58.172.40
Log:
Client: 31.58.172.40, Hostname: 31-58-172-40.shatel.ir, Alias List: ['40.172.58.31.in-addr.arpa']

IP: 31.58.173.147
Log:
Client: 31.58.173.147, Hostname: 31-58-173-147.shatel.ir, Alias List: ['147.173.58.31.in-addr.arpa']

IP: 31.58.173.87
Log:
Client: 31.58.173.87, Hostname: 31-58-173-87.shatel.ir, Alias List: ['87.173.58.31.in-addr.arpa']

IP: 31.58.174.77
Log:
Client: 31.58.174.77, Hostname: 31-58-174-77.shatel.ir, Alias List: ['77.174.58.31.in-addr.arpa']

IP: 31.58.175.184
Log:
Client: 31.58.175.184, Hostname: 31-58-175-184.shatel.ir, Alias List: ['184.175.58.31.in-addr.arpa']

IP: 31.58.175.23
Log:
Client: 31.58.175.23, Hostname: 31-58-175-23.shatel.ir, Alias List: ['23.175.58.31.in-addr.arpa']

IP: 31.58.175.5
Log:
Client: 31.58.175.5, Hostname: 31-58-175-5.shatel.ir, A

Our Hypothesis

Hypothesis: The most frequent client is associated with the most active hostname.
- Null Hypothesis (H0): There is no relationship between the most frequent client and the most active hostname.
- Alternative Hypothesis (H1): The most frequent client is associated with the most active hostname.


# Hypothesis
### Calculate the frequency of each client and identify the most frequent client.

In [None]:
# Calculate the frequency of each client
client_frequency = df['client'].value_counts()

# Identify the most frequent client
most_frequent_client = client_frequency.idxmax()

# Print the frequency of each client and the most frequent client
print("Client Frequency:")
print(client_frequency)
print()
print("Most Frequent Client:", most_frequent_client)

Client Frequency:
5.123.144.95       1
83.123.222.28      1
37.254.228.141     1
5.236.40.88        1
185.145.185.121    1
                  ..
31.2.150.85        1
5.208.197.76       1
5.126.68.116       1
37.129.3.52        1
2                  1
Name: client, Length: 27093, dtype: int64

Most Frequent Client: 5.123.144.95


### Calculate the frequency of each hostname and identify the most active hostname.

In [None]:
# Calculate the frequency of each hostname
hostname_frequency = df['hostname'].value_counts()

# Identify the most active hostname
most_active_hostname = hostname_frequency.idxmax()

# Print the frequency of each hostname and the most active hostname
print("Hostname Frequency:")
print(hostname_frequency)
print()
print("Most Active Hostname:", most_active_hostname)


Hostname Frequency:
int0.client.access.fanaptelecom.net    42
unknown.puregig.net                    14
pcomtelecom.net                         3
host.coloup.com                         2
client.access.myshanet.net              2
                                       ..
2.186.225.234                           1
31.2.150.85                             1
5.208.197.76                            1
5.126.68.116                            1
31.59.236.66                            1
Name: hostname, Length: 27029, dtype: int64

Most Active Hostname: int0.client.access.fanaptelecom.net


### To determine if there is a significant association between the most frequent client and the most active hostname

In [None]:
# Calculate the frequency of each client and hostname
client_frequency = df['client'].value_counts()
hostname_frequency = df['hostname'].value_counts()

# Identify the most frequent client and most active hostname
most_frequent_client = client_frequency.idxmax()
most_active_hostname = hostname_frequency.idxmax()

# Create a contingency table
contingency_table = pd.crosstab(df['client'] == most_frequent_client, df['hostname'] == most_active_hostname)

# Perform the chi-square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Print the contingency table
print("Contingency Table:")
print(contingency_table)
print()

# Print the chi-square test results
print("Chi-square:", chi2)
print("p-value:", p_value)


Contingency Table:
hostname  False  True 
client                
False     27050     42
True          1      0

Chi-square: 0.0
p-value: 1.0


### Test the relationship between the most frequent client and the most active hostname

In [None]:
# Calculate the frequency of each client and hostname
client_frequency = df['client'].value_counts()
hostname_frequency = df['hostname'].value_counts()

# Identify the most frequent client and most active hostname
most_frequent_client = client_frequency.idxmax()
most_active_hostname = hostname_frequency.idxmax()

# Create a contingency table
contingency_table = pd.crosstab(df['client'] == most_frequent_client, df['hostname'] == most_active_hostname)

# Perform the chi-square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Set the significance level
alpha = 0.05

# Print the contingency table
print("Contingency Table:")
print(contingency_table)
print()

# Print the chi-square test results
print("Chi-square:", chi2)
print("p-value:", p_value)
print()

# Compare p-value with significance level
if p_value < alpha:
    print("Reject the null hypothesis. There is a significant relationship between the most frequent client and the most active hostname.")
else:
    print("Fail to reject the null hypothesis. There is no significant relationship between the most frequent client and the most active hostname.")


Contingency Table:
hostname  False  True 
client                
False     27050     42
True          1      0

Chi-square: 0.0
p-value: 1.0

Fail to reject the null hypothesis. There is no significant relationship between the most frequent client and the most active hostname.


# Abdullah al Mumen