# In [None]:
# %%
# Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cyberattack dataset and print a quick overview of it.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# adjust it before running this script anywhere else.
df = pd.read_csv(r'C:\Users\diana\OneDrive\Desktop\MUN\COMPSC\HOMEWORK\cyberattacks.csv')
df.info()  # prints column dtypes and non-null counts
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell, which it is not here; print it so the preview is shown.
print(df.head())
# `data` is a second name for the same DataFrame object (not a copy), so any
# in-place change made through `df` is also visible through `data`.
data = df

# %%
# Data Cleaning
# Report the number of missing values per column before removing them. The
# bare expression was silently discarded because it is not the last statement
# of its cell, so it is printed explicitly.
print(df.isnull().sum())
# Drop incomplete rows in place so that every other name bound to this same
# DataFrame object also sees the cleaned rows.
df.dropna(inplace=True)

# %%
# Parse the 'Timestamp' column into datetimes, store the parsed values back,
# and derive a numeric 'DayOfWeek' column from them.
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps
df['DayOfWeek'] = parsed_timestamps.dt.dayofweek

# %%
# Bar chart: how many attacks fall on each day of the week.
# Counts are sorted by the day-of-week value so the bars appear in index order.
attacks_by_day = df['DayOfWeek'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
day_ax = sns.barplot(x=attacks_by_day.index, y=attacks_by_day.values)
day_ax.set_title('Number of Attacks by Day of Week')
day_ax.set_xlabel('Day of Week')
day_ax.set_ylabel('Number of Attacks')
plt.show()


# Bar chart of how often each attack type occurs, followed by the raw counts.
attack_counts = df['Attack Type'].value_counts()

# Shared text styling for both axis labels.
axis_label_style = {'fontsize': 14, 'fontweight': 'bold'}

plt.figure(figsize=(10, 5))
sns.barplot(x=attack_counts.index, y=attack_counts)
plt.title('Distribution of Attack Types', fontsize=16)
plt.xlabel('Attack Type', **axis_label_style)
plt.ylabel('Count', **axis_label_style)
plt.xticks(rotation=45)  # tilt the category names
plt.show()

print(attack_counts)



# %%
# Attack type frequencies drawn as a bar chart (one bar per attack type).
attack_counts = df['Attack Type'].value_counts()

plt.figure(figsize=(10, 5))
attack_ax = sns.barplot(x=attack_counts.index, y=attack_counts)
attack_ax.set_title('Distribution of Attack Types')
attack_ax.set_xlabel('Attack Type')
attack_ax.set_ylabel('Count')
plt.xticks(rotation=45)  # tilt the category names
plt.show()

# %%
# Pairwise correlation between the selected numeric columns, shown as an
# annotated heatmap.
numeric_features = ['Source Port', 'Destination Port', 'Packet Length', 'Anomaly Scores']
correlation_matrix = df[numeric_features].corr()

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix of Numerical Variables')
plt.show()


from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Train a decision tree that predicts 'Severity Level' from four numeric
# columns and print a per-class precision/recall report.
# An earlier X/y pair targeting 'Attack Type' was overwritten immediately and
# never used, so that dead assignment has been removed.
X = df[['Source Port', 'Destination Port', 'Packet Length', 'Anomaly Scores']]
y = df['Severity Level']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seed the classifier as well: without it the tree can differ from run to run,
# which makes the printed report non-reproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


# Pie chart: share of each network protocol.
sizes = df['Protocol'].value_counts()  # counts per protocol, most frequent first
# Take the labels from the counts themselves. value_counts() orders by
# frequency, so a hard-coded label list can be attached to the wrong slices.
labels = sizes.index.tolist()
colors = ['red', 'green', 'blue']  # applied to the slices in order
# Pull out the first slice only; sized from the data so the length always
# matches the number of categories.
explode = [0.1 if position == 0 else 0 for position in range(len(sizes))]

# Pie chart
plt.pie(sizes, labels=labels, colors=colors, explode=explode, autopct='%1.1f%%', startangle=180)
plt.axis('equal')  # keep the pie circular
plt.title('Distribution of Network Traffic Protocols')

# Display the pie chart
plt.show()

# Pie chart: share of each traffic type.
sizes = df['Traffic Type'].value_counts()  # counts per type, most frequent first
# Take the labels from the counts themselves. value_counts() orders by
# frequency, so a hard-coded label list can be attached to the wrong slices.
labels = sizes.index.tolist()
colors = ['yellow', 'green', 'orange']  # applied to the slices in order
# Pull out the first slice only; sized from the data so the length always
# matches the number of categories.
explode = [0.1 if position == 0 else 0 for position in range(len(sizes))]

# Pie chart
plt.pie(sizes, labels=labels, colors=colors, explode=explode, autopct='%1.1f%%', startangle=90)
plt.axis('equal')  # keep the pie circular
plt.title('Distribution of Network Traffic Types')
plt.show()


# Pie chart: share of each action taken.
sizes = df['Action Taken'].value_counts()  # counts per action, most frequent first
# Take the labels from the counts themselves. value_counts() orders by
# frequency, so a hard-coded label list can be attached to the wrong slices.
labels = sizes.index.tolist()
colors = ['Red', 'green', 'blue']  # applied to the slices in order
# Pull out the first slice only; sized from the data so the length always
# matches the number of categories.
explode = [0.1 if position == 0 else 0 for position in range(len(sizes))]

# Pie chart
plt.pie(sizes, labels=labels, colors=colors, explode=explode, autopct='%1.1f%%', startangle=190)

plt.axis('equal')  # keep the pie circular
plt.title('Distribution of Actions Taken')

plt.show()


# Pie chart: share of each severity level.
sizes = df['Severity Level'].value_counts()  # counts per level, most frequent first
# Take the labels from the counts themselves. value_counts() orders by
# frequency, so a hard-coded label list can be attached to the wrong slices.
labels = sizes.index.tolist()
colors = ['orange', 'blue', 'pink']  # applied to the slices in order
# No slice is pulled out; sized from the data so the length always matches
# the number of categories.
explode = [0] * len(sizes)

# Pie chart
plt.pie(sizes, labels=labels, colors=colors, explode=explode, autopct='%1.1f%%', startangle=90)

plt.axis('equal')  # keep the pie circular
plt.title('Distribution of Severity Levels')

plt.show()


# Pie chart: share of each log source.
sizes = df['Log Source'].value_counts()  # counts per source, most frequent first
# Take the labels from the counts themselves. value_counts() orders by
# frequency, so a hard-coded label list can be attached to the wrong slices.
labels = sizes.index.tolist()
colors = ['blue', 'orange']  # applied to the slices in order
# No slice is pulled out; sized from the data so the length always matches
# the number of categories.
explode = [0] * len(sizes)

plt.pie(sizes, labels=labels, colors=colors, explode=explode, autopct='%1.1f%%', startangle=270)
plt.axis('equal')  # keep the pie circular
plt.title('Distribution of Log Sources')

plt.show()


# For each categorical column, draw a grouped count plot with one bar per
# attack type inside every category of that column.
columns_to_plot = ['Protocol','Attack Signature','Action Taken','Network Segment']

for col in columns_to_plot:
    sns.catplot(data=df, x=col, hue='Attack Type', kind='count', height=8, aspect=1.5)
    plt.title(f'Count of Attack Types for Each {col.capitalize()}')
    # The original referenced `plt.show` without calling it, so the figure was
    # never explicitly displayed; call it once per plot.
    plt.show()



# Keep only the rows whose anomaly score lies above the 75th percentile, then
# plot their score distribution with the mean marked.
anomaly_scores = data['Anomaly Scores']
high_anomaly_threshold = anomaly_scores.quantile(0.75)
high_anomaly_data = data[anomaly_scores > high_anomaly_threshold]

high_scores = high_anomaly_data['Anomaly Scores']
mean_value = high_scores.mean()

# Plot styling
sns.set_style('whitegrid')

# Histogram of the high scores
plt.figure(figsize=(12, 8))
hist_ax = sns.histplot(high_scores, bins=30, kde=False, color='orange', edgecolor='black')

# Dashed vertical line at the mean, with a text label placed just to its
# right at 90% of the current upper y-limit
hist_ax.axvline(mean_value, color='red', linestyle='dashed', linewidth=1)
label_height = hist_ax.get_ylim()[1] * 0.9
hist_ax.text(mean_value + 0.5, label_height, f'Mean: {mean_value:.2f}', color = 'red')

hist_ax.set_title('Distribution of High Anomaly Scores', fontsize=16)
hist_ax.set_xlabel('Anomaly Scores', fontsize=14)
hist_ax.set_ylabel('Frequency', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()



# Select the rows labelled 'High' severity and look at their packet lengths
# in two views: a histogram with a KDE overlay, then a box plot.
is_high_severity = df['Severity Level'] == 'High'
high_severity_events = df[is_high_severity]

# Histogram of 'Packet Length' for the high-severity rows
plt.figure(figsize=(10, 6))
length_hist_ax = sns.histplot(high_severity_events['Packet Length'], bins=30, kde=True)
length_hist_ax.set_title('Packet Length Distribution for High-Severity Events')
length_hist_ax.set_xlabel('Packet Length')
length_hist_ax.set_ylabel('Frequency')
plt.show()

# Box plot of the same column
plt.figure(figsize=(10, 6))
length_box_ax = sns.boxplot(data=high_severity_events, x='Packet Length')
length_box_ax.set_title('Packet Length Spread for High-Severity Events')
length_box_ax.set_xlabel('Packet Length')
plt.show()

