In [None]:
import pandas as pd
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from IPython.display import display
from folium.features import DivIcon
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = '/Users/alesarabandi/data_man_project/cleaned_data_V2.csv'
data = pd.read_csv(file_path)
# NOTE(review): `df` is created before the renames below; whether it reflects
# them depends on how pandas shares data between the two frames -- confirm.
df = pd.DataFrame(data)

# The data-quality step renamed this country to "Korea, South" so that it matched
# the validation benchmark. For readability in the charts it is given a plain
# name again here. "Korea, South" denotes South Korea, so it must not be mapped
# to "North Korea" (that would attribute the rows to a different country).
# NOTE(review): assumes the pre-cleaning value was "South Korea" -- confirm
# against the raw data.
for country_column in ('vic_country_name', 'att_country_name'):
    data[country_column] = data[country_column].replace("Korea, South", "South Korea")


# 🌟 Dataset Overview  

The dataset consists of **37 columns** and **39,657 rows**, providing a rich set of features for comprehensive exploratory data analysis. Here's an overview of the most relevant columns tailored to your objectives:

---

## 🔑 **Key Columns for Analysis**  

### 🛡️ **Attacker Information**  
- **🖥️ IP Address**: `att_ip`  
- **🌍 Country**: `att_country_name`  
- **🏙️ City**: `att_city`  
- **📍 Location**: `att_latitude`, `att_longitude`  
- **⚡ Threat Metrics**:  
  - `att_threat_score`  
  - `att_is_known_attacker`  
  - `att_is_bot`, etc.  

---

### 🛡️ **Victim Information**  
- **🖥️ IP Address**: `vic_ip`  
- **🌍 Country**: `vic_country_name`  
- **🏙️ City**: `vic_city`  
- **📍 Location**: `vic_latitude`, `vic_longitude`  

---

### 🚨 **Attack Characteristics**  
- **🛑 Attack Type**  
- **⚙️ Severity Level**  
- **📶 Traffic Type**  
- **📜 Protocol**  
- **🦠 Malware Indicators**  
- **📦 Packet Details**:  
  - `Packet Type`  
  - `Packet Length`  

---

### 🕒 **Timeline and Metadata**  
- **📅 Date**  
- **✔️ Action Taken**  
- **📋 Log Source**  
- **🌐 Browser**  
- **📱 Device/OS**  

---  

💡 *This dataset offers valuable insights for analyzing cybersecurity threats, attacker patterns, and victim vulnerabilities. Use this structured breakdown to focus your exploratory analysis efforts!* 🚀


# Per-column profile of `df`: number of distinct values, a few example values,
# number of missing entries, and the pandas dtype.
column_overview = {
    name: {
        'Unique Values': series.nunique(),
        'Sample Values': series.dropna().unique()[:5].tolist(),  # first 5 distinct non-null values
        'Missing Values': series.isnull().sum(),
        'Data Type': series.dtype,
    }
    for name, series in df.items()
}

# Transpose so each profiled column becomes a row, then move the column name
# out of the index into a regular 'Column' field.
column_summary = (
    pd.DataFrame(column_overview)
    .T
    .reset_index()
    .rename(columns={'index': 'Column'})
)

column_summary

## **1. Geolocation Data of Hackers 🌍**

### **Description**:

This code processes the **geolocation data** of attackers by converting latitude and longitude into numeric values and filtering out rows with missing coordinates. An **interactive map** is created using **Folium**, centered globally.

A **marker cluster** is used to group attacker locations, with each marker displaying a popup containing the attacker's country, city, and whether the attacker is anonymous. The map is displayed within a Jupyter Notebook for an interactive, visual representation of attacker geolocation data.



# Attacker coordinates as numbers; values that cannot be parsed become NaN.
data['att_lat'] = pd.to_numeric(data['att_latitude'], errors='coerce')
data['att_lon'] = pd.to_numeric(data['att_longitude'], errors='coerce')

# Keep only the rows in which every one of these fields is present.
required_fields = ['att_lat', 'att_lon', 'att_country_name', 'att_city', 'att_threat_score']
geo_data = data.dropna(subset=required_fields)

# World map centred on (0, 0), with a cluster layer that receives the markers.
interactive_map = folium.Map(location=[0, 0], zoom_start=2)
marker_cluster = MarkerCluster()
marker_cluster.add_to(interactive_map)


def _attacker_popup_html(record):
    """Return the popup HTML (country, city, anonymous flag) for one row."""
    return f"""
    <b>Hacker Country:</b> {record['att_country_name']}<br>
    <b>Hacker City:</b> {record['att_city']}<br>
    <b>The hacker is anonymous :</b> {record['att_is_anonymous']}<br>
    """


# One marker per remaining row, attached to the cluster layer.
for _, record in geo_data.iterrows():
    marker = folium.Marker(
        location=[record['att_lat'], record['att_lon']],
        popup=folium.Popup(_attacker_popup_html(record), max_width=250),
    )
    marker.add_to(marker_cluster)

# Render the map in the notebook output.
display(interactive_map)


## 2. **Top Hacker and Target Countries 🌍**
### Description:
This analysis identifies the top countries that are involved in attacks either as the origin (attacker) or as the target (victim). We are looking to identify patterns of regional threats. By visualizing the distribution of attackers and victims, we aim to uncover trends related to geographic hotspots for cybercrime and vulnerabilities. This could guide future cybersecurity efforts, focusing on regions with the highest number of attacks.

def _plot_country_ranking(counts, title, palette):
    """Draw a bar chart of attack counts per country.

    counts  -- Series of counts indexed by country name
    title   -- chart title
    palette -- seaborn palette name used for the bars
    """
    plt.figure(figsize=(10, 6))
    sns.barplot(x=counts.values, y=counts.index, palette=palette)
    plt.title(title, fontsize=16)
    plt.xlabel('Number of Attacks', fontsize=12)
    plt.ylabel('Country', fontsize=12)
    plt.show()


# Ten most frequent attacker countries
top_attacker_countries = data['att_country_name'].value_counts().head(10)
_plot_country_ranking(top_attacker_countries, 'Top 10 Attacker Countries', 'Reds_r')

# Ten most frequent victim countries
top_victim_countries = data['vic_country_name'].value_counts().head(10)
_plot_country_ranking(top_victim_countries, 'Top 10 Victim Countries', 'Blues_r')
## 3. **Temporal Analysis of Attacks 📅**
### Description:
This analysis checks for any seasonal or periodic patterns in attacks, such as peaks during certain months. By tracking attack frequencies over time, we can identify whether there are specific months or seasons with higher or lower attack activity. Understanding these patterns can help anticipate and prepare for periods of heightened risk, enabling more proactive cybersecurity measures during those times. 📆

# Parse the Date column into datetimes
data['Date'] = pd.to_datetime(data['Date'])

# Restrict to the years 2020-2022, then count attacks per calendar month
in_selected_years = data['Date'].dt.year.isin([2020, 2021, 2022])
data_filtered = data[in_selected_years]
monthly_attacks = data_filtered.groupby(data_filtered['Date'].dt.to_period('M')).size()

# Busiest and quietest months, together with their attack counts
max_date, min_date = monthly_attacks.idxmax(), monthly_attacks.idxmin()
max_value, min_value = monthly_attacks.max(), monthly_attacks.min()

# Line chart of the monthly counts
plt.figure(figsize=(12, 6))
monthly_attacks.plot(kind='line', marker='o', color='darkred')

# Label the peak month; the text is placed 2 units above the point
peak_x = max_date.to_timestamp()
plt.annotate(
    f'Max: {max_value} attacks\n{max_date}',
    xy=(peak_x, max_value),
    xytext=(peak_x, max_value + 2),
    arrowprops={'facecolor': 'green'},
    fontsize=10,
    color='green',
)

# Label the lowest month; the text is placed 2 units below the point
trough_x = min_date.to_timestamp()
plt.annotate(
    f'Min: {min_value} attacks\n{min_date}',
    xy=(trough_x, min_value),
    xytext=(trough_x, min_value - 2),
    arrowprops={'facecolor': 'red'},
    fontsize=10,
    color='red',
)

# Title, axis labels and layout
plt.title('Number of Attacks Over Time (2020-2022)', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Attacks', fontsize=12)
plt.grid()
plt.tight_layout()
plt.show()


## 4. **Common Protocols Used by Attackers 📡**
### Description:
This analysis examines which **protocols** are most commonly used by attackers and how they vary by **attack type**. Understanding protocol usage patterns can help strengthen defenses against the most common attack methods. By identifying frequently exploited protocols, cybersecurity teams can prioritize efforts to secure these communication channels and mitigate risks associated with them. 🔐

### **Why Use a Logarithmic Scale?**
In the original bar chart, there were significant differences in the number of attacks across protocols. For example, some protocols might have been used in a large number of attacks, while others saw only a few. These extreme differences made it difficult to visualize and compare the smaller variations among protocols with fewer attacks. 

By applying a logarithmic scale, we compress the larger values and stretch out the smaller ones, allowing for a more balanced view of the data. This makes it easier to observe and compare trends in attack distribution, even when the differences are small.


# Count attacks per (Protocol, Attack Type); combinations that never occur get 0
custom_colors = ['#003366', '#8B0000', '#006400']  # applied per attack-type column
protocol_count = data.groupby(['Protocol', 'Attack Type']).size().unstack().fillna(0)

# Grouped bar chart: one group per protocol, one bar per attack type
ax = protocol_count.plot(kind='bar', figsize=(12, 6), color=custom_colors)
plt.title('Protocol Usage by Attack Type (Log Scale)', fontsize=16)
plt.xlabel('Protocol', fontsize=12)
# The axis is log-scaled but the tick values are raw counts, so the label says
# "log scale" rather than "log of" the counts.
plt.ylabel('Number of Attacks (log scale)', fontsize=12)
plt.legend(title='Attack Type')
plt.xticks(rotation=45)

# Apply logarithmic scale to the y-axis
plt.yscale('log')

# Label each bar with its count. With label_type='edge' the text is drawn just
# outside the bar, on the figure background, so it needs a dark colour to be
# readable (white text disappears on the default light background).
for container in ax.containers:
    ax.bar_label(container, label_type='edge', fontsize=10, color='black', padding=5)

# Show plot
plt.show()

## 5. **Severity Level Across Attack Types ⚠️**
### Description:
This **heatmap** shows the distribution of attack severity levels across different attack types using a **Reds color scale**.

- **🟥 Color Scale**: 
  - **Light red** = fewer attacks in that attack type / severity level cell
  - **Dark red** = more attacks in that attack type / severity level cell

- **📊 Annotations**: Each cell shows the **count** of attacks for each severity level and attack type.
- **⚡ Color Bar**: The color bar on the right helps interpret the intensity of red, showing the range from low to high counts.

This visualization helps quickly identify where high-severity attacks are concentrated and where resources are most needed to address threats. 🔐
# Count attacks per (Attack Type, Severity Level).
# fill_value=0 keeps the table integer-typed even when a combination never
# occurs; a plain unstack() would insert NaN there, turn the table into floats,
# and make the integer annotation format (fmt='d') below fail.
severity_counts = df.groupby(['Attack Type', 'Severity Level']).size().unstack(fill_value=0)

# Heatmap of the counts: rows are attack types, columns are severity levels,
# each cell is annotated with its count.
plt.figure(figsize=(12, 6))
sns.heatmap(severity_counts, annot=True, cmap='Reds', fmt='d', cbar=True, linewidths=0.5)
plt.title('Severity Levels by Attack Type (Heatmap with Reds)', fontsize=16)
plt.xlabel('Severity Level', fontsize=12)
plt.ylabel('Attack Type', fontsize=12)
plt.show()


## 6. **Top 20 Attacker-Victim Country Interactions 🌍**

### Description:

This bar chart shows the **top 20 attacker-victim country pairs** based on the frequency of cyber attacks. The **x-axis** represents the **frequency**, and the **y-axis** lists the attacker → victim pairs. Each bar is uniquely colored, and the **number of attacks** is labeled at the end of each bar.


This chart helps identify **geopolitical patterns** in cyber threats, providing valuable insights for **cyber defense** 🌏.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Count attacks for every attacker-country / victim-country combination
country_pairs = data.groupby(['att_country_name', 'vic_country_name']).size().reset_index(name='Frequency')

# Keep the 20 most frequent pairs so the chart stays readable
top_pairs = country_pairs.sort_values(by='Frequency', ascending=False).head(20)

# One colour per bar, sampled from the 'tab20' colormap.
# plt.get_cmap is used instead of plt.cm.get_cmap, which is deprecated since
# matplotlib 3.7 and removed in 3.9.
colors = plt.get_cmap('tab20', len(top_pairs))

# Horizontal bars labelled "attacker → victim"
pair_labels = top_pairs['att_country_name'] + ' → ' + top_pairs['vic_country_name']
plt.figure(figsize=(12, 8))
bars = plt.barh(pair_labels, top_pairs['Frequency'],
                color=colors(range(len(top_pairs))))

# Write the count inside each bar, right-aligned 5% in from the bar's end
for bar in bars:
    plt.text(bar.get_width() - bar.get_width() * 0.05, bar.get_y() + bar.get_height() / 2,
             f'{int(bar.get_width())}', va='center', ha='right', color='white', fontsize=10)

# Title and labels
plt.title('Top 20 Attacker-Victim Country Interactions 🌍', fontsize=16)
plt.xlabel('Frequency of Attacks', fontsize=12)
plt.ylabel('Attacker → Victim Countries', fontsize=12)

# Invert y-axis to show the most frequent pair on top
plt.gca().invert_yaxis()
plt.show()

## **7. Analyzing Hacker Characteristics: TOR, Spam, Bot, and More 🕵️‍♂️**

### **Description**:

This code creates a **Pandas DataFrame** summarizing the counts of various attacker traits, such as using TOR, spam, bots, proxies, or being known attackers. Each row corresponds to a specific characteristic, with two columns:

- **YES**: Number of attacks where the characteristic is true.
- **NO**: Number of attacks where the characteristic is false.

The output table is styled with **light green backgrounds**, **black text**, and **solid black borders**, making it easy to read and visually appealing. 

### **Why It’s Useful**:

This table reveals the **prevalence of attacker behaviors**, helping uncover patterns like the most common anonymity tools or malicious activity sources. It's an essential tool for **cybersecurity analysis** and **defense planning**.

import pandas as pd

# Attacker flag columns to summarise ...
columns_to_analyze = [
    'att_is_tor',
    'att_is_spam',
    'att_is_bot',
    'att_is_proxy',
    'att_is_cloud_provider',
    'att_is_anonymous',
    'att_is_known_attacker',
]

# ... and, in the same order, the label shown for each of them
categories = [
    "Hacker used TOR",
    "Hacker used spam",
    "Hacker used bot",
    "Hacker used proxy",
    "The IP belongs to a cloud provider",
    "Hacker is anonymous",
    "Hacker is known attacker",
]


def _yes_no_row(label, flags):
    """Build one summary row: YES is the column sum, NO is the remaining rows."""
    yes = flags.sum()
    return {'Category': label, 'YES': yes, 'NO': len(flags) - yes}


# One row per trait, then assemble the table
summary_data = [
    _yes_no_row(label, data[column])
    for column, label in zip(columns_to_analyze, categories)
]
summary_df = pd.DataFrame(summary_data)

# Light-green cells, black text, solid black borders
cell_style = {
    'background-color': 'lightgreen',
    'color': 'black',
    'border': '1px solid black',
}
styled_df = summary_df.style.set_properties(**cell_style)

# Display the styled DataFrame
styled_df

## **8. Investigating Attacker Traits Across Device/OS Types 📱💻**

### **🔍 Description**:

This code analyzes the relationship between **Device/Operating System types** and various attacker traits, such as the use of **bots**, **TOR**, **proxies**, and **spam**. Using a **Chi-square test of independence**, it evaluates the degree of association between the attacker's device/OS and these traits. The results are displayed in a styled **Pandas DataFrame** with the following columns:

- **Variable**: The attacker trait being tested.  
- **Chi2**: The Chi-square statistic indicating the level of association.  
- **p-value**: The probability of observing the data if the two variables are independent.  

### **✨ Why It’s Useful**:

This analysis uncovers whether certain attacker traits are more prevalent on specific devices or operating systems. By identifying such associations, **cybersecurity teams** can design better-targeted defenses and strategies to mitigate attacks effectively. 🚀  

---

### **📊 What It Shows**:

#### **Chi2 Values**:
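- The highest Chi-square value (**23.148** for `att_is_bot`) gives the strongest evidence, among the traits tested, of an association between the presence of bots and the type of device/OS used during attacks. Note that the statistic measures evidence against independence, not the size of the effect.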

#### **p-values**:
- **p-value < 0.05**:  
   - The trait **`att_is_bot`** has a **significant relationship** with the type of device/OS (p-value: **0.000748**). This means bots are more likely associated with specific device/OS types, revealing potential targeting or usage patterns.  
- **p-value > 0.05**:  
   - For all other traits (`att_is_tor`, `att_is_proxy`, etc.), the p-values are greater than 0.05, indicating **no statistically significant relationship** between these traits and device/OS types.  

---

### **🏁 Conclusion**:

- **🚨 Significant Finding**:  
   - Bots (`att_is_bot`) show a clear relationship with device/OS types, suggesting targeted deployment or specific vulnerabilities exploited by bots. This insight could help develop precise mitigation strategies to counter bot-related attacks.  

- **❌ Non-significant Findings**:  
   - Traits like `att_is_tor`, `att_is_proxy`, and `att_is_spam` show no strong links to device/OS types, suggesting these behaviors are not device-specific.  


from scipy.stats import chi2_contingency

# Attacker traits to test against the Device/OS column
variables = ['att_is_bot', 'att_is_tor', 'att_is_proxy', 'att_is_anonymous',
             'att_is_known_attacker', 'att_is_spam', 'att_is_cloud_provider']


def _device_os_chi2(trait):
    """Run a chi-square test of independence between Device/OS and one trait."""
    observed = pd.crosstab(data['Device/OS'], data[trait])
    statistic, p_value, _dof, _expected = chi2_contingency(observed)
    return {'Variable': trait, 'Chi2': statistic, 'p-value': p_value}


# One result row per trait, collected into a table
chi2_results = [_device_os_chi2(var) for var in variables]
chi2_df = pd.DataFrame(chi2_results)

# Light-green cells, black text, solid black borders
cell_style = {
    'background-color': 'lightgreen',
    'color': 'black',
    'border': '1px solid black',
}
chi2_df_styled = chi2_df.style.set_properties(**cell_style)

chi2_df_styled
