In [8]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\hp\Desktop\ML projects\CYBER_AI\Copy_of_updated_cyber_attack_data_large2.csv"
df = pd.read_csv(file_path)

# Inspect the first few rows of the dataset
print(df.head())

# Check the column names and data types
print(df.info())


   Event ID  Timestamp  Source IP  Destination IP  User Agent  \
0     11206      16728       5890           13791        1791   
1     17645      18182       7572           13631        3542   
2      7141      19842      19987           12091        4758   
3      5761      12574       6378           14210        1575   
4     17896      11554      16486           12023        3720   

                Attack Type  Attack Severity  Data Exfiltrated  \
0               SSH-Patator                0              True   
1                      DDoS                0              True   
2          Web Attack - XSS                1              True   
3               FTP-Patator                2             False   
4  Web Attack - Brute Force                2             False   

   Threat Intelligence  Response Action  ...  Unnamed: 13  Unnamed: 14  \
0                 8815                2  ...          NaN          NaN   
1                14105                1  ...          NaN       

In [9]:
# Check for missing values in the dataset
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)

# Option 1: Drop rows with missing values (if missing data is minimal)
df = df.dropna()

# Option 2: Fill missing values (e.g., with the mean or median for numeric columns)
# df.fillna(df.median(), inplace=True)

print("After handling missing values:", df.info())


Missing values per column:
 Event ID                  0
Timestamp                 0
Source IP                 0
Destination IP            0
User Agent                0
Attack Type               0
Attack Severity           0
Data Exfiltrated          0
Threat Intelligence       0
Response Action           0
Label                     0
Unnamed: 11            1000
Unnamed: 12            1000
Unnamed: 13            1000
Unnamed: 14            1000
Unnamed: 15            1000
Unnamed: 16            1000
Unnamed: 17            1000
Unnamed: 18            1000
Unnamed: 19            1000
Unnamed: 20            1000
Unnamed: 21            1000
Unnamed: 22            1000
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Event ID             0 non-null      int64  
 1   Timestamp            0 non-null      int64  
 2   Source IP            0

In [10]:
import numpy as np

# Replace infinite values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with NaN values after replacing infinite values
df.dropna(inplace=True)

print("After handling infinite values:", df.info())


<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Event ID             0 non-null      int64  
 1   Timestamp            0 non-null      int64  
 2   Source IP            0 non-null      int64  
 3   Destination IP       0 non-null      int64  
 4   User Agent           0 non-null      int64  
 5   Attack Type          0 non-null      object 
 6   Attack Severity      0 non-null      int64  
 7   Data Exfiltrated     0 non-null      bool   
 8   Threat Intelligence  0 non-null      int64  
 9   Response Action      0 non-null      int64  
 10  Label                0 non-null      int64  
 11  Unnamed: 11          0 non-null      float64
 12  Unnamed: 12          0 non-null      float64
 13  Unnamed: 13          0 non-null      float64
 14  Unnamed: 14          0 non-null      float64
 15  Unnamed: 15          0 non-null      float64
 16  Unnamed

In [11]:
from sklearn import preprocessing

# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical columns:\n", categorical_columns)

# Apply Label Encoding to categorical columns
label_encoder = preprocessing.LabelEncoder()

for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

print("After encoding categorical features:", df.head())


Categorical columns:
 Index(['Attack Type'], dtype='object')
After encoding categorical features: Empty DataFrame
Columns: [Event ID, Timestamp, Source IP, Destination IP, User Agent, Attack Type, Attack Severity, Data Exfiltrated, Threat Intelligence, Response Action, Label, Unnamed: 11, Unnamed: 12, Unnamed: 13, Unnamed: 14, Unnamed: 15, Unnamed: 16, Unnamed: 17, Unnamed: 18, Unnamed: 19, Unnamed: 20, Unnamed: 21, Unnamed: 22]
Index: []

[0 rows x 23 columns]


In [13]:
from sklearn.preprocessing import StandardScaler

# Identify numeric columns
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Initialize StandardScaler
scaler = StandardScaler()

# # Fit and transform the numeric columns
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])


In [20]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\hp\Desktop\ML projects\CYBER_AI\ai_ml_cybersecurity_dataset.csv"
df = pd.read_csv(file_path)

# Display the current columns in the DataFrame
print("Current columns:", df.columns)

# Check if 'Label' or any other relevant column exists
if 'Label' in df.columns:
    print("Existing labels in 'Label' column:", df['Label'].unique())
else:
    print("No 'Label' column found. Checking for another label column.")

# If 'Label' column is not present, check for a potential column name that could contain threat types
# Replace 'Your_Column_Name' with the actual name you find appropriate
potential_label_column = 'Data Exfiltrated'  # Change this to the correct column name after checking df.columns
if potential_label_column not in df.columns:
    print(f"The potential label column '{potential_label_column}' does not exist.")
else:
    # Get existing labels
    existing_labels = df[potential_label_column].unique()
    print("Existing labels:", existing_labels)

    # Define expected labels (you may have different labels)
    expected_labels = [0, 1]  # For example, 0 = benign, 1 = malicious

    # Create a new 'Label' column based on existing data
    df['Label'] = -1  # Initialize with -1 or some default value indicating 'unknown'

    # Populate the new 'Label' column based on conditions
    for label in expected_labels:
        if label in existing_labels:
            df.loc[df[potential_label_column] == label, 'Label'] = label  # Set the correct label for known categories
        else:
            print(f"Label {label} is missing in the dataset.")

    # Display the updated DataFrame
    print("Updated DataFrame:")
    print(df.head())

    # Save the updated DataFrame to a new CSV file if needed
    df.to_csv("updated_ai_ml_cybersecurity_dataset.csv", index=False)
    print("Updated dataset saved to 'updated_ai_ml_cybersecurity_dataset.csv'.")


Current columns: Index(['Event ID', 'Timestamp', 'Source IP', 'Destination IP', 'User Agent',
       'Attack Type', 'Attack Severity', 'Data Exfiltrated',
       'Threat Intelligence', 'Response Action'],
      dtype='object')
No 'Label' column found. Checking for another label column.
Existing labels: [False  True]
Updated DataFrame:
   Event ID  Timestamp  Source IP  Destination IP  User Agent  Attack Type  \
0      2513        511      12084           14471        2796            4   
1      1765       6914       1233            7845        4464            2   
2      1183       9811       7296           17578        8861            4   
3       544      13569      13177           18934        1484            0   
4      9515       9284       2295            1522       10166            2   

   Attack Severity  Data Exfiltrated  Threat Intelligence  Response Action  \
0                0             False                 3928                2   
1                0              True  

In [5]:
import pandas as pd
from sklearn.utils import resample

# Load dataset
file_path = r"C:\Users\hp\Desktop\ML projects\CYBER_AI\updated_ai_ml_dataset.csv"
df = pd.read_csv(file_path)

# Check for the correct column name
print(df.columns)

# Use the correct column name for label (change 'Threat_Type' based on your dataset)
label_column = 'Attack Type'  # Replace with the actual column name in your dataset

# Separate majority and minority classes
df_majority = df[df[label_column] == 0]  # Assuming 0 = benign, 1 = malicious
df_minority = df[df[label_column] == 1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # Sample with replacement
                                 n_samples=len(df_majority),    # Match majority class count
                                 random_state=42)

# Combine majority and upsampled minority class
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Display class distribution
print(df_balanced[label_column].value_counts())


Index(['Event ID', 'Timestamp', 'Source IP', 'Destination IP', 'User Agent',
       'Attack Type', 'Attack Severity', 'Data Exfiltrated',
       'Threat Intelligence', 'Response Action', 'Label'],
      dtype='object')
Attack Type
1    27580
0    27580
Name: count, dtype: int64


In [24]:
from sklearn.feature_selection import SelectKBest, f_classif

# Separate features and target
X = df_balanced.drop('Label', axis=1)
y = df_balanced['Label']

# Apply SelectKBest to extract top 10 features
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support(indices=True)]
print("Selected features:", selected_features)


Selected features: Index(['Event ID', 'Timestamp', 'Source IP', 'Destination IP', 'User Agent',
       'Attack Type', 'Attack Severity', 'Data Exfiltrated',
       'Threat Intelligence', 'Response Action'],
      dtype='object')


  f = msb / msw


In [25]:
from sklearn.model_selection import train_test_split

# Split the data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)


In [26]:
# Save training data
train_data = pd.DataFrame(X_train, columns=selected_features)
train_data['Label'] = y_train
train_data.to_csv('preprocessed_train_data.csv', index=False)

# Save testing data
test_data = pd.DataFrame(X_test, columns=selected_features)
test_data['Label'] = y_test
test_data.to_csv('preprocessed_test_data.csv', index=False)


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the processed dataset
file_path = r"C:\Users\hp\Desktop\ML projects\CYBER_AI\preprocessed_train_data2.csv"  # Update with your processed dataset path
df = pd.read_csv(file_path)

# Check for missing values in the dataset
print("Missing values in the dataset:\n", df.isnull().sum())

# Check for missing values in the target variable
print("Missing values in the target variable:\n", df['Label'].isnull().sum())  # Update 'Label' as necessary

# Drop rows with missing values (if any)
df.dropna(subset=['Label'], inplace=True)  # Only drop if Label has NaN values
df.dropna(inplace=True)  # Drop any rows with missing values in other features

# Separate features and labels
X = df.drop(columns=['Label'])  # Replace 'Label' with your actual label column name
y = df['Label']  # Assuming 'Label' is the target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
model = RandomForestClassifier(random_state=42)

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Optional: Classification report for detailed metrics
print(classification_report(y_test, y_pred))


Missing values in the dataset:
 Event ID               0
Timestamp              0
Source IP              0
Destination IP         0
User Agent             0
Attack Type            0
Attack Severity        0
Data Exfiltrated       0
Threat Intelligence    0
Response Action        0
Label                  0
dtype: int64
Missing values in the target variable:
 0
Accuracy: 0.88
              precision    recall  f1-score   support

         0.0       0.89      0.99      0.94       323
         1.0       0.00      0.00      0.00        38

    accuracy                           0.88       361
   macro avg       0.45      0.49      0.47       361
weighted avg       0.80      0.88      0.84       361



In [7]:
from sklearn.metrics import classification_report

# Assuming you already have y_test and y_pred
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.89      0.99      0.94       323
         1.0       0.00      0.00      0.00        38

    accuracy                           0.88       361
   macro avg       0.45      0.49      0.47       361
weighted avg       0.80      0.88      0.84       361

