In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Read the training CSV into a DataFrame.
# NOTE(review): the absolute "/content/..." location looks environment-specific
# (e.g. a hosted notebook) — confirm the path before running elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/Training Dataset.csv")

In [4]:
# Columns fed to the anomaly detector.
selected_columns = [
    # numeric measures
    'Financial Loss (in Million $)', 'Number of Affected Users',
    'Incident Resolution Time (in Hours)',
    # categorical descriptors
    'Country', 'Attack Type', 'Attack Source', 'Security Vulnerability Type',
]

# Work on an independent copy so later cells can overwrite columns
# without modifying the raw frame `df`.
df_selected = df.loc[:, selected_columns].copy()

In [5]:
# Show the first five rows of the selected columns.
df_selected.head(n=5)

Unnamed: 0,Financial Loss (in Million $),Number of Affected Users,Incident Resolution Time (in Hours),Country,Attack Type,Attack Source,Security Vulnerability Type
0,80.53,773169,63,China,Phishing,Hacker Group,Unpatched Software
1,62.19,295961,71,China,Ransomware,Hacker Group,Unpatched Software
2,38.65,605895,20,India,Man-in-the-Middle,Hacker Group,Weak Passwords
3,41.44,659320,7,UK,Ransomware,Nation-state,Social Engineering
4,74.41,810682,68,Germany,Man-in-the-Middle,Insider,Social Engineering


In [6]:
# Step 1: Label Encoding for categorical features
#
# Each categorical column gets its own LabelEncoder. The fitted encoders are
# kept in `label_encoders` (keyed by column name) so that new records can be
# encoded with the same category -> integer mapping later on.

categorical_cols = ['Country','Attack Type', 'Attack Source', 'Security Vulnerability Type']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Fit on the raw values in `df`, not on `df_selected[col]`: this cell
    # overwrites df_selected[col] with integer codes, so fitting on it would
    # make a re-run learn those integers as the classes and the encoder could
    # no longer transform the original string categories.
    df_selected[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [7]:
# Step 2: Scale numerical features
#
# The fitted `scaler` is reused later to transform new records, so it must
# always hold the statistics of the raw training values.

numerical_cols = ['Financial Loss (in Million $)', 'Number of Affected Users', 'Incident Resolution Time (in Hours)']
scaler = StandardScaler()
# Fit on the raw columns of `df`, not on `df_selected`: this cell overwrites
# df_selected[numerical_cols] with standardised values, so fitting on it would
# make a re-run learn mean ~0 / std ~1 and silently stop scaling new data.
df_selected[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [8]:
# Show the first five rows of df_selected.
df_selected.head(n=5)

Unnamed: 0,Financial Loss (in Million $),Number of Affected Users,Incident Resolution Time (in Hours),Country,Attack Type,Attack Source,Security Vulnerability Type
0,1.040626,0.928052,1.289283,2,3,0,1
1,0.40409,-0.718243,1.678366,2,4,0,1
2,-0.412926,0.350982,-0.802039,5,2,0,2
3,-0.316092,0.53529,-1.434299,8,4,2,0
4,0.828216,1.057467,1.53246,4,2,1,0


In [9]:
# Step 3: Apply Isolation Forest
#
# random_state is fixed so repeated runs give the same labels.

model = IsolationForest(contamination=0.05, random_state=42)
# Fit on the feature columns only. Passing the whole frame would, on a re-run
# of this cell, include the previously written 'Anomaly' column as an input
# feature and leave the model expecting one more column than new data has.
df_selected['Anomaly'] = model.fit_predict(df_selected[selected_columns])

In [10]:
# Step 4: Label convention: -1 = Anomaly, 1 = Normal
#
# The 'Anomaly' column is kept numeric because the summary cell renames the
# values 1 / -1 for display. The former `.map({1: 1, -1: -1})` was an identity
# mapping (a no-op that would silently turn any other value into NaN), so the
# convention is asserted explicitly instead.

assert df_selected['Anomaly'].isin([-1, 1]).all(), \
    "Unexpected values in 'Anomaly': expected only -1 (anomaly) and 1 (normal)"


In [11]:
# Step 5: Show value counts

# Count each label, then swap the numeric codes for readable names.
label_names = {1: 'Normal', -1: 'Anomaly'}
anomaly_counts = df_selected['Anomaly'].value_counts()
anomaly_counts = anomaly_counts.rename(index=label_names)
# Drop the index name so it is not shown as a header line in the printout.
anomaly_counts.index.name = None

print("Anomaly Detection Summary:")
print(anomaly_counts)

Anomaly Detection Summary:
Normal     2802
Anomaly     148
Name: count, dtype: int64


In [12]:
# Step 6: Preview

# First five rows, now including the 'Anomaly' column.
df_selected.head(n=5)

Unnamed: 0,Financial Loss (in Million $),Number of Affected Users,Incident Resolution Time (in Hours),Country,Attack Type,Attack Source,Security Vulnerability Type,Anomaly
0,1.040626,0.928052,1.289283,2,3,0,1,1
1,0.40409,-0.718243,1.678366,2,4,0,1,1
2,-0.412926,0.350982,-0.802039,5,2,0,2,1
3,-0.316092,0.53529,-1.434299,8,4,2,0,1
4,0.828216,1.057467,1.53246,4,2,1,0,1


In [None]:
# Step 7: Save to CSV

# Writes df_selected as it stands at this point (encoded / scaled feature
# values plus the 'Anomaly' column); the row index is not written.
df_selected.to_csv(path_or_buf="anomaly_detection_output.csv", index=False)

# **Predict on new data**

In [13]:
import pandas as pd

# One example record to score (replace with your actual unknown data).
# It is built as a single-row frame with the same column names as the
# training features.
new_data = pd.DataFrame([
    {
        'Financial Loss (in Million $)': 69.39,
        'Number of Affected Users': 358933,
        'Incident Resolution Time (in Hours)': 3,
        'Country': 'Germany',
        'Attack Type': 'Malware',
        'Attack Source': 'Nation-state',
        'Security Vulnerability Type': 'Zero-day',
    }
])

In [14]:
# Label encode using previously fitted encoders
#
# The columns are taken from `label_encoders` itself rather than from a
# re-typed list, so every column that was encoded for training is encoded here
# with the matching encoder.
for col, le in label_encoders.items():
    # Fail early with the column name when a category was never seen in
    # training; LabelEncoder.transform would also raise ValueError, but
    # without saying which column is at fault.
    unseen = set(new_data[col]) - set(le.classes_)
    if unseen:
        raise ValueError(
            f"Column '{col}' contains categories not seen during training: "
            f"{sorted(map(str, unseen))}"
        )
    new_data[col] = le.transform(new_data[col])

In [15]:
# Apply the already-fitted scaler (transform only, no re-fit) to the numeric
# columns of the new record and write the result back into new_data.
# Note: this overwrites the raw values, so running the cell twice scales twice.
scaled_values = scaler.transform(new_data[numerical_cols])
new_data[numerical_cols] = scaled_values

In [16]:
# Predict anomaly
#
# Select the training feature columns explicitly so the input has exactly the
# expected columns in the expected order, even if `new_data` was built with a
# different column order or carries extra columns.
prediction = model.predict(new_data[selected_columns])

# Interpret result: -1 means anomaly, anything else is reported as normal
print("Prediction:", "Anomaly" if prediction[0] == -1 else "Normal")

Prediction: Normal


In [17]:
import joblib

# Save the trained model
joblib.dump(model, 'isolation_forest_model.joblib')

# Also persist the fitted preprocessing objects. The model is applied to
# label-encoded and scaled inputs, so a reloaded model cannot score raw
# records without the same encoders, scaler and column lists.
joblib.dump(
    {
        'label_encoders': label_encoders,
        'scaler': scaler,
        'categorical_cols': categorical_cols,
        'numerical_cols': numerical_cols,
        'feature_columns': selected_columns,
    },
    'isolation_forest_preprocessing.joblib',
)

print("Model saved successfully as 'isolation_forest_model.joblib'")
print("Preprocessing artifacts saved as 'isolation_forest_preprocessing.joblib'")

Model saved successfully as 'isolation_forest_model.joblib'
