In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the raw incident dataset into `df`.
# NOTE(review): the absolute path looks like a Colab upload location — adjust
# it when running elsewhere.
dataset_path = "/content/Training Dataset.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Columns kept for anomaly detection: three numeric incident metrics followed
# by four categorical descriptors.
selected_columns = [
    # numeric metrics
    'Financial Loss (in Million $)',
    'Number of Affected Users',
    'Incident Resolution Time (in Hours)',
    # categorical descriptors (label-encoded in a later cell)
    'Country',
    'Attack Type',
    'Attack Source',
    'Security Vulnerability Type',
]

# Take an independent copy so the in-place edits made to df_selected later
# never modify `df`.
df_selected = df.loc[:, selected_columns].copy()

In [4]:
# First five rows of the selected columns, before any encoding or scaling.
df_selected.head()

Unnamed: 0,Financial Loss (in Million $),Number of Affected Users,Incident Resolution Time (in Hours),Country,Attack Type,Attack Source,Security Vulnerability Type
0,80.53,773169,63,China,Phishing,Hacker Group,Unpatched Software
1,62.19,295961,71,China,Ransomware,Hacker Group,Unpatched Software
2,38.65,605895,20,India,Man-in-the-Middle,Hacker Group,Weak Passwords
3,41.44,659320,7,UK,Ransomware,Nation-state,Social Engineering
4,74.41,810682,68,Germany,Man-in-the-Middle,Insider,Social Engineering


In [5]:
# Step 1: Label Encoding for categorical features
#
# Each categorical column in df_selected is replaced by the integer codes
# produced by its own LabelEncoder. The fitted encoders are kept in
# `label_encoders`, keyed by column name, so the same mapping can be applied
# to new rows later.

categorical_cols = ['Country','Attack Type', 'Attack Source', 'Security Vulnerability Type']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Fit on the untouched values in `df`, not on df_selected[col]: this cell
    # overwrites df_selected[col], so fitting on it would make a re-run learn
    # the already-encoded (or already-scaled) numbers instead of the original
    # category values, and the stored encoder could no longer transform raw
    # categories for new data.
    df_selected[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [6]:
# Step 2: Scale features
#
# Standardizes every model input with a single StandardScaler. Note that
# feature_cols lists the label-encoded categorical columns as well as the
# numeric ones, so all seven columns are scaled. The fitted `scaler` is reused
# later to transform new rows.

feature_cols = [
    'Financial Loss (in Million $)',
    'Number of Affected Users',
    'Incident Resolution Time (in Hours)',
    'Country', 'Attack Type', 'Attack Source',
    'Security Vulnerability Type'
]

# Rebuild the unscaled feature matrix from `df` and the fitted label encoders
# instead of reading df_selected, which this cell overwrites. Fitting on
# df_selected would make a re-run fit the scaler on already-standardized
# values, leaving a scaler that no longer matches the raw units of new data.
features_unscaled = df[feature_cols].copy()
for col in feature_cols:
    if col in label_encoders:
        features_unscaled[col] = label_encoders[col].transform(df[col])

scaler = StandardScaler()
df_selected[feature_cols] = scaler.fit_transform(features_unscaled)

In [7]:
# First five rows after label encoding and scaling.
df_selected.head()

Unnamed: 0,Financial Loss (in Million $),Number of Affected Users,Incident Resolution Time (in Hours),Country,Attack Type,Attack Source,Security Vulnerability Type
0,1.040626,0.928052,1.289283,-0.873644,0.299463,-1.402007,-0.459986
1,0.40409,-0.718243,1.678366,-0.873644,0.880277,-1.402007,-0.459986
2,-0.412926,0.350982,-0.802039,0.170059,-0.28135,-1.402007,0.428076
3,-0.316092,0.53529,-1.434299,1.213761,0.880277,0.408816,-1.348048
4,0.828216,1.057467,1.53246,-0.177842,-0.28135,-0.496595,-1.348048


In [21]:
# Step 3: Apply Isolation Forest

# Columns fed to the model. 'Anomaly' is the prediction column written at the
# end of this cell, so it is filtered out to keep it from ever being a feature.
training_features = list(filter(lambda name: name != 'Anomaly', feature_cols))

# Fixed random_state so repeated runs build the same forest.
model = IsolationForest(random_state=42, contamination=0.05, n_estimators=100)

# Fit and score on the same feature matrix.
train_matrix = df_selected[training_features]
model.fit(train_matrix)

# Store the model's prediction for every row in a new 'Anomaly' column
# (-1 = anomaly, 1 = normal).
df_selected['Anomaly'] = model.predict(train_matrix)

In [10]:
# Step 4: Map -1 = Anomaly, 1 = Normal

# The mapping keeps the existing codes unchanged (-1 stays -1, 1 stays 1);
# any value not listed in the mapping would become NaN.
anomaly_codes = {-1: -1, 1: 1}
df_selected['Anomaly'] = df_selected['Anomaly'].map(anomaly_codes)


In [11]:
# Step 5: Show value counts

# Count rows per prediction code, then swap the numeric codes for readable
# labels in the printed summary.
label_names = {1: 'Normal', -1: 'Anomaly'}
anomaly_counts = df_selected['Anomaly'].value_counts()
anomaly_counts = anomaly_counts.rename(index=label_names)
# Clear the index name so it is not printed as an extra header line.
anomaly_counts.index.name = None

print("Anomaly Detection Summary:", anomaly_counts, sep="\n")

Anomaly Detection Summary:
Normal     2802
Anomaly     148
Name: count, dtype: int64


In [12]:
# Step 6: Preview

# First five rows, now including the 'Anomaly' column.
df_selected.head()

Unnamed: 0,Financial Loss (in Million $),Number of Affected Users,Incident Resolution Time (in Hours),Country,Attack Type,Attack Source,Security Vulnerability Type,Anomaly
0,1.040626,0.928052,1.289283,-0.873644,0.299463,-1.402007,-0.459986,1
1,0.40409,-0.718243,1.678366,-0.873644,0.880277,-1.402007,-0.459986,1
2,-0.412926,0.350982,-0.802039,0.170059,-0.28135,-1.402007,0.428076,1
3,-0.316092,0.53529,-1.434299,1.213761,0.880277,0.408816,-1.348048,1
4,0.828216,1.057467,1.53246,-0.177842,-0.28135,-0.496595,-1.348048,1


In [13]:
# Step 7: Save to CSV

# df_selected is written without its index column. At this point it holds the
# encoded and scaled feature values plus the 'Anomaly' column, not the
# original raw values.
output_path = "anomaly_detection_output.csv"
df_selected.to_csv(output_path, index=False)

# **Predict on new data**

In [14]:
import pandas as pd

# One example incident to score (replace the values with your actual unknown
# data). Categorical fields are given as raw strings; they are label-encoded
# in the next cell.
sample_incident = {
    'Financial Loss (in Million $)': 69.39,
    'Number of Affected Users': 358933,
    'Incident Resolution Time (in Hours)': 3,
    'Country': 'Germany',
    'Attack Type': 'Malware',
    'Attack Source': 'Nation-state',
    'Security Vulnerability Type': 'Zero-day',
}

# Wrap each value in a one-element list to get a single-row DataFrame.
new_data = pd.DataFrame({column: [value] for column, value in sample_incident.items()})

In [15]:
# Convert the categorical strings in new_data to integer codes with the
# encoders stored in label_encoders, so the codes match the training data.
encoded_columns = ('Country', 'Attack Type', 'Attack Source', 'Security Vulnerability Type')
for col in encoded_columns:
    new_data[col] = label_encoders[col].transform(new_data[col])

In [16]:
# Standardize every feature column of new_data with the previously fitted
# scaler (feature_cols includes the label-encoded categorical columns too).
# The result is written back in place, so running this cell twice would scale
# the already-scaled values again.
scaled_row = scaler.transform(new_data[feature_cols])
new_data[feature_cols] = scaled_row

In [22]:
# Feature columns passed to the model; 'Anomaly' is filtered out so only the
# input features are used.
prediction_features = [name for name in feature_cols if name != 'Anomaly']

# Score the prepared row with the fitted model.
prediction = model.predict(new_data[prediction_features])

# -1 is reported as an anomaly; anything else is reported as normal.
if prediction[0] == -1:
    verdict = "Anomaly"
else:
    verdict = "Normal"
print("Prediction:", verdict)

Prediction: Normal


In [23]:
# Save model & tools
import joblib

# Persist the fitted model together with the preprocessing objects and the
# feature column list, each to its own file in the working directory.
artifacts = {
    'isolation_forest_model.joblib': model,
    'scaler.joblib': scaler,
    'label_encoders.joblib': label_encoders,
    'feature_columns.joblib': feature_cols,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("Model and tools saved successfully.")

Model and tools saved successfully.
