In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


df = pd.read_csv("Global_Cybersecurity_Threats_2015-2024.csv")

In [11]:
# %% Step 1: Handle Missing Values
numeric_cols = [
    "Financial Loss (in Million $)",
    "Number of Affected Users",
    "Incident Resolution Time (in Hours)"
]

df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# تحقق من عدم وجود NaN بعد المعالجة
print(df[numeric_cols].isnull().sum())


Financial Loss (in Million $)          0
Number of Affected Users               0
Incident Resolution Time (in Hours)    0
dtype: int64


In [12]:
X = df.drop(columns=['Financial Loss (in Million $)', 'Number of Affected Users'])

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False, drop='first'), categorical_features)
    ],
    remainder='passthrough'
)

X_encoded = preprocessor.fit_transform(X)
print("X_encoded shape:", X_encoded.shape)


X_encoded shape: (3000, 32)


In [16]:
from sklearn.ensemble import IsolationForest
import numpy as np

# تحديد الشاذين
iso = IsolationForest(contamination=0.05, random_state=42)
outliers = iso.fit_predict(X_encoded)

# فهارس العينات الشاذة
outlier_indices = np.where(outliers == -1)[0]

# عدد العينات الشاذة
num_outliers = len(outlier_indices)
print("Number of outliers detected:", num_outliers)


Number of outliers detected: 150


In [18]:
X_clean_users = np.delete(X_encoded, outlier_indices, axis=0)
y_clean_users = np.delete(y_users, outlier_indices, axis=0)

print("New X shape:", X_clean_users.shape)
print("New y shape:", y_clean_users.shape)

NameError: name 'y_users' is not defined

In [13]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# استخدم الهدف الأول: Financial Loss
y = df['Financial Loss (in Million $)'].values

# نموذج Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_encoded, y)

# Feature importances
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature Importances:")
for i in range(len(importances)):
    print(f"{i+1}. Feature {indices[i]}: {importances[indices[i]]:.4f}")


Feature Importances:
1. Feature 31: 0.2393
2. Feature 30: 0.1182
3. Feature 24: 0.0268
4. Feature 23: 0.0268
5. Feature 11: 0.0259
6. Feature 25: 0.0258
7. Feature 21: 0.0256
8. Feature 22: 0.0252
9. Feature 29: 0.0247
10. Feature 27: 0.0245
11. Feature 13: 0.0245
12. Feature 9: 0.0236
13. Feature 26: 0.0225
14. Feature 28: 0.0224
15. Feature 19: 0.0223
16. Feature 20: 0.0223
17. Feature 12: 0.0222
18. Feature 10: 0.0222
19. Feature 18: 0.0206
20. Feature 17: 0.0204
21. Feature 16: 0.0203
22. Feature 0: 0.0198
23. Feature 8: 0.0190
24. Feature 14: 0.0182
25. Feature 2: 0.0181
26. Feature 6: 0.0180
27. Feature 7: 0.0177
28. Feature 5: 0.0174
29. Feature 1: 0.0172
30. Feature 15: 0.0170
31. Feature 4: 0.0169
32. Feature 3: 0.0146


In [14]:
# تحديد threshold للأهمية
threshold = 0.02
important_indices = [i for i, imp in enumerate(importances) if imp >= threshold]

# إنشاء X_encoded جديد بالميزات المهمة فقط
X_encoded_reduced = X_encoded[:, important_indices]

print("New X_encoded shape:", X_encoded_reduced.shape)


New X_encoded shape: (3000, 21)


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
y_financial_log = np.log1p(df['Financial Loss (in Million $)'].values)

# تقسيم البيانات
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# تدريب نموذج Random Forest
rf_financial = RandomForestRegressor(n_estimators=100, random_state=42)
rf_financial.fit(X_train, y_train)

# التنبؤ
y_pred_financial = rf_financial.predict(X_test)

# تقييم الأداء
mse = mean_squared_error(y_test, y_pred_financial)
r2 = r2_score(y_test, y_pred_financial)

print("MSE:", mse)
print("R2 Score:", r2)


MSE: 853.5314238669833
R2 Score: -0.05552305888696041
