In [47]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from imblearn.over_sampling import SMOTE


In [None]:
df = pd.read_csv("Global_Cybersecurity_Threats_2015-2024.csv")
df['AffectedUsersClass'] = pd.qcut(df['Number of Affected Users'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])
df['FinancialLossClass'] = pd.qcut(df['Financial Loss (in Million $)'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])

# التأكد من عدد العينات لكل فئة
print("عدد عينات فئات Affected Users بعد التقسيم:")
print(df['AffectedUsersClass'].value_counts())

print("\nعدد عينات فئات Financial Loss بعد التقسيم:")
print(df['FinancialLossClass'].value_counts())

عدد عينات فئات Affected Users بعد التقسيم:
AffectedUsersClass
Low          750
Medium       750
High         750
Very High    750
Name: count, dtype: int64

عدد عينات فئات Financial Loss بعد التقسيم:
FinancialLossClass
High         751
Low          750
Medium       750
Very High    749
Name: count, dtype: int64


In [49]:
print("عدد عينات فئات Affected Users:")
print(df['AffectedUsersClass'].value_counts())

print("\nعدد عينات فئات Financial Loss:")
print(df['FinancialLossClass'].value_counts())

عدد عينات فئات Affected Users:
AffectedUsersClass
Low          750
Medium       750
High         750
Very High    750
Name: count, dtype: int64

عدد عينات فئات Financial Loss:
FinancialLossClass
High         751
Low          750
Medium       750
Very High    749
Name: count, dtype: int64


In [50]:
X = df.drop(columns=[
    'Number of Affected Users', 'Financial Loss (in Million $)',
    'AffectedUsersClass', 'FinancialLossClass'
])
X['Year'] = X['Year'].astype(int)

In [51]:

categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

In [52]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False, drop='first'), categorical_features)
    ],
    remainder='passthrough'
)

X_encoded = preprocessor.fit_transform(X)

In [53]:
le_users = LabelEncoder()
y_users = le_users.fit_transform(df['AffectedUsersClass'])

le_loss = LabelEncoder()
y_loss = le_loss.fit_transform(df['FinancialLossClass'])

In [54]:
smote = SMOTE(random_state=42)
X_resampled_u, y_resampled_u = smote.fit_resample(X_encoded, y_users)

# oversampling لفئات Financial Loss
X_resampled_l, y_resampled_l = smote.fit_resample(X_encoded, y_loss)

In [55]:
X_train_u, X_test_u, y_train_u, y_test_u = train_test_split(
    X_encoded, y_users, test_size=0.2, random_state=42
)

X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X_encoded, y_loss, test_size=0.2, random_state=42
)

In [56]:
rf_params = {
    "use_label_encoder": False,
    "eval_metric": "mlogloss",
    "random_state": 42,
    "scale_pos_weight": 1  # يمكن تعديلها إذا كان imbalance شديد
}

In [57]:
xgb_users = xgb.XGBClassifier(**rf_params)
xgb_users.fit(X_train_u, y_train_u)
y_pred_u = xgb_users.predict(X_test_u)

xgb_loss = xgb.XGBClassifier(**rf_params)
xgb_loss.fit(X_train_l, y_train_l)
y_pred_l = xgb_loss.predict(X_test_l)

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [58]:
print("Classification report for Affected Users (XGBoost):")
print(classification_report(y_test_u, y_pred_u, target_names=le_users.classes_))

print("Classification report for Financial Loss (XGBoost):")
print(classification_report(y_test_l, y_pred_l, target_names=le_loss.classes_))

Classification report for Affected Users (XGBoost):
              precision    recall  f1-score   support

        High       0.23      0.20      0.21       152
         Low       0.26      0.25      0.26       153
      Medium       0.24      0.30      0.27       139
   Very High       0.22      0.20      0.21       156

    accuracy                           0.24       600
   macro avg       0.24      0.24      0.24       600
weighted avg       0.24      0.24      0.23       600

Classification report for Financial Loss (XGBoost):
              precision    recall  f1-score   support

        High       0.25      0.25      0.25       163
         Low       0.18      0.19      0.18       143
      Medium       0.24      0.24      0.24       148
   Very High       0.21      0.21      0.21       146

    accuracy                           0.22       600
   macro avg       0.22      0.22      0.22       600
weighted avg       0.22      0.22      0.22       600

