In [4]:
# Notebook-wide imports, listed once each and sorted alphabetically by module.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# Load the dataset into `df`, which the following cells read from.
# The path is relative, so it is resolved against the kernel's current working directory.
df = pd.read_csv("Global_Cybersecurity_Threats_2015-2024.csv")

In [5]:
# Correlate affected-user counts with financial loss after a log1p transform
# (log1p(x) = log(1 + x), so zero values stay finite).
log_users, log_loss = (
    np.log1p(df[column])
    for column in ('Number of Affected Users', 'Financial Loss (in Million $)')
)

# Pearson measures linear association between the two log-transformed series;
# Spearman measures monotonic (rank-based) association.
# Each call is unpacked into (coefficient, p-value).
pearson_corr, pearson_p = pearsonr(log_users, log_loss)
spearman_corr, spearman_p = spearmanr(log_users, log_loss)

print("Pearson correlation:", pearson_corr, "p-value:", pearson_p)
print("Spearman correlation:", spearman_corr, "p-value:", spearman_p)

Pearson correlation: -0.007022020410552449 p-value: 0.7006402410338618
Spearman correlation: 0.0017342787487140197 p-value: 0.9243538715105244


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# New regression target: log1p of the number of affected users.
y_users = np.log1p(df['Number of Affected Users'].values)

# Feature frame: every column except the two target columns, with Year cast to int.
X = df.drop(
    columns=['Financial Loss (in Million $)', 'Number of Affected Users']
).astype({'Year': int})

# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

# One-hot encode the categorical columns (dropping the first level of each, dense output);
# all remaining columns are passed through unchanged.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(sparse_output=False, drop='first'), categorical_features)],
    remainder='passthrough',
)

# Fit the preprocessor on the full feature frame and produce the encoded matrix.
X_encoded_users = preprocessor.fit_transform(X)
print("X_encoded shape for Number of Affected Users:", X_encoded_users.shape)


X_encoded shape for Number of Affected Users: (3000, 32)


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded_users, y_users, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training rows (fit() returns the estimator itself),
# then predict the held-out rows.
rf_users = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_users = rf_users.predict(X_test)

# Report the split sizes and a sample of the predictions.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print("Predictions for Number of Affected Users:", y_pred_users[:10])


X_train shape: (2400, 32)
X_test shape: (600, 32)
y_train shape: (2400,)
y_test shape: (600,)
Predictions for Number of Affected Users: [12.74583142 12.79742517 13.23471946 13.04707779 12.86921837 12.9144776
 12.79325086 12.8039724  12.43994661 12.75718447]


In [8]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out predictions. The target was built with np.log1p, so both
# metrics are measured in log1p space rather than in raw user counts.
mse_users = mean_squared_error(y_true=y_test, y_pred=y_pred_users)
r2_users = r2_score(y_true=y_test, y_pred=y_pred_users)

print("MSE (Number of Affected Users):", mse_users)
print("R2 Score (Number of Affected Users):", r2_users)


MSE (Number of Affected Users): 1.2109244501763374
R2 Score (Number of Affected Users): -0.05249031572306451
