In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

# 📂 تحميل البيانات
df = pd.read_csv(r"C:\project grad linex\project\features\pair_pas_matrix.csv")
X = df.drop(columns=["label"])
y = df["label"]

# 🔀 تقسيم البيانات (80% train - 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ⚖️ حساب وزن الفئة (label 0 أقل من 1)
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# 🚀 إعداد النموذج
model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# 🏋️‍♂️ تدريب النموذج
model.fit(X_train, y_train)

# ✅ التقييم
y_pred = model.predict(X_test)
print("📊 Classification Report:\n", classification_report(y_test, y_pred))
print("📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("🔵 ROC-AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

# 💾 حفظ النموذج
model_path = r"C:\project grad linex\project\models\xgboost_model.pkl"
joblib.dump(model, model_path)
print(f"✅ تم حفظ النموذج في: {model_path}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.58      0.54        60
           1       0.80      0.74      0.77       131

    accuracy                           0.69       191
   macro avg       0.65      0.66      0.65       191
weighted avg       0.70      0.69      0.70       191

📉 Confusion Matrix:
 [[35 25]
 [34 97]]
🔵 ROC-AUC Score: 0.7447837150127226
✅ تم حفظ النموذج في: C:\project grad linex\project\models\xgboost_model.pkl


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# 📥 تحميل البيانات الأصلية
data_path = r"C:\project grad linex\project\features\pair_pas_matrix.csv"
df = pd.read_csv(data_path)

# 🧾 فصل الميزات عن التصنيفات
X = df.drop(columns=["label"])
y = df["label"]

# 📁 المسار المطلوب للحفظ
save_path = r"C:\project grad linex\project\models"
os.makedirs(save_path, exist_ok=True)

# ✂️ الخطوة 1: تقسيم إلى 85% تدريب + تحقق و 15% اختبار
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

# ✂️ الخطوة 2: تقسيم X_temp إلى 70% تدريب و15% تحقق
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.176, stratify=y_temp, random_state=42
)

# 💾 حفظ الملفات
X_train.to_csv(os.path.join(save_path, "X_train.csv"), index=False)
X_val.to_csv(os.path.join(save_path, "X_val.csv"), index=False)
X_test.to_csv(os.path.join(save_path, "X_test.csv"), index=False)

y_train.to_csv(os.path.join(save_path, "y_train.csv"), index=False)
y_val.to_csv(os.path.join(save_path, "y_val.csv"), index=False)
y_test.to_csv(os.path.join(save_path, "y_test.csv"), index=False)

print("✅ تم التقسيم بنجاح.")
print("📂 الملفات محفوظة في:", save_path)


✅ تم التقسيم بنجاح.
📂 الملفات محفوظة في: C:\project grad linex\project\models


In [None]:
#  xgboost_model_smote
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import joblib
import os

# 📁 مسار الملفات
base_path = r"C:\project grad linex\project\models"

# 📥 تحميل البيانات المقسمة
X_train = pd.read_csv(os.path.join(base_path, "X_train.csv"))
X_val   = pd.read_csv(os.path.join(base_path, "X_val.csv"))
X_test  = pd.read_csv(os.path.join(base_path, "X_test.csv"))

y_train = pd.read_csv(os.path.join(base_path, "y_train.csv")).values.ravel()
y_val   = pd.read_csv(os.path.join(base_path, "y_val.csv")).values.ravel()
y_test  = pd.read_csv(os.path.join(base_path, "y_test.csv")).values.ravel()

# ⚠️ تطبيق SMOTE فقط على مجموعة التدريب
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 🚀 إعداد وتدريب نموذج XGBoost
model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

model.fit(X_train_smote, y_train_smote)

# ✅ التقييم على مجموعة التحقق (Validation)
y_val_pred = model.predict(X_val)
print("📊 Validation Report:")
print(classification_report(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("ROC-AUC:", roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))

# 💾 حفظ النموذج المدرب
model_path = os.path.join(base_path, "xgboost_model_smote.pkl")
joblib.dump(model, model_path)
print(f"✅ النموذج تم حفظه بنجاح في: {model_path}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Validation Report:
              precision    recall  f1-score   support

           0       0.39      0.40      0.40        45
           1       0.72      0.71      0.72        98

    accuracy                           0.62       143
   macro avg       0.56      0.56      0.56       143
weighted avg       0.62      0.62      0.62       143

Confusion Matrix:
 [[18 27]
 [28 70]]
ROC-AUC: 0.6240362811791383
✅ النموذج تم حفظه بنجاح في: C:\project grad linex\project\models\xgboost_model_smote.pkl


In [12]:
# xgboost with smote and with train test validation

import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
import os

# 📁 مسار الملفات
base_path = r"C:\project grad linex\project\models"

# 📥 تحميل البيانات المقسمة
X_train = pd.read_csv(os.path.join(base_path, "X_train.csv"))
X_val   = pd.read_csv(os.path.join(base_path, "X_val.csv"))
X_test  = pd.read_csv(os.path.join(base_path, "X_test.csv"))

y_train = pd.read_csv(os.path.join(base_path, "y_train.csv")).values.ravel()
y_val   = pd.read_csv(os.path.join(base_path, "y_val.csv")).values.ravel()
y_test  = pd.read_csv(os.path.join(base_path, "y_test.csv")).values.ravel()

# ⚖️ حساب وزن الفئة للتوازن الداخلي في XGBoost
scale_pos_weight = y_train.tolist().count(0) / y_train.tolist().count(1)

# 🚀 إعداد وتدريب نموذج XGBoost باستخدام scale_pos_weight
model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

model.fit(X_train, y_train)

# ✅ التقييم على مجموعة التحقق (Validation)
y_val_pred = model.predict(X_val)
print("📊 Validation Report:")
print(classification_report(y_val, y_val_pred))
print("🧩 Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("🔵 ROC-AUC:", roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))

# 💾 حفظ النموذج
model_path = os.path.join(base_path, "xgboost_model_weighted.pkl")
joblib.dump(model, model_path)
print(f"✅ النموذج تم حفظه بنجاح في: {model_path}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Validation Report:
              precision    recall  f1-score   support

           0       0.42      0.47      0.44        45
           1       0.74      0.70      0.72        98

    accuracy                           0.63       143
   macro avg       0.58      0.59      0.58       143
weighted avg       0.64      0.63      0.63       143

🧩 Confusion Matrix:
 [[21 24]
 [29 69]]
🔵 ROC-AUC: 0.6224489795918369
✅ النموذج تم حفظه بنجاح في: C:\project grad linex\project\models\xgboost_model_weighted.pkl


In [None]:
import pandas as pd
import os
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

# 📁 المسار
base_path = r"C:\project grad linex\project\models"

# 🧪 تحميل البيانات المقسمة
X_train = pd.read_csv(os.path.join(base_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(base_path, "y_train.csv")).values.ravel()

X_val = pd.read_csv(os.path.join(base_path, "X_val.csv"))
y_val = pd.read_csv(os.path.join(base_path, "y_val.csv")).values.ravel()

# 🧠 تعريف نموذج XGBoost
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# 🔍 تعريف شبكة القيم للتجربة
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "scale_pos_weight": [1, 2, 3]
}

# ⚙️ إعداد الـ Grid Search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    verbose=1,
    n_jobs=-1
)

# 🚀 تشغيل البحث
grid_search.fit(X_train, y_train)

# ✅ أفضل نموذج
best_model = grid_search.best_estimator_

print("\n✅ Best Parameters:", grid_search.best_params_)
print("📈 Best F1 Score (CV):", grid_search.best_score_)

# 🔍 التقييم على مجموعة التحقق
y_val_pred = best_model.predict(X_val)
print("\n📊 Validation Classification Report:\n", classification_report(y_val, y_val_pred))
print("🔵 ROC-AUC:", roc_auc_score(y_val, best_model.predict_proba(X_val)[:, 1]))
print("📉 Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

# 💾 حفظ النموذج الأفضل
joblib.dump(best_model, os.path.join(base_path, "xgboost_best_grid.pkl"))
print("✅ النموذج الأمثل تم حفظه بنجاح.")


Fitting 5 folds for each of 54 candidates, totalling 270 fits

✅ Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'scale_pos_weight': 2}
📈 Best F1 Score (CV): 0.8157607012258307

📊 Validation Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.02      0.04        45
           1       0.69      1.00      0.82        98

    accuracy                           0.69       143
   macro avg       0.85      0.51      0.43       143
weighted avg       0.79      0.69      0.57       143

🔵 ROC-AUC: 0.6370748299319727
📉 Confusion Matrix:
 [[ 1 44]
 [ 0 98]]
✅ النموذج الأمثل تم حفظه بنجاح.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
