In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from imblearn.pipeline import Pipeline  # ✅ Switched to imblearn Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from imblearn.over_sampling import SMOTE  # ✅ Replaced SMOTEENN with SMOTE
import joblib
import os

In [3]:
# تحميل البيانات
data = pd.read_csv('../Data/es.csv')

In [4]:
# إزالة عمود معرف العميل
data.drop('customerID', axis=1, inplace=True)

In [5]:
# تحويل TotalCharges إلى رقمية ومعالجة القيم المفقودة
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = data['TotalCharges'].fillna(data['TotalCharges'].median())

In [6]:
# فصل المتغير الهدف
y = data['Churn']

In [7]:
# ترميز المتغير الهدف
le_y = LabelEncoder()
y = le_y.fit_transform(y)

In [8]:
# حذف العمود الهدف من الميزات
X = data.drop('Churn', axis=1)

In [9]:
# تحديد الأعمدة الثنائية والمتعددة والفئوية والرقمية
binary_cols = [col for col in X.select_dtypes(include='object').columns if X[col].nunique() == 2]
multicat_cols = [col for col in X.select_dtypes(include='object').columns if X[col].nunique() > 2]
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [10]:
# دالة لترميز الأعمدة الثنائية
def binary_encode(df, columns):
    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    return df

In [11]:
# تطبيق الترميز
X = binary_encode(X, binary_cols)

In [12]:
# إنشاء المحول العام للأعمدة
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', sparse_output=False), multicat_cols)
])

In [13]:
# تقسيم البيانات إلى تدريب واختبار
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)



In [14]:
# تعريف النماذج المستخدمة
models = {
    "LogisticRegression": LogisticRegression(class_weight={0: 1, 1: 2}, max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "SVM": SVC(probability=True, class_weight={0: 1, 1: 2}, C=3.0, kernel='rbf', random_state=42)
}


In [15]:
# دالة لإنشاء البايبلاين الكامل
def build_pipeline(clf):
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('balancer', SMOTE(random_state=42)),
        ('classifier', clf)
    ])


In [16]:
# دالة لتدريب وتقييم النموذج
def evaluate(model_name, threshold=0.5):
    print(f"\n🔍 Evaluating: {model_name} with threshold={threshold}")
    pipeline = build_pipeline(models[model_name])
    pipeline.fit(X_train, y_train)

    # التنبؤ باحتمالات والتعديل على العتبة لو لزم
    if threshold != 0.5 and hasattr(pipeline.named_steps['classifier'], 'predict_proba'):
        y_train_probs = pipeline.predict_proba(X_train)[:, 1]
        y_train_pred = (y_train_probs >= threshold).astype(int)
    else:
        y_train_pred = pipeline.predict(X_train)

    # تقارير الأداء
    print("📊 Confusion Matrix (Train):")
    print(confusion_matrix(y_train, y_train_pred))
    print("📄 Classification Report (Train):")
    print(classification_report(y_train, y_train_pred))
    print("🔁 Train Recall:", recall_score(y_train, y_train_pred))

    return pipeline


In [17]:
# تجربة النماذج المختلفة
log_model = evaluate("LogisticRegression", threshold=0.4)
knn_model = evaluate("KNN")
svm_model = evaluate("SVM")



🔍 Evaluating: LogisticRegression with threshold=0.4
📊 Confusion Matrix (Train):
[[2084 2055]
 [  89 1406]]
📄 Classification Report (Train):
              precision    recall  f1-score   support

           0       0.96      0.50      0.66      4139
           1       0.41      0.94      0.57      1495

    accuracy                           0.62      5634
   macro avg       0.68      0.72      0.61      5634
weighted avg       0.81      0.62      0.64      5634

🔁 Train Recall: 0.9404682274247491

🔍 Evaluating: KNN with threshold=0.5
📊 Confusion Matrix (Train):
[[3488  651]
 [ 110 1385]]
📄 Classification Report (Train):
              precision    recall  f1-score   support

           0       0.97      0.84      0.90      4139
           1       0.68      0.93      0.78      1495

    accuracy                           0.86      5634
   macro avg       0.82      0.88      0.84      5634
weighted avg       0.89      0.86      0.87      5634

🔁 Train Recall: 0.9264214046822743

🔍 Evalua

In [18]:
# اختيار النموذج النهائي
final_model = log_model  # يمكن تغييره حسب الأداء

# حفظ النموذج والبيانات المساعدة
os.makedirs("../Model", exist_ok=True)
os.makedirs("../Scaler", exist_ok=True)
joblib.dump(final_model, "../Model/fraud_model.pkl")
joblib.dump(X.columns.tolist(), "../Model/model_columns.pkl")
joblib.dump(le_y, "../Model/target_encoder.pkl")

print("✅ Model and pipeline saved successfully.")


✅ Model and pipeline saved successfully.
