In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Load dataset
df = pd.read_csv(r"D:\task 1\customer_churn_dataset-training-master.csv")

# Drop CustomerID
df.drop("CustomerID", axis=1, inplace=True)

# Drop the one row that has all NaNs
df.dropna(inplace=True)

# Convert target to integer
df["Churn"] = df["Churn"].astype(int)

# One-hot encode categorical columns
df = pd.get_dummies(df, drop_first=True)


In [40]:
# Split features and target
X = df.drop("Churn", axis=1)
y = df["Churn"]

# Impute missing values if any
imputer = SimpleImputer(strategy="most_frequent")
X = imputer.fit_transform(X)

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


In [41]:
# Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_val)

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_val)

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_val)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [42]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

def evaluate_model(y_true, y_pred, name):
    print(f"\n📌 {name}")
    print("✅ Accuracy:", accuracy_score(y_true, y_pred))
    print("📊 Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("📋 Classification Report:\n", classification_report(y_true, y_pred))
    print("🧠 ROC-AUC Score:", roc_auc_score(y_true, y_pred))

# تقييم الموديلات
evaluate_model(y_val, y_pred_log, "Logistic Regression")
evaluate_model(y_val, y_pred_rf, "Random Forest")
evaluate_model(y_val, y_pred_xgb, "XGBoost")



📌 Logistic Regression
✅ Accuracy: 0.8964465162702598
📊 Confusion Matrix:
 [[34558  3505]
 [ 5625 44479]]
📋 Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.91      0.88     38063
           1       0.93      0.89      0.91     50104

    accuracy                           0.90     88167
   macro avg       0.89      0.90      0.90     88167
weighted avg       0.90      0.90      0.90     88167

🧠 ROC-AUC Score: 0.8978246690280691

📌 Random Forest
✅ Accuracy: 0.999625710299772
📊 Confusion Matrix:
 [[38061     2]
 [   31 50073]]
📋 Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     38063
           1       1.00      1.00      1.00     50104

    accuracy                           1.00     88167
   macro avg       1.00      1.00      1.00     88167
weighted avg       1.00      1.00      1.00     88167

🧠 ROC-AUC Score: 0.9996643712287229

📌 XGBoost
✅ Accu

In [43]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# ✅ Load training data (to get columns structure)
train_df = pd.read_csv(r"D:\task 1\customer_churn_dataset-training-master.csv")
for col in train_df.select_dtypes(include="object").columns:
    if col != "Churn":
        train_df[col] = LabelEncoder().fit_transform(train_df[col].astype(str))
train_processed = train_df.copy()

# ✅ Load test data
test_df = pd.read_csv(r"D:\task 1\customer_churn_dataset-testing-master.csv")
ids = test_df["CustomerID"]
test_df.drop("CustomerID", axis=1, inplace=True)

# ✅ Encode categorical test data
for col in test_df.select_dtypes(include="object").columns:
    if col != "Churn":
        test_df[col] = LabelEncoder().fit_transform(test_df[col].astype(str))

# ✅ Split test features and target (if exists)
if "Churn" in test_df.columns:
    y_test_true = test_df["Churn"].astype(int)
    X_test = test_df.drop("Churn", axis=1)
else:
    y_test_true = None
    X_test = test_df

# ✅ Ensure test data has all train columns used in imputer
train_cols = imputer.feature_names_in_
for col in train_cols:
    if col not in X_test.columns:
        X_test[col] = 0

X_test = X_test[train_cols]  # Reorder columns exactly

# ✅ Apply same imputer and scaler from training
X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

# ✅ Predict using trained XGBoost model
test_preds = xgb_model.predict(X_test_scaled)

# ✅ Evaluate if true labels exist
if y_test_true is not None:
    print("✅ Test Accuracy:", accuracy_score(y_test_true, test_preds))
    print("📋 Test Classification Report:\n", classification_report(y_test_true, test_preds))
    print("🧠 ROC-AUC Score:", roc_auc_score(y_test_true, test_preds))

# ✅ Save predictions
output = pd.DataFrame({"CustomerID": ids, "PredictedChurn": test_preds})
output.to_csv("test_predictions_final.csv", index=False)
print("✅ Predictions saved to: test_predictions_final.csv")


✅ Test Accuracy: 0.5012893404169385
📋 Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.05      0.10     33881
           1       0.49      1.00      0.65     30493

    accuracy                           0.50     64374
   macro avg       0.74      0.53      0.38     64374
weighted avg       0.75      0.50      0.36     64374

🧠 ROC-AUC Score: 0.5261946233226413
✅ Predictions saved to: test_predictions_final.csv
