In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [65]:
# 1. Load Training Data

train_data = pd.read_csv("Customer_Data.csv")

# Map actual statuses to 0 (not churned) / 1 (churned)
status_map = {
    'Stayed': 0,
    'Joined': 0,
    'Churned': 1
}
train_data['Customer_Status'] = train_data['Customer_Status'].map(status_map)

# Drop rows with NaN target (if any)
train_data = train_data[train_data['Customer_Status'].isin([0,1])]

# Drop unnecessary columns
cols_to_drop = ['Customer_ID', 'Churn_Category', 'Churn_Reason']
train_data = train_data.drop(columns=[col for col in cols_to_drop if col in train_data.columns])

# Handle missing numeric values
num_cols = train_data.select_dtypes(include=[np.number]).columns
for col in num_cols:
    train_data[col] = train_data[col].fillna(train_data[col].median())

# Handle missing categorical values
cat_cols = train_data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in cat_cols:
    train_data[col] = train_data[col].fillna(train_data[col].mode()[0])
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le

# Split features and target
X = train_data.drop('Customer_Status', axis=1)
y = train_data['Customer_Status']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [66]:
# 2. Train Models

models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', probability=True, random_state=42)
}

for name, model in models.items():
    print(f"\n{name} Evaluation:")
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Logistic Regression Evaluation:
Accuracy: 0.8099688473520249
Confusion Matrix:
 [[860 105]
 [139 180]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.89      0.88       965
           1       0.63      0.56      0.60       319

    accuracy                           0.81      1284
   macro avg       0.75      0.73      0.74      1284
weighted avg       0.80      0.81      0.81      1284


Random Forest Evaluation:
Accuracy: 0.8348909657320872
Confusion Matrix:
 [[903  62]
 [150 169]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.94      0.89       965
           1       0.73      0.53      0.61       319

    accuracy                           0.83      1284
   macro avg       0.79      0.73      0.75      1284
weighted avg       0.83      0.83      0.83      1284


SVM Evaluation:
Accuracy: 0.8200934579439252
Confusion Matrix:
 [[869  96]
 [135 184]]
Classifi

In [67]:
# 3. Load Prediction Data

predict_data = pd.read_csv("Predicting_Data.csv")
original_data = predict_data.copy()  # Keep original for output

# Drop unnecessary columns
predict_data = predict_data.drop(columns=[col for col in cols_to_drop if col in predict_data.columns], errors='ignore')

# Keep only columns present in training
common_cols = [col for col in X.columns if col in predict_data.columns]
predict_data = predict_data[common_cols]

# Handle missing numeric values
num_cols = predict_data.select_dtypes(include=[np.number]).columns
for col in num_cols:
    predict_data[col] = predict_data[col].fillna(X[col].median())

# Handle categorical variables
cat_cols = predict_data.select_dtypes(include=['object']).columns
for col in cat_cols:
    if col in label_encoders:
        le = label_encoders[col]
        most_freq_class = le.classes_[0]
        predict_data[col] = predict_data[col].apply(lambda x: x if x in le.classes_ else most_freq_class)
        predict_data[col] = le.transform(predict_data[col])

# Scale prediction data
predict_data_scaled = scaler.transform(predict_data)

In [68]:
# 4. Make Predictions

for name, model in models.items():
    original_data[f'Customer_Status_Predicted_{name}'] = model.predict(predict_data_scaled)


# 5. Save Predictions

original_data.to_csv(r"C:\Users\DELL\OneDrive\Desktop\DATA_SCIENCE_PROJECT\Telecom Customer Churn Prediction\Predictions.csv", index=False)
print("Predictions saved successfully ")

Predictions saved successfully 
