In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data Generation (Simulating a customer dataset for demo purposes)
np.random.seed(42)
n_samples = 1000
data = {
    'customer_id': range(1, n_samples + 1),
    'tenure': np.random.randint(1, 72, n_samples),
    'monthly_charges': np.random.uniform(20, 120, n_samples),
    'total_charges': np.random.uniform(50, 5000, n_samples),
    'contract_type': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_samples),
    'internet_service': np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples),
    'payment_method': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_samples),
    'churn': np.random.choice([0, 1], n_samples, p=[0.73, 0.27])  # Imbalanced dataset
}
df = pd.DataFrame(data)

# Save dataset for reference
df.to_csv('customer_churn_data.csv', index=False)

# 2. Data Preprocessing
# Handle missing values (none in this simulated data, but included for completeness)
df.fillna(df.mean(numeric_only=True), inplace=True)

# Encode categorical variables
categorical_cols = ['contract_type', 'internet_service', 'payment_method']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Feature engineering: Create a new feature (e.g., average monthly charge)
df_encoded['avg_monthly_charge'] = df_encoded['total_charges'] / (df_encoded['tenure'] + 1)  # Avoid division by zero

# Define features and target
X = df_encoded.drop(['customer_id', 'churn'], axis=1)
y = df_encoded['churn']

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 4. Split Data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 5. Train Logistic Regression Model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# 6. Evaluate Logistic Regression Model
y_pred_log_reg = log_reg.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_log_reg:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_log_reg))

# Confusion Matrix Plot
cm = confusion_matrix(y_test, y_pred_log_reg)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Logistic Regression Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('log_reg_confusion_matrix.png')
plt.close()

# 7. Train Random Forest Model (for comparison)
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# 8. Evaluate Random Forest Model
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_rf:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Feature Importance Plot
feature_importance = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(8, 6))
sns.barplot(x=feature_importance, y=feature_importance.index)
plt.title('Random Forest Feature Importance')
plt.xlabel('Importance')
plt.savefig('feature_importance.png')
plt.close()

# 9. Save Models (for deployment or later use)
import joblib
joblib.dump(log_reg, 'log_reg_model.pkl')
joblib.dump(rf, 'rf_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# 10. Example Prediction for a New Customer
new_customer = X.iloc[0:1].copy()  # Take first customer as example
new_customer_scaled = scaler.transform(new_customer)
pred_log_reg = log_reg.predict(new_customer_scaled)
pred_rf = rf.predict(new_customer_scaled)
print("\nNew Customer Prediction:")
print(f"Logistic Regression: {'Churn' if pred_log_reg[0] == 1 else 'No Churn'}")
print(f"Random Forest: {'Churn' if pred_rf[0] == 1 else 'No Churn'}")

Logistic Regression Results:
Accuracy: 0.53
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.52      0.53       147
           1       0.53      0.53      0.53       148

    accuracy                           0.53       295
   macro avg       0.53      0.53      0.53       295
weighted avg       0.53      0.53      0.53       295


Random Forest Results:
Accuracy: 0.70
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.71      0.70       147
           1       0.71      0.70      0.70       148

    accuracy                           0.70       295
   macro avg       0.70      0.70      0.70       295
weighted avg       0.70      0.70      0.70       295


New Customer Prediction:
Logistic Regression: No Churn
Random Forest: No Churn
