# In [2]:  (notebook cell marker)
# ================================
# 1. IMPORT LIBRARIES
# ================================
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ================================
# 2. LOAD DATASET
# ================================
# The path is relative, so the CSV must be in the current working
# directory; pd.read_csv raises FileNotFoundError otherwise.
df = pd.read_csv("customer_churn.csv")

# ================================
# 3. BASIC EXPLORATION
# ================================
print(df.head())
print(df.tail())
print(df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit an extra
# "None" line after the summary.
df.info()
print(df.describe())

# ================================
# 4. CHECK MISSING VALUES
# ================================
# Per-column count of null cells.
print(df.isnull().sum())

# ================================
# 5. DATA CLEANING
# ================================
# Convert TotalCharges to numeric (common telecom issue)
if 'TotalCharges' in df.columns:
    # errors='coerce' turns entries that cannot be parsed as numbers into NaN.
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
    # Fill the NaNs with the column mean and assign the result back to the
    # frame. A chained `df[col].fillna(..., inplace=True)` operates on an
    # intermediate object: it is deprecated in pandas 2.x and leaves `df`
    # unchanged when Copy-on-Write is enabled.
    df['TotalCharges'] = total_charges.fillna(total_charges.mean())

# ================================
# 6. ENCODE CATEGORICAL COLUMNS
# ================================
# Every object-dtype column is replaced by integer codes. A separate fitted
# encoder is kept per column in `label_encoders` so the original labels can
# be recovered later with label_encoders[col].inverse_transform(...);
# refitting one shared encoder would discard every mapping except the last.
label_encoders = {}
le = LabelEncoder()  # keeps `le` defined even if there are no object columns
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# ================================
# 7. EDA – CHURN DISTRIBUTION
# ================================
# Bar chart of how many rows fall into each value of the Churn column.
churn_ax = sns.countplot(data=df, x='Churn')
churn_ax.set_title("Churn Distribution")
plt.show()

# ================================
# 8. CONTRACT TYPE VS CHURN
# ================================
# Only drawn when the dataset has a Contract column.
if 'Contract' in df:
    contract_ax = sns.barplot(data=df, x='Contract', y='Churn')
    contract_ax.set_title("Contract Type vs Churn")
    plt.show()

# ================================
# 9. CORRELATION HEATMAP
# ================================
# Pairwise correlation of all columns, drawn on a 12x8 inch figure.
corr_matrix = df.corr()
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(corr_matrix, linewidths=0.5, cmap='coolwarm')
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

# ================================
# 10. MONTHLY CHARGES VS CHURN
# ================================
# Only drawn when the dataset has a MonthlyCharges column.
if 'MonthlyCharges' in df:
    charges_ax = sns.boxplot(data=df, x='Churn', y='MonthlyCharges')
    charges_ax.set_title("Monthly Charges vs Churn")
    plt.show()

# ================================
# 11. FEATURE & TARGET SPLIT
# ================================
# Target is the Churn column; features are every remaining column.
y = df['Churn']
X = df.drop(columns=['Churn'])

# ================================
# 12. TRAIN-TEST SPLIT
# ================================
# 80/20 split with a fixed seed for reproducibility. stratify=y keeps the
# proportion of each Churn class the same in the train and test sets, so
# the evaluation is not skewed by an unlucky draw on an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ================================
# 13. FEATURE SCALING
# ================================
# The scaler is fitted on the training features only and then applied to
# both splits, so no statistics from the test set are used for scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# ================================
# 14. LOGISTIC REGRESSION
# ================================
# Fit on the training split, predict the test split, then report accuracy
# and the per-class classification report.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_report = classification_report(y_test, lr_pred)

print("\nLOGISTIC REGRESSION RESULTS")
print("Accuracy:", lr_accuracy)
print(lr_report)

# ================================
# 15. DECISION TREE
# ================================
# Fixed random_state; same fit / predict / report sequence as above.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_report = classification_report(y_test, dt_pred)

print("\nDECISION TREE RESULTS")
print("Accuracy:", dt_accuracy)
print(dt_report)

# ================================
# 16. RANDOM FOREST
# ================================
# Fixed random_state; the fitted model and its predictions are reused by
# the confusion-matrix and feature-importance sections below.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)

print("\nRANDOM FOREST RESULTS")
print("Accuracy:", rf_accuracy)
print(rf_report)

# ================================
# 17. CONFUSION MATRIX (BEST MODEL)
# ================================
# Counts of actual vs. predicted classes for the random forest predictions,
# rendered as an annotated heatmap with integer cell labels.
cm = confusion_matrix(y_test, rf_pred)

cm_ax = sns.heatmap(cm, cmap='Blues', annot=True, fmt='d')
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
cm_ax.set_title("Confusion Matrix - Random Forest")
plt.show()

# ================================
# 18. FEATURE IMPORTANCE
# ================================
# Label the random forest's importance scores with the feature column
# names, order them from highest to lowest, and show the ten largest.
raw_importances = pd.Series(rf.feature_importances_, index=X.columns)
feature_importance = raw_importances.sort_values(ascending=False)
top_factors = feature_importance.head(10)

print("\nTOP FACTORS INFLUENCING CHURN")
print(top_factors)

# ================================
# 19. FINAL BUSINESS INSIGHTS
# ================================
# Fixed summary text; printed one line per entry after the heading.
insight_lines = (
    "1. Month-to-month contract customers churn more.",
    "2. High monthly charges increase churn risk.",
    "3. Long-tenure customers are more loyal.",
    "4. Contract type strongly impacts churn.",
    "5. Payment and service usage affect retention.",
)
print("\nBUSINESS INSIGHTS:")
for insight in insight_lines:
    print(insight)

# ================================
# 20. CONCLUSION
# ================================
# Name the best model from the measured test accuracies instead of
# hard-coding it, so the printed conclusion always matches the results.
# Random Forest is listed first so that, on an exact tie, max() keeps the
# original wording.
model_accuracies = {
    "Random Forest": accuracy_score(y_test, rf_pred),
    "Logistic Regression": accuracy_score(y_test, lr_pred),
    "Decision Tree": accuracy_score(y_test, dt_pred),
}
best_model_name = max(model_accuracies, key=model_accuracies.get)

print("\nCONCLUSION:")
print(f"{best_model_name} performed best in predicting churn.")
print("Understanding churn drivers helps improve customer retention.")


# Captured output from the notebook export (the dataset file was not found):
# FileNotFoundError: [Errno 2] No such file or directory: 'customer_churn.csv'