<a href="https://colab.research.google.com/github/Arasavelli-Sai-Sankar/Machine-Learning/blob/main/Customer_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Step 1: Read the churn CSV into a DataFrame
# (point dataset_path at wherever the file lives in your environment)
dataset_path = "/content/Churn_Modelling.csv"
data = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
# Step 2: Explore the dataset (Optional, for debugging)
print("Dataset Shape:", data.shape)
print("\nDataset Sample:\n", data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() emits the summary out of order followed by "None".
# Print the header first, then let info() produce its own output.
print("\nColumn Info:")
data.info()

Dataset Shape: (10000, 14)

Dataset Sample:
    RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1

In [None]:
# Step 3: Preprocessing
# The target variable is 'Exited' (0: No Churn, 1: Churn).
# Identifier-like columns are excluded from the features: they do not describe
# customer behaviour, and a high-cardinality text column such as 'Surname'
# would be expanded by one-hot encoding into thousands of sparse columns that
# swamp the real predictors. 'customer_id' is kept in the list for datasets
# that use that spelling; names absent from the frame are skipped.
ID_COLUMNS = ['RowNumber', 'CustomerId', 'Surname', 'customer_id']

# Separate target (y) and features (X); `data` itself is left untouched so
# this cell can be re-run safely.
y = data['Exited']  # Target (0: No Churn, 1: Churn)
X = data.drop(columns=['Exited'] + ID_COLUMNS, errors='ignore')  # Features

# Handle the remaining categorical variables (e.g. Geography, Gender)
# using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for models like Logistic Regression).
# The scaler is fitted on the training split only to avoid leaking test
# statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Step 4: Fit the three classifiers on the training split.
# Estimator.fit() returns the estimator itself, so construction and
# training are chained into a single expression per model.

# Linear baseline
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Bagged tree ensemble
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Boosted tree ensemble
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)


In [None]:
# Step 5: Evaluate models
def evaluate_model(model, X_test, y_test):
    """Print accuracy, confusion matrix and classification report for a model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : feature matrix passed to ``model.predict``.
    y_test : true labels compared against the predictions.

    Returns
    -------
    None. All results are written to stdout.
    """
    y_pred = model.predict(X_test)
    print(f"\nModel: {model.__class__.__name__}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    # zero_division=0 reports 0.0 for a class the model never predicts
    # (same value as the default) without raising UndefinedMetricWarning
    # for every affected metric.
    print(
        "Classification Report:\n",
        classification_report(y_test, y_pred, zero_division=0),
    )

print("\n--- Model Evaluation ---")
# Report every trained classifier against the same held-out split.
for fitted_model in (log_reg, rf, gb):
    evaluate_model(fitted_model, X_test, y_test)


--- Model Evaluation ---

Model: LogisticRegression
Accuracy: 0.781
Confusion Matrix:
 [[1476  131]
 [ 307   86]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.92      0.87      1607
           1       0.40      0.22      0.28       393

    accuracy                           0.78      2000
   macro avg       0.61      0.57      0.58      2000
weighted avg       0.74      0.78      0.76      2000


Model: RandomForestClassifier
Accuracy: 0.8035
Confusion Matrix:
 [[1607    0]
 [ 393    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89      1607
           1       0.00      0.00      0.00       393

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.45      2000
weighted avg       0.65      0.80      0.72      2000


Model: GradientBoostingClassifier
Accuracy: 0.8675
Confusion Matrix:
 [[1547   60]
 [ 20

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Step 6: Feature Importance (Random Forest and Gradient Boosting)
# Only the highest-ranked features are shown: printing every column floods
# the output when one-hot encoding produces a very wide feature matrix.
TOP_N_FEATURES = 15

for model_label, fitted_model in (("Random Forest", rf), ("Gradient Boosting", gb)):
    # Building a Series raises if the number of importances does not match
    # the number of column names, instead of silently truncating like zip().
    ranked_importances = pd.Series(
        fitted_model.feature_importances_, index=X.columns
    ).sort_values(ascending=False)

    print(f"\n{model_label} Feature Importances:")
    for feature, importance in ranked_importances.head(TOP_N_FEATURES).items():
        print(f"{feature}: {importance:.4f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Surname_Fielding: 0.0002
Surname_Fields: 0.0000
Surname_Finch: 0.0000
Surname_Findlay: 0.0000
Surname_Fink: 0.0000
Surname_Fiore: 0.0000
Surname_Fiorentini: 0.0000
Surname_Fiorentino: 0.0000
Surname_Fischer: 0.0001
Surname_Fishbourne: 0.0000
Surname_Fisher: 0.0000
Surname_Fisk: 0.0000
Surname_Fitch: 0.0000
Surname_Fitts: 0.0000
Surname_Fitzgerald: 0.0000
Surname_Fitzpatrick: 0.0000
Surname_Flannagan: 0.0000
Surname_Flannery: 0.0000
Surname_Fleetwood-Smith: 0.0000
Surname_Fleming: 0.0001
Surname_Flemming: 0.0016
Surname_Fletcher: 0.0000
Surname_Flores: 0.0000
Surname_Floyd: 0.0000
Surname_Flynn: 0.0005
Surname_Fokina: 0.0000
Surname_Fokine: 0.0000
Surname_Foley: 0.0000
Surname_Folliero: 0.0000
Surname_Fomin: 0.0000
Surname_Fomina: 0.0000
Surname_Fontaine: 0.0000
Surname_Fontenot: 0.0000
Surname_Foran: 0.0004
Surname_Forbes: 0.0000
Surname_Ford: 0.0001
Surname_Forlonge: 0.0000
Surname_Forster: 0.0000
Surname_Forwood: 0.0000

In [None]:
# Step 7: Score a single customer with both tree ensembles.
# The first (already scaled) test row stands in for a new customer; it is
# given a leading axis so the estimators receive a 2-D, one-sample batch.
new_customer = np.expand_dims(X_test[0], axis=0)
rf_prediction, gb_prediction = (
    fitted_model.predict(new_customer) for fitted_model in (rf, gb)
)
print("\nRandom Forest Predicted Churn (0=No, 1=Yes):", rf_prediction[0])
print("Gradient Boosting Predicted Churn (0=No, 1=Yes):", gb_prediction[0])


Random Forest Predicted Churn (0=No, 1=Yes): 0
Gradient Boosting Predicted Churn (0=No, 1=Yes): 0
