<a href="https://colab.research.google.com/github/BebishaC/Customer-Churn-Prediction/blob/main/Customer_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Install libraries if needed:
# pip install pandas scikit-learn seaborn matplotlib imbalanced-learn

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# -----------------------------
# 1. LOAD DATA
# -----------------------------

# Point these at your own CSV files if they live somewhere else.
TRAIN_CSV = '/content/customer_churn_dataset-training-master.csv'
TEST_CSV = '/content/customer_churn_dataset-testing-master.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Report the (rows, columns) of each frame, train first.
for heading, frame in (("Train Data Shape:", train_df),
                       ("Test Data Shape:", test_df)):
    print(heading, frame.shape)

# Show the first rows of each frame as a quick sanity check.
for frame in (train_df, test_df):
    print(frame.head())

# -----------------------------
# 2. EDA (Train)
# -----------------------------

# Class counts of the target, printed and then drawn as a bar chart.
train_churn_counts = train_df['Churn'].value_counts()
print("\nTrain Churn Distribution:\n", train_churn_counts)

train_ax = sns.countplot(data=train_df, x='Churn')
train_ax.set_title("Churn Distribution in Train Data")
plt.show()

# -----------------------------
# 3. EDA (Test)
# -----------------------------

# The test file may or may not ship with labels; only plot when it does.
if 'Churn' not in test_df.columns:
    print("\nNo churn labels in test data (common in Kaggle).")
else:
    test_churn_counts = test_df['Churn'].value_counts()
    print("\nTest Churn Distribution:\n", test_churn_counts)

    test_ax = sns.countplot(data=test_df, x='Churn')
    test_ax.set_title("Churn Distribution in Test Data")
    plt.show()

# -----------------------------
# 4. DROP UNLABELED TRAINING ROWS
# -----------------------------
# A row with no 'Churn' value cannot be used for supervised training,
# so remove such rows from train_df (modified in place).
train_df.dropna(axis='index', subset=['Churn'], inplace=True)


# -----------------------------
# 5. ENCODE CATEGORICAL FEATURES
# -----------------------------

# Every object-dtype column of the training frame is label-encoded,
# except the ID column, which is never used as a feature.
cat_cols = [
    col
    for col in train_df.select_dtypes(include=['object']).columns
    if col not in ['CustomerID']
]

# One fitted encoder per column, so each mapping stays available afterwards
# (e.g. for inverse_transform) instead of being overwritten by the next column.
encoders = {}
for col in cat_cols:
    train_values = train_df[col].astype(str)
    encoder = LabelEncoder()

    if col in test_df.columns:
        test_values = test_df[col].astype(str)
        # Fit on the categories of both frames: LabelEncoder.transform raises
        # ValueError on a value it was not fitted on, so a category that only
        # appears in the test file would otherwise abort the script.
        encoder.fit(pd.concat([train_values, test_values], ignore_index=True))
        test_df[col] = encoder.transform(test_values)
    else:
        encoder.fit(train_values)

    train_df[col] = encoder.transform(train_values)
    encoders[col] = encoder

# -----------------------------
# 6. SPLIT FEATURES & LABELS
# -----------------------------

# Features are every training column except the ID and the target.
X_train = train_df.drop(columns=['CustomerID', 'Churn'])
y_train = train_df['Churn']

# For test data: features only. Select exactly the training feature columns
# (same names, same order) rather than dropping 'Churn' by name, because the
# test file may not contain a 'Churn' column at all and drop() would then
# raise KeyError.
X_test = test_df[X_train.columns]


# -----------------------------
# 7. TRAIN MODEL
# -----------------------------

# Random forest with a fixed seed so repeated runs give the same model.
# fit() returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# -----------------------------
# 8. PREDICT ON TEST DATA
# -----------------------------

y_test_pred = model.predict(X_test)

# Metrics can only be computed when the test file carries true labels.
if 'Churn' in test_df.columns:
    y_test_true = test_df['Churn']
    print("\n‚úÖ TEST RESULTS")
    # Each metric takes (true labels, predicted labels); print them in order.
    for heading, metric in (
        ("Accuracy:", accuracy_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Classification Report:\n", classification_report),
    ):
        print(heading, metric(y_test_true, y_test_pred))

# -----------------------------
# 9. CREATE SUBMISSION FILE (Kaggle-style)
# -----------------------------

# Two columns: the customer ID (written out under the header 'customerID')
# and the predicted churn label for that row.
submission = (
    test_df[['CustomerID']]
    .rename(columns={'CustomerID': 'customerID'})
    .assign(Churn=y_test_pred)
)

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission.csv', index=False)
print("\nSubmission file saved: submission.csv ‚úÖ")

# -----------------------------
# 10. FEATURE IMPORTANCE (Optional)
# -----------------------------

# Pair each importance score with its feature name, then chart the scores
# from most to least important.
importances = pd.Series(model.feature_importances_, index=X_train.columns)
ranked_importances = importances.sort_values(ascending=False)
ranked_importances.plot.bar(figsize=(12, 6))
plt.title("Feature Importances")
plt.show()

from sklearn.model_selection import train_test_split

# Pool both files and re-split them. ignore_index=True gives the pooled frame
# a fresh index; without it the row labels of train_df and test_df repeat.
df = pd.concat([train_df, test_df], ignore_index=True)

# Only train_df had its unlabeled rows removed earlier; rows coming from
# test_df can still lack a 'Churn' value (or the column entirely, which
# concat fills with NaN). Drop them so stratification and the later
# resampling/training steps only ever see labeled rows.
df = df.dropna(subset=['Churn']).reset_index(drop=True)

X = df.drop(columns=['CustomerID', 'Churn'])
y = df['Churn']

# Stratified 80/20 split with a fixed seed: both parts keep the class ratio
# of y and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

# Oversample with SMOTE on the training split only (X_train, y_train);
# the held-out split is left untouched.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

# Class counts after resampling.
balanced_counts = y_train_res.value_counts()
print("Balanced y_train:\n", balanced_counts)

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Random forest trained on the SMOTE-resampled training split.
model = RandomForestClassifier(class_weight='balanced', random_state=42)

# fit and predict are separate statements; predict runs once, on the
# held-out split that was not resampled.
model.fit(X_train_res, y_train_res)
y_test_pred = model.predict(X_test)

# Compare predictions against the held-out labels.
acc = accuracy_score(y_test, y_test_pred)
print("Accuracy:", acc)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))
