# **Customer Churn Prediction**

In [1]:
import pandas as pd
from google.colab import drive
# Colab-only setup: mount Google Drive at /content/drive so that files stored
# under MyDrive can be read with ordinary file paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Read the churn CSV from the mounted Drive folder into a single frame, `df`.
csv_path = '/content/drive/MyDrive/Machine Learning Lab/employee_churn_dataset.csv'
df = pd.read_csv(csv_path)

In [11]:
# Display the loaded frame; the notebook renders a truncated preview
# (first and last rows with an ellipsis row in between).
df

Unnamed: 0,Age,Gender,Income,MaritalStatus,Education,Employment,WorkExperience,Location,HealthStatus,Churn
0,50-60,Female,Low,Divorced,PhD,Employed,0,Urban,Good,1
1,>60,Female,High,Married,High School,Employed,38,Urban,Good,0
2,40-50,Female,High,Married,PhD,Unemployed,31,Suburban,Excellent,1
3,>60,Male,Low,Divorced,Bachelor,Employed,28,Rural,Average,1
4,>60,Male,Medium,Single,PhD,Unemployed,16,Rural,Average,0
...,...,...,...,...,...,...,...,...,...,...
99995,30-40,Male,Medium,Married,Bachelor,Employed,19,Rural,Average,1
99996,<30,Male,High,Married,PhD,Unemployed,22,Suburban,Excellent,0
99997,>60,Female,High,Single,High School,Unemployed,12,Urban,Good,0
99998,50-60,Male,Low,Single,High School,Employed,30,Suburban,Excellent,1


In [12]:
# Class balance of the target: number of rows for each distinct Churn value.
df.value_counts(subset="Churn")

Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
0,50215
1,49785


In [13]:
# Show every column label of the loaded frame (the "Churn" target is included).
column_labels = df.columns
print(column_labels)

Index(['Age', 'Gender', 'Income', 'MaritalStatus', 'Education', 'Employment',
       'WorkExperience', 'Location', 'HealthStatus', 'Churn'],
      dtype='object')


In [14]:
# Number of missing entries in each column (a Series indexed by column name).
missing_values = df.isna().sum()
print(missing_values)

Age               0
Gender            0
Income            0
MaritalStatus     0
Education         0
Employment        0
WorkExperience    0
Location          0
HealthStatus      0
Churn             0
dtype: int64


In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Turn the loaded frame into an all-numeric table (`df_processed`):
#   * rows containing missing or infinite values are dropped,
#   * numeric feature columns are standardized,
#   * every other feature column is replaced by integer codes,
#   * the "Churn" target is carried over unchanged.
# The cleaned data lives in `df_clean`; `df` itself is not rebound here, so the
# frame that was loaded stays available after this cell has run.

# Treat +/-inf as missing, then drop every row that has a missing value.
df_clean = df.replace([np.inf, -np.inf], np.nan).dropna().copy()

# Separate target
y = df_clean["Churn"]
X = df_clean.drop(columns=["Churn"])  # drop() returns a new frame; safe to modify

# Identify column types. Every numeric dtype counts as numeric and everything
# else is encoded, so no column is skipped because of its exact dtype name.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

# Scale numeric columns.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed in later cells, so test-row statistics influence the scaling.
scaler = StandardScaler()
if len(num_cols) > 0:  # fit_transform rejects input with zero feature columns
    X[num_cols] = X[num_cols].astype(float)
    X[num_cols] = scaler.fit_transform(X[num_cols])

# Encode categorical columns; the fitted encoder of each column is kept in
# `encoders` so codes can be mapped back to the original labels.
# NOTE(review): LabelEncoder numbers labels in sorted order, so ordered
# categories do not receive codes in their natural order (in the processed
# preview the Age label "<30" is coded 3, after "50-60").
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

# Final dataset: processed features plus the untouched target column.
df_processed = X.copy()
df_processed["Churn"] = y

In [16]:
# Rebind `df` to an independent copy of the processed frame.
# NOTE(review): this replaces whatever `df` held before; re-running the
# preprocessing cell after this one would start from already-processed data.
df = df_processed.copy(deep=True)

In [17]:
# Display the frame after preprocessing (truncated preview): categorical
# columns appear as integer codes and WorkExperience as standardized floats.
df

Unnamed: 0,Age,Gender,Income,MaritalStatus,Education,Employment,WorkExperience,Location,HealthStatus,Churn
0,2,0,1,0,3,0,-1.694952,2,2,1
1,4,0,0,1,1,0,1.598997,2,2,0
2,1,0,0,1,3,1,0.992217,1,1,1
3,4,1,1,0,0,0,0.732169,0,0,1
4,4,1,2,2,3,1,-0.308026,0,0,0
...,...,...,...,...,...,...,...,...,...,...
99995,0,1,2,1,0,0,-0.047977,0,0,1
99996,3,1,0,1,3,1,0.212071,1,1,0
99997,4,0,0,2,1,1,-0.654757,2,2,0
99998,2,1,1,2,1,0,0.905534,1,1,1


In [18]:
# Re-check the class balance on the current `df`: rows per distinct Churn value.
df.value_counts(subset="Churn")

Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
0,50215
1,49785


## **Churn Prediction with GaussianNB**

In [20]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, classification_report
)

# Baseline: Gaussian Naive Bayes fitted on 80% of the rows and evaluated on the
# held-out 20%.

## 1) Split into X (features) and y (target)
# Read from `df_processed` (the frame built by the preprocessing cell) rather
# than the re-bound `df`, so this cell takes the same explicit input as the SVM
# cell and does not rely on `df` having been overwritten with processed data.
X = df_processed.drop(columns=["Churn"]).values.astype(np.float32)
y = df_processed["Churn"].values.astype(np.int64)

## Train/test split (stratified keeps 0/1 ratio similar in both sets)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## 2) Train Naive Bayes (GaussianNB models each feature as a Gaussian per class)
# NOTE(review): most feature columns are integer category codes rather than
# continuous measurements, so the Gaussian assumption is only a rough fit here.
nb = GaussianNB()
nb.fit(X_train, y_train)


## Predict
y_pred = nb.predict(X_test)


## Metrics
# average="binary" scores the positive class (label 1); zero_division=0 makes a
# metric 0 instead of warning when its denominator is empty.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average="binary", zero_division=0)
rec = recall_score(y_test, y_pred, average="binary", zero_division=0)
f1 = f1_score(y_test, y_pred, average="binary", zero_division=0)

print("Naive Bayes (GaussianNB) Evaluation")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))


Naive Bayes (GaussianNB) Evaluation
Accuracy : 0.4981
Precision: 0.4948
Recall   : 0.3821
F1-score : 0.4312

Classification Report:
              precision    recall  f1-score   support

           0     0.5002    0.6132    0.5510     10043
           1     0.4948    0.3821    0.4312      9957

    accuracy                         0.4981     20000
   macro avg     0.4975    0.4977    0.4911     20000
weighted avg     0.4975    0.4981    0.4914     20000



## **Churn Prediction with SVM**

In [21]:
import numpy as np
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Baseline: linear-kernel SVM fitted on 80% of the processed rows and scored on
# the held-out 20%.

# Feature matrix (float32) and label vector (int64) taken from the processed frame.
X = df_processed.drop(columns=["Churn"]).to_numpy().astype(np.float32)
y = df_processed["Churn"].to_numpy().astype(np.int64)

# Stratified hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the classifier: linear kernel, C=1.0. No class_weight is passed, so the
# classes are not re-weighted.
svm = SVC(kernel="linear", C=1.0, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = svm.predict(X_test)

# Headline metrics; precision/recall/F1 are reported for the positive class.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average="binary", zero_division=0)
rec = recall_score(y_test, y_pred, average="binary", zero_division=0)
f1 = f1_score(y_test, y_pred, average="binary", zero_division=0)

print(
    "SVM (Linear Kernel) Evaluation",
    f"Accuracy : {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall   : {rec:.4f}",
    f"F1-score : {f1:.4f}",
    sep="\n",
)

# Per-class breakdown of the same predictions.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))


SVM (Linear Kernel) Evaluation
Accuracy : 0.5020
Precision: 0.4999
Recall   : 0.5022
F1-score : 0.5010

Classification Report:
              precision    recall  f1-score   support

           0     0.5042    0.5019    0.5031     10043
           1     0.4999    0.5022    0.5010      9957

    accuracy                         0.5020     20000
   macro avg     0.5021    0.5021    0.5020     20000
weighted avg     0.5021    0.5020    0.5021     20000



# **Thank You**