<a href="https://colab.research.google.com/github/Amasha03/Customer_Churn_Prediction/blob/main/notebooks/02_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Telco Customer Churn Prediction
### CM2604 Machine Learning Coursework
02-Preprocessing  

Student Name : Amasha Widanagamage  
RGU ID : 2425800  
IIT ID : 20241246

In [12]:
#import libraries
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import joblib

In [2]:
#Load dataset
# The CSV is read from a path under /content/drive, so Google Drive has to be
# mounted before read_csv is called; otherwise a fresh runtime may not find
# the file.
from google.colab import drive
drive.mount('/content/drive')
df=pd.read_csv('/content/drive/MyDrive/ML/coursework/Telco-Customer-Churn.csv')
print("Original shape:", df.shape)


Original shape: (7043, 21)


In [3]:
#Handle missing/ invalid values
# Blank cells are stored as a single space; turn them into real NaN so that
# pandas recognises them as missing.
df=df.replace(' ',np.nan)

#Convert TotalCharges to numeric
# Coerce BEFORE dropping rows: any entry that cannot be parsed becomes NaN and
# is removed by the same dropna() below. Doing the conversion afterwards (as
# before) meant the rows with blanks were already gone, so a follow-up
# imputation step had nothing to fill, and the printed shape did not account
# for values invalidated by the coercion.
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce')

# Drop every row that still has a missing value in any column.
df=df.dropna()
print("Shape after handling missing values:", df.shape)


Shape after handling missing values: (7032, 21)


In [4]:
#Encode Churn
# Show the raw label counts (NaN included) before any transformation.
print("\nChurn values before encoding:")
print(df['Churn'].value_counts(dropna=False))

# Normalise whitespace and casing, then translate the labels to 1/0.
# Labels that are not keys of the mapping become NaN.
churn_codes={'Yes':1,'No':0}
churn_labels=df['Churn'].str.strip().str.capitalize()
df['Churn']=churn_labels.map(churn_codes)

# Second count: any NaN listed here is a label the mapping did not cover.
print("\nChurn values after mapping:")
print(df['Churn'].value_counts(dropna=False))

# Remove rows whose label could not be mapped so the integer cast below works.
has_unmapped=df['Churn'].isnull().any()
if has_unmapped:
  df=df.dropna(subset=['Churn'])
  print("Dropped rows with NaN in churn.")

df['Churn']=df['Churn'].astype(int)


Churn values before encoding:
Churn
No     5163
Yes    1869
Name: count, dtype: int64

Churn values after mapping:
Churn
0    5163
1    1869
Name: count, dtype: int64


In [5]:
#Encoding binary column
binary_cols=['gender','Partner','Dependents','PhoneService','PaperlessBilling']
for col in binary_cols:
    df[col]=df[col].astype(str).str.strip().str.capitalize()
    df[col]=df[col].map({'Yes':1,'No':0,'Female':1,'Male':0})
    #Fill any NaN with mode
    if df[col].isnull().any():
      df[col].fillna(df[col].mode()[0])
    df[col]=df[col].astype(int)

In [6]:
#Scale numerical features
numerical_cols=['tenure','MonthlyCharges','TotalCharges']

# Fit the scaler on the training rows only, so that the mean/std used for
# scaling are not influenced by rows that later end up in the test set.
# The row selection uses the same test_size, random_state and stratification
# column as the train/test split performed later in this notebook, so it picks
# the same training rows. Keep these arguments in sync with that split.
train_pos,_=train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df['Churn'],
)

scaler=StandardScaler()
scaler.fit(df.iloc[train_pos][numerical_cols])

# Apply the training-set statistics to every row (train and test alike).
df[numerical_cols]=scaler.transform(df[numerical_cols])

In [7]:
#One-hot encode other categorical columns
# Multi-valued text columns that get expanded into indicator columns.
categorical_cols=[
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

# Cast to string and trim surrounding whitespace so that values differing only
# by padding do not end up as separate categories.
df[categorical_cols]=df[categorical_cols].apply(
    lambda column: column.astype(str).str.strip()
)

# drop_first=True omits the first level of each column from the output.
df=pd.get_dummies(df,columns=categorical_cols,drop_first=True)



In [8]:
#Drop customerID
# Remove the identifier column when it is present; errors='ignore' turns the
# call into a no-op when the column does not exist.
df=df.drop(columns=['customerID'],errors='ignore')

In [9]:
#Verify
# Collect the sanity-check lines first, then emit them in order: overall
# shape, column and row counts, label distribution and the total number of
# missing cells left in the frame.
report_lines=[
    f"\nFinal DataFrame shape: {df.shape}",
    f"Columns:{len(df.columns)}",
    f"Rows: {len(df)}",
    f"Churn distribution: {df['Churn'].value_counts().to_dict()}",
    f"NaN values: {df.isnull().sum().sum()}",
]
for report_line in report_lines:
    print(report_line)


Final DataFrame shape: (7032, 31)
Columns:31
Rows: 7032
Churn distribution: {0: 5163, 1: 1869}
NaN values: 0


In [10]:
#Split dataset
# Separate the target column from the feature matrix.
y=df['Churn']
X=df.drop(columns=['Churn'])

# 80/20 split with a fixed seed; stratify=y keeps the class proportions of the
# target the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report partition sizes and per-class counts (index 0 -> class 0, 1 -> class 1).
split_report=(
    f"\nTrain-Test split:",
    f"X_train: {X_train.shape}",
    f"X_test: {X_test.shape}",
    f"y_train classes: {np.bincount(y_train)}",
    f"y_test classes: {np.bincount(y_test)}",
)
for report_line in split_report:
    print(report_line)


Train-Test split:
X_train: (5625, 30)
X_test: (1407, 30)
y_train classes: [4130 1495]
y_test classes: [1033  374]


In [16]:
#Save preprocessed data
# Output directory, relative to the current working directory; it is created
# if missing and reused if it already exists.
folder="data"
os.makedirs(folder,exist_ok=True)

print("Folder created:"+folder)

# Each split is written to its own pickle file inside the folder.
artifacts=[
    (f"{folder}/X_train.pkl",X_train),
    (f"{folder}/X_test.pkl",X_test),
    (f"{folder}/y_train.pkl",y_train),
    (f"{folder}/y_test.pkl",y_test),
]
for target_path,payload in artifacts:
    joblib.dump(payload,target_path)

# List the folder contents as confirmation of what is now on disk.
print("Saved all files in folder:", os.listdir(folder))

Folder created:data
Saved all files in folder: ['y_test.pkl', 'y_train.pkl', 'X_train.pkl', 'X_test.pkl']
