# Libraries

In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression , LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# Loading dataset from Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime so the CSV files under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the training CSV from the mounted Drive into df_train and preview the
# first rows.
filepath= '/content/drive/MyDrive/ICT /Dataset/Training_data.csv'
df_train= pd.read_csv(filepath)
df_train.head()

In [None]:
# Read the testing CSV from the mounted Drive into df_test and preview the
# first rows. `filepath` is rebound here to the testing file.
filepath= '/content/drive/MyDrive/ICT /Dataset/Testing_data.csv'
df_test= pd.read_csv(filepath)
df_test.head()

# Exploratory Data Analysis (EDA)

In [None]:
df_train.shape

In [None]:
df_test.shape

In [None]:
df_train.info()

In [None]:
df_test.info()

In [None]:
df_train.describe()

In [None]:
df_test.describe()

## Missing Values Identification

In [None]:
df_train.isna().sum()

In [None]:
df_test.isna().sum()

## Duplicate Identification

In [None]:
df_train.duplicated().sum()

In [None]:
df_test.duplicated().sum()

In [None]:
df_train.nunique()

In [None]:
df_test.nunique()

## Outlier Identification

In [None]:
# Draw one box plot per numeric training column. Text columns (IDs, yes/no
# flags, ...) are skipped because a box plot needs numeric values; each figure
# is titled so the plots can be told apart.
for col in df_train.select_dtypes(include='number').columns:
    sns.boxplot(x=df_train[col])
    plt.title(f"Box plot of {col} (train)")
    plt.show()

In [None]:
# Draw one box plot per numeric test column. Text columns (IDs, yes/no
# flags, ...) are skipped because a box plot needs numeric values; each figure
# is titled so the plots can be told apart.
for col in df_test.select_dtypes(include='number').columns:
    sns.boxplot(x=df_test[col])
    plt.title(f"Box plot of {col} (test)")
    plt.show()

## Visualization

Scatterplot

In [None]:
# Train dataset: TotalCharges against the Churn label, coloured by gender.
sns.scatterplot(data=df_train, x='TotalCharges', y='Churn', hue="gender")
plt.title("TotalCharges vs Churn (train)")

In [None]:
# Test dataset: TotalCharges against the Churn label, coloured by gender.
sns.scatterplot(data=df_test, x='TotalCharges', y='Churn', hue="gender")
plt.title("TotalCharges vs Churn (test)")

Histogram

In [None]:
# Train: distribution of the PaymentMethod column, labelled through the Axes
# object returned by Series.hist.
payment_axes = df_train['PaymentMethod'].hist(bins=30)
payment_axes.set_title('Histogram of PaymentMethod')
payment_axes.set_xlabel('PaymentMethod')
payment_axes.set_ylabel('Frequency')
plt.show()

In [None]:
# Test: distribution of the PaymentMethod column, labelled through the Axes
# object returned by Series.hist.
payment_axes = df_test['PaymentMethod'].hist(bins=30)
payment_axes.set_title('Histogram of PaymentMethod')
payment_axes.set_xlabel('PaymentMethod')
payment_axes.set_ylabel('Frequency')
plt.show()

KDE plot

In [None]:
# Kernel density estimate of the training MonthlyCharges values; missing
# entries are dropped before plotting.
sns.kdeplot(df_train['MonthlyCharges'].dropna())
plt.title('KDE of MonthlyCharges')
plt.show()

Pairplot

In [None]:
# Pairwise relationship grid for the training frame, coloured by the Churn label.
sns.pairplot(df_train, hue='Churn')

In [None]:
# Pairwise relationship grid for the test frame, coloured by the Churn label.
sns.pairplot(df_test, hue='Churn')

In [None]:
# Keep only the int64/float64 columns of the training frame; this subset feeds
# the correlation matrix.
numeric_dtypes = ['int64', 'float64']
df_train_num = df_train.select_dtypes(include=numeric_dtypes)
df_train_num.head()

Visualize Correlation Matrix (Heatmap)

In [None]:
# Pairwise correlation between the numeric training columns
# (DataFrame.corr defaults to Pearson).
corr_matrix = df_train_num.corr()
corr_matrix

In [None]:
# Heatmap of the correlation matrix, with each coefficient annotated in its cell.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

# Data Preprocessing

## Missing values handling

In [None]:
df_train.isna().sum()

In [None]:
# Fill missing gender entries with the most frequent value in the training data.
most_common_gender = df_train['gender'].mode()[0]
df_train['gender'] = df_train['gender'].fillna(most_common_gender)

In [None]:
# Fill missing OnlineSecurity entries with the most frequent value in the training data.
most_common_security = df_train['OnlineSecurity'].mode()[0]
df_train['OnlineSecurity'] = df_train['OnlineSecurity'].fillna(most_common_security)

In [None]:
# Fill missing MonthlyCharges entries with the training-set mean.
mean_monthly_charge = df_train['MonthlyCharges'].mean()
df_train['MonthlyCharges'] = df_train['MonthlyCharges'].fillna(mean_monthly_charge)

In [None]:
# TotalCharges may still hold non-numeric text at this point (it is coerced
# with pd.to_numeric in the feature-engineering section), so convert it first:
# unparseable entries become NaN. Then fill the gaps with the median, because
# the mode of a continuous amount is not a meaningful replacement value.
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'], errors='coerce')
df_train['TotalCharges'] = df_train['TotalCharges'].fillna(df_train['TotalCharges'].median())

In [None]:
df_train.isna().sum()

## Feature Engineering

In [None]:
# Convert TotalCharges to a numeric dtype; errors='coerce' turns any value that
# cannot be parsed into NaN instead of raising.
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'], errors='coerce')

In [None]:
# Convert TotalCharges to a numeric dtype; errors='coerce' turns any value that
# cannot be parsed into NaN instead of raising.
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'], errors='coerce')

In [None]:
# Replace each contract label with the number listed for it below; labels
# missing from the table become NaN.
contract_months = {'Month-to-month': 1, 'One year': 12, 'Two year': 24}
df_train['Contract'] = df_train['Contract'].map(contract_months)

# Interaction feature: tenure multiplied by the numeric contract value.
df_train['Tenure_Contract'] = df_train['tenure'].mul(df_train['Contract'])

In [None]:
# CLV feature: tenure multiplied by the monthly charge.
df_train['CLV'] = df_train['tenure'].mul(df_train['MonthlyCharges'])

In [None]:
# Replace each contract label with the number listed for it below; labels
# missing from the table become NaN.
contract_months = {'Month-to-month': 1, 'One year': 12, 'Two year': 24}
df_test['Contract'] = df_test['Contract'].map(contract_months)

# Interaction feature: tenure multiplied by the numeric contract value.
df_test['Tenure_Contract'] = df_test['tenure'].mul(df_test['Contract'])

In [None]:
# CLV feature: tenure multiplied by the monthly charge.
df_test['CLV'] = df_test['tenure'].mul(df_test['MonthlyCharges'])

## Scaling

In [None]:
# Min-max scale the listed numeric training columns: learn the per-column range
# first, then apply it to the same columns.
num_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
min_scale = MinMaxScaler()
min_scale.fit(df_train[num_cols])
df_train[num_cols] = min_scale.transform(df_train[num_cols])
df_train

In [None]:
# Scale the test columns with the scaler that was fitted on the training data.
# Fitting a second scaler on the test set would put train and test on different
# scales and leak test-set statistics into preprocessing. Test values outside
# the training range may therefore fall slightly outside [0, 1].
num_cols = ['SeniorCitizen', 'tenure',
            'MonthlyCharges', 'TotalCharges']
df_test[num_cols] = min_scale.transform(df_test[num_cols])
df_test

## Encoding Categorical values

In [None]:
df_train.info()

In [None]:
# Replace the gender labels with the integer codes produced by a LabelEncoder
# fitted on this column, then display the frame.
label_enc=LabelEncoder()
df_train['gender']=label_enc.fit_transform(df_train['gender'])
df_train

In [None]:
# Fresh encoder; it is refitted column by column in the cells that follow.
label_enc=LabelEncoder()
# Replace the Partner labels with integer codes.
df_train['Partner']=label_enc.fit_transform(df_train['Partner'])

In [None]:
# Refit the shared encoder on Dependents and store the integer codes.
df_train['Dependents']=label_enc.fit_transform(df_train['Dependents'])

In [None]:
# Refit the shared encoder on PhoneService and store the integer codes.
df_train['PhoneService']=label_enc.fit_transform(df_train['PhoneService'])

In [None]:
# Refit the shared encoder on PaperlessBilling and store the integer codes.
df_train['PaperlessBilling']=label_enc.fit_transform(df_train['PaperlessBilling'])

In [None]:
# Refit the shared encoder on the Churn target and store the integer codes,
# then display the frame.
df_train['Churn']=label_enc.fit_transform(df_train['Churn'])
df_train

In [None]:
# One-hot encode the multi-category columns as 0/1 integer indicator columns,
# using the given prefix for each source column.
one_hot_columns = ['InternetService', 'Contract', 'PaymentMethod']
one_hot_prefixes = ['Internetservice', 'Contract', 'Paymentmethod']
df_train = pd.get_dummies(
    df_train,
    columns=one_hot_columns,
    prefix=one_hot_prefixes,
    dtype=int,
)
df_train

In [None]:
df_train.info()

In [None]:
# Lower-case each service flag and translate 'yes'/'no' to 1/0. Any other value
# is not in the table and therefore becomes NaN.
yes_no_codes = {'yes': 1, 'no': 0}
service_flag_columns = [
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
for flag_column in service_flag_columns:
    df_train[flag_column] = df_train[flag_column].str.lower().map(yes_no_codes)

In [None]:
# Remove the customerID column from the training frame.
df_train = df_train.drop(columns=['customerID'])

In [None]:
df_train.info()

In [None]:
df_train

In [None]:
df_train.isna().sum()

In [None]:
# Force every column to a numeric dtype; values that cannot be parsed become NaN.
df_train = df_train.apply(pd.to_numeric, errors='coerce')

# Fill the 0/1 service flags with their most frequent value, all in one call.
columns_to_impute_mode = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
mode_by_column = {name: df_train[name].mode()[0] for name in columns_to_impute_mode}
df_train = df_train.fillna(value=mode_by_column)

# Fill the remaining TotalCharges gaps with the column median.
total_charges_median = df_train['TotalCharges'].median()
df_train['TotalCharges'] = df_train['TotalCharges'].fillna(total_charges_median)

In [None]:
df_train.isna().sum()

In [None]:
df_test.info()

In [None]:
# Fresh encoder for the test frame; it is refitted column by column below.
# NOTE(review): the encoder is fitted on the test data, so its codes match the
# training codes only if both frames contain the same set of labels - confirm.
label_enc=LabelEncoder()
df_test['Partner']=label_enc.fit_transform(df_test['Partner'])

In [None]:
# Refit the shared encoder on the test gender column and store the integer codes.
df_test['gender']=label_enc.fit_transform(df_test['gender'])

In [None]:
# Refit the shared encoder on the test Dependents column and store the integer codes.
df_test['Dependents']=label_enc.fit_transform(df_test['Dependents'])

In [None]:
# Refit the shared encoder on the test PhoneService column and store the integer codes.
df_test['PhoneService']=label_enc.fit_transform(df_test['PhoneService'])

In [None]:
# Refit the shared encoder on the test PaperlessBilling column and store the integer codes.
df_test['PaperlessBilling']=label_enc.fit_transform(df_test['PaperlessBilling'])

In [None]:
# Refit the shared encoder on the test Churn target and store the integer codes.
# NOTE(review): the mapping is learned from the test labels alone; verify that it
# matches the mapping used for the training target.
df_test['Churn']=label_enc.fit_transform(df_test['Churn'])

In [None]:
# One-hot encode the multi-category columns as 0/1 integer indicator columns,
# using the given prefix for each source column.
one_hot_columns = ['InternetService', 'Contract', 'PaymentMethod']
one_hot_prefixes = ['Internetservice', 'Contract', 'Paymentmethod']
df_test = pd.get_dummies(
    df_test,
    columns=one_hot_columns,
    prefix=one_hot_prefixes,
    dtype=int,
)

In [None]:
df_test.info()


In [None]:
# Lower-case each service flag and translate 'yes'/'no' to 1/0. Any other value
# is not in the table and therefore becomes NaN.
yes_no_codes = {'yes': 1, 'no': 0}
service_flag_columns = [
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
for flag_column in service_flag_columns:
    df_test[flag_column] = df_test[flag_column].str.lower().map(yes_no_codes)

In [None]:
# Remove the customerID column from the test frame.
df_test = df_test.drop(columns=['customerID'])

In [None]:
df_test.info()

In [None]:
df_test.isna().sum()

In [None]:
# Force every column to a numeric dtype; values that cannot be parsed become NaN.
df_test = df_test.apply(pd.to_numeric, errors='coerce')

# Impute the test set with statistics taken from the TRAINING frame, so that no
# information from the test data flows into preprocessing and both frames are
# filled with the same replacement values.
columns_to_impute_mode = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
for col in columns_to_impute_mode:
    df_test[col] = df_test[col].fillna(df_train[col].mode()[0])

df_test['TotalCharges'] = df_test['TotalCharges'].fillna(df_train['TotalCharges'].median())

In [None]:
df_test.isna().sum()

# Model Building

In [None]:
# Separate the features from the Churn target for both frames.
X_train = df_train.drop('Churn', axis=1)
y_train = df_train['Churn']

X_test = df_test.drop('Churn', axis=1)
y_test = df_test['Churn']

# Convert TotalCharges to numeric, coercing errors to NaN
X_train['TotalCharges'] = pd.to_numeric(X_train['TotalCharges'], errors='coerce')

# Impute missing TotalCharges with the training median; the same value is
# reused for the test features so both sets are treated identically.
total_charges_median = X_train['TotalCharges'].median()
X_train['TotalCharges'] = X_train['TotalCharges'].fillna(total_charges_median)
X_test['TotalCharges'] = pd.to_numeric(X_test['TotalCharges'], errors='coerce').fillna(total_charges_median)

# The two frames were one-hot encoded separately, so their columns can differ in
# set or order. Align the test features to the training layout: indicator
# columns missing from the test frame are added as 0, and columns the models
# never saw are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Baseline classifiers. The random forest and the SVC (whose probability
# estimates rely on internal cross-validation) use randomness, so they get the
# same fixed seed as the other estimators in this notebook to make runs
# reproducible.
log_model = LogisticRegression(max_iter=1000)
rf_model = RandomForestClassifier(random_state=42)
svm_model = SVC(probability=True, random_state=42)

In [None]:
# Train each baseline classifier on the training split.
for estimator in (log_model, rf_model, svm_model):
    estimator.fit(X_train, y_train)
# Keep the fitted SVC as the cell's displayed value, as before.
svm_model

# Model Evaluation

In [None]:
def evaluate_model(model, name, X=None, y=None):
    """Print accuracy, precision, recall and F1 for a fitted classifier.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Label printed in the report header.
        X: Features to predict on. Defaults to the notebook-level ``X_test``.
        y: True labels matching ``X``. Defaults to the notebook-level ``y_test``.

    Returns:
        None. The metrics are written to standard output.
    """
    # Fall back to the global test split so existing two-argument calls keep
    # working unchanged.
    features = X_test if X is None else X
    labels = y_test if y is None else y

    # Only hard predictions feed the metrics below, so class probabilities are
    # not computed here.
    y_pred = model.predict(features)

    print(f"\n📌 Evaluation for {name}")
    print(f"Accuracy  : {accuracy_score(labels, y_pred):.4f}")
    print(f"Precision : {precision_score(labels, y_pred):.4f}")
    print(f"Recall    : {recall_score(labels, y_pred):.4f}")
    print(f"F1 Score  : {f1_score(labels, y_pred):.4f}")


# Report the metrics of every baseline model, in the same order as before.
baseline_reports = (
    (log_model, "Logistic Regression"),
    (rf_model, "Random Forest"),
    (svm_model, "Support Vector Machine"),
)
for fitted_model, report_label in baseline_reports:
    evaluate_model(fitted_model, report_label)


## Model Tuning

Fine-tuning with RandomizedSearchCV

In [None]:
# Candidate hyperparameter values sampled by the randomized search.
param_dist = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[5, 10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)



Model implementation with RandomForestClassifier

In [93]:
# Randomized hyperparameter search over a seeded random forest.
rf = RandomForestClassifier(random_state=42)
search_settings = {
    'param_distributions': param_dist,
    'n_iter': 50,        # number of random combinations to try
    'cv': 5,             # 5-fold cross-validation
    'scoring': 'f1',     # or 'roc_auc' depending on your goal
    'verbose': 1,
    'n_jobs': -1,
    'random_state': 42,
}
random_search = RandomizedSearchCV(rf, **search_settings)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [94]:
# Keep the refitted best estimator and show the winning parameter combination.
best_rf = random_search.best_estimator_
print(" Best Parameters:", random_search.best_params_, sep="\n")


 Best Parameters:
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 5, 'bootstrap': True}


In [None]:
# Predictions and positive-class probabilities (second predict_proba column)
# of the tuned forest on the test split.
y_pred = best_rf.predict(X_test)
y_proba = best_rf.predict_proba(X_test)[:, 1]

print("Evaluation Metrics (After Tuning):")
tuned_metrics = (
    ("Accuracy :", accuracy_score),
    ("F1 Score :", f1_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
)
for metric_label, metric_fn in tuned_metrics:
    print(metric_label, metric_fn(y_test, y_pred))

# Bagging

In [None]:
# Bagging ensemble of 50 decision trees with a fixed seed; fit() returns the
# estimator itself, so construction and training are chained.
base_tree = DecisionTreeClassifier()
bagging = BaggingClassifier(estimator=base_tree, n_estimators=50, random_state=42).fit(X_train, y_train)
y_pred_bag = bagging.predict(X_test)

In [None]:
# Test-set accuracy and per-class report for the bagging ensemble.
bagging_accuracy = accuracy_score(y_test, y_pred_bag)
bagging_report = classification_report(y_test, y_pred_bag)
print("Bagging Results:")
print("Accuracy:", bagging_accuracy)
print(bagging_report)

# Boosting

In [None]:
# AdaBoost with 100 rounds over depth-1 decision trees, fixed seed; fit()
# returns the estimator itself, so construction and training are chained.
decision_stump = DecisionTreeClassifier(max_depth=1)
adaboost = AdaBoostClassifier(estimator=decision_stump, n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_ada = adaboost.predict(X_test)

In [None]:
# Test-set accuracy and per-class report for AdaBoost.
adaboost_accuracy = accuracy_score(y_test, y_pred_ada)
adaboost_report = classification_report(y_test, y_pred_ada)
print("AdaBoost Results:")
print("Accuracy:", adaboost_accuracy)
print(adaboost_report)

In [None]:
# Gradient boosting with 100 stages and learning rate 0.1, fixed seed.
gradboost_settings = {'n_estimators': 100, 'learning_rate': 0.1, 'random_state': 42}
gradboost = GradientBoostingClassifier(**gradboost_settings)
gradboost.fit(X_train, y_train)
y_pred_gb = gradboost.predict(X_test)

In [None]:
# Test-set accuracy and per-class report for gradient boosting.
gradboost_accuracy = accuracy_score(y_test, y_pred_gb)
gradboost_report = classification_report(y_test, y_pred_gb)
print("\nGradient Boosting Results:")
print("Accuracy:", gradboost_accuracy)
print(gradboost_report)

# Cross Validation

In [None]:
# Ensembles to compare, keyed by the label printed in the report.
models = dict(Bagging=bagging, AdaBoost=adaboost, GradientBoost=gradboost)

print("\nCross Validation (5-fold):")
for name in models:
    model = models[name]
    # 5-fold accuracy on the training split.
    scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring="accuracy", cv=5)
    mean_accuracy = scores.mean()
    accuracy_spread = scores.std()
    print(f"{name}: Mean CV Accuracy = {mean_accuracy:.4f} (+/- {accuracy_spread:.4f})")