### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

### Loading Data

In [2]:
# Load the dataset
telecom_data = pd.read_csv("Churn-Data.csv")

In [3]:
telecom_data

Unnamed: 0,cID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,TV_Streaming,Movie_Streaming,Contract,PaperlessBilling,Method_Payment,Charges_Month,TotalCharges,Churn
0,4223-BKEOR,Female,0,No,Yes,21,Yes,No,DSL,Yes,...,Yes,No,No,Yes,One year,No,Mailed check,64.85,1336.8,No
1,6035-RIIOM,Female,0,No,No,54,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),97.20,5129.45,No
2,3797-VTIDR,Male,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,23.45,23.45,Yes
3,2568-BRGYX,Male,0,No,No,4,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.20,237.95,Yes
4,2775-SEFEE,Male,0,No,Yes,0,Yes,Yes,DSL,Yes,...,No,Yes,No,No,Two year,Yes,Bank transfer (automatic),61.90,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0684-AOSIH,Male,0,Yes,No,1,Yes,No,Fiber optic,Yes,...,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,95.00,95,Yes
5630,5982-PSMKW,Female,0,Yes,Yes,23,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),91.10,2198.3,No
5631,8044-BGWPI,Male,0,Yes,Yes,12,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Electronic check,21.15,306.05,No
5632,7450-NWRTR,Male,1,No,No,12,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.45,1200.15,Yes


### Data Exploration and Preprocessing and Handle missing values

In [4]:
telecom_data.fillna(0, inplace=True)  # Replace missing values with 0

In [5]:
telecom_data.isna().sum()

cID                 0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
TV_Streaming        0
Movie_Streaming     0
Contract            0
PaperlessBilling    0
Method_Payment      0
Charges_Month       0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
telecom_data

Unnamed: 0,cID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,TV_Streaming,Movie_Streaming,Contract,PaperlessBilling,Method_Payment,Charges_Month,TotalCharges,Churn
0,4223-BKEOR,Female,0,No,Yes,21,Yes,No,DSL,Yes,...,Yes,No,No,Yes,One year,No,Mailed check,64.85,1336.8,No
1,6035-RIIOM,Female,0,No,No,54,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),97.20,5129.45,No
2,3797-VTIDR,Male,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,23.45,23.45,Yes
3,2568-BRGYX,Male,0,No,No,4,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.20,237.95,Yes
4,2775-SEFEE,Male,0,No,Yes,0,Yes,Yes,DSL,Yes,...,No,Yes,No,No,Two year,Yes,Bank transfer (automatic),61.90,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0684-AOSIH,Male,0,Yes,No,1,Yes,No,Fiber optic,Yes,...,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,95.00,95,Yes
5630,5982-PSMKW,Female,0,Yes,Yes,23,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),91.10,2198.3,No
5631,8044-BGWPI,Male,0,Yes,Yes,12,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Electronic check,21.15,306.05,No
5632,7450-NWRTR,Male,1,No,No,12,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.45,1200.15,Yes


### Converting the Data

In [7]:
telecom_data['gender'] = telecom_data['gender'].apply(lambda x: 1 if x == 'Male' else (0 if x == 'Female' else x))
telecom_data['Partner'] = telecom_data['Partner'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['Churn'] = telecom_data['Churn'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['PaperlessBilling'] = telecom_data['PaperlessBilling'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['Movie_Streaming'] = telecom_data['Partner'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['TV_Streaming'] = telecom_data['PaperlessBilling'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['TechSupport'] = telecom_data['TechSupport'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['DeviceProtection'] = telecom_data['DeviceProtection'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['OnlineSecurity'] = telecom_data['OnlineSecurity'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['OnlineBackup'] = telecom_data['OnlineBackup'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['MultipleLines'] = telecom_data['MultipleLines'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['PhoneService'] = telecom_data['PhoneService'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['Dependents'] = telecom_data['Dependents'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
telecom_data['MultipleLines'] = telecom_data['MultipleLines'].apply(lambda x: 0 if x == 'No phone service' else x)
telecom_data['OnlineSecurity'] = telecom_data['OnlineSecurity'].apply(lambda x: 0 if x == 'No internet service' else x)
telecom_data['DeviceProtection'] = telecom_data['DeviceProtection'].apply(lambda x: 0 if x == 'No internet service' else x)
telecom_data['OnlineBackup'] = telecom_data['OnlineBackup'].apply(lambda x: 0 if x == 'No internet service' else x)
telecom_data['TechSupport'] = telecom_data['TechSupport'].apply(lambda x: 0 if x == 'No internet service' else x)
telecom_data['TV_Streaming'] = telecom_data['TV_Streaming'].apply(lambda x: 0 if x == 'No internet service' else x)
telecom_data['Movie_Streaming'] = telecom_data['Movie_Streaming'].apply(lambda x: 0 if x == 'No internet service' else x)

In [8]:
telecom_data

Unnamed: 0,cID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,TV_Streaming,Movie_Streaming,Contract,PaperlessBilling,Method_Payment,Charges_Month,TotalCharges,Churn
0,4223-BKEOR,0,0,0,1,21,1,0,DSL,1,...,1,0,0,0,One year,0,Mailed check,64.85,1336.8,0
1,6035-RIIOM,0,0,0,0,54,1,1,Fiber optic,0,...,0,0,1,0,Two year,1,Bank transfer (automatic),97.20,5129.45,0
2,3797-VTIDR,1,0,1,0,1,0,0,DSL,0,...,0,0,1,1,Month-to-month,1,Electronic check,23.45,23.45,1
3,2568-BRGYX,1,0,0,0,4,1,0,Fiber optic,0,...,0,0,1,0,Month-to-month,1,Electronic check,70.20,237.95,1
4,2775-SEFEE,1,0,0,1,0,1,1,DSL,1,...,0,1,1,0,Two year,1,Bank transfer (automatic),61.90,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0684-AOSIH,1,0,1,0,1,1,0,Fiber optic,1,...,0,0,1,1,Month-to-month,1,Electronic check,95.00,95,1
5630,5982-PSMKW,0,0,1,1,23,1,1,DSL,1,...,1,1,1,1,Two year,1,Credit card (automatic),91.10,2198.3,0
5631,8044-BGWPI,1,0,1,1,12,1,0,No,0,...,0,0,1,1,Month-to-month,1,Electronic check,21.15,306.05,0
5632,7450-NWRTR,1,1,0,0,12,1,1,Fiber optic,0,...,1,0,1,0,Month-to-month,1,Electronic check,99.45,1200.15,1


### OneHot Encoding

In [9]:
columns_to_encode = ["InternetService", "Contract", "Method_Payment"]
data_to_encode = telecom_data[columns_to_encode]

encoder = OneHotEncoder(sparse=False, drop='first')  # drop='first' to avoid multicollinearity
encoded_data = encoder.fit_transform(data_to_encode)

encoded_columns = encoder.get_feature_names_out(columns_to_encode)

encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)

telecom_data_encoded = pd.concat([telecom_data.drop(columns_to_encode, axis=1), encoded_df], axis=1)

# # Save the encoded data to a new CSV file
# telecom_data_encoded.to_csv("telecom_data_encoded.csv", index=False)



In [10]:
# Convert the column to numeric (if not already)
telecom_data_encoded['TotalCharges'] = pd.to_numeric(telecom_data_encoded['TotalCharges'], errors='coerce')

# Calculate the mean of the column with missing values
mean_value = telecom_data_encoded['TotalCharges'].mean()

# Fill missing values with the mean
telecom_data_encoded['TotalCharges'] = telecom_data_encoded['TotalCharges'].fillna(mean_value)

In [11]:
telecom_data_encoded

Unnamed: 0,cID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,...,Charges_Month,TotalCharges,Churn,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,Method_Payment_Credit card (automatic),Method_Payment_Electronic check,Method_Payment_Mailed check
0,4223-BKEOR,0,0,0,1,21,1,0,1,0,...,64.85,1336.800000,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,6035-RIIOM,0,0,0,0,54,1,1,0,1,...,97.20,5129.450000,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3797-VTIDR,1,0,1,0,1,0,0,0,0,...,23.45,23.450000,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2568-BRGYX,1,0,0,0,4,1,0,0,0,...,70.20,237.950000,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2775-SEFEE,1,0,0,1,0,1,1,1,1,...,61.90,2291.154605,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0684-AOSIH,1,0,1,0,1,1,0,1,0,...,95.00,95.000000,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5630,5982-PSMKW,0,0,1,1,23,1,1,1,1,...,91.10,2198.300000,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
5631,8044-BGWPI,1,0,1,1,12,1,0,0,0,...,21.15,306.050000,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5632,7450-NWRTR,1,1,0,0,12,1,1,0,0,...,99.45,1200.150000,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0


### Separate features and target variable

In [12]:
x = telecom_data_encoded.drop(columns=['cID','Churn'])
y = telecom_data_encoded['Churn']

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [14]:
x

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,PaperlessBilling,Charges_Month,TotalCharges,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,Method_Payment_Credit card (automatic),Method_Payment_Electronic check,Method_Payment_Mailed check
0,0,0,0,1,21,1,0,1,0,1,...,0,64.85,1336.800000,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0,0,0,0,54,1,1,0,1,0,...,1,97.20,5129.450000,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,0,1,0,1,0,0,0,0,0,...,1,23.45,23.450000,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,0,0,0,4,1,0,0,0,0,...,1,70.20,237.950000,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,0,0,1,0,1,1,1,1,0,...,1,61.90,2291.154605,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,1,0,1,0,1,1,0,1,0,0,...,1,95.00,95.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5630,0,0,1,1,23,1,1,1,1,1,...,1,91.10,2198.300000,0.0,0.0,0.0,1.0,1.0,0.0,0.0
5631,1,0,1,1,12,1,0,0,0,0,...,1,21.15,306.050000,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5632,1,1,0,0,12,1,1,0,0,1,...,1,99.45,1200.150000,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
y

0       0
1       0
2       1
3       1
4       0
       ..
5629    1
5630    0
5631    0
5632    1
5633    0
Name: Churn, Length: 5634, dtype: int64

### Modeling

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,
                           n_jobs=-1)
grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred_tuned = best_model.predict(x_test)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
f1_tuned = f1_score(y_test, y_pred_tuned)

print("\nBest Parameters:", best_params)
print("Tuned Accuracy:", accuracy_tuned)
print("Tuned F1 Score:", f1_tuned)

Accuracy: 0.7861579414374446
F1 Score: 0.5426944971537001

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Tuned Accuracy: 0.8021295474711624
Tuned F1 Score: 0.5736137667304015
