<a href="https://colab.research.google.com/github/Dav7d-007/Hamoye-/blob/main/Customer_Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Hamoye Stage C: Telco Customer Churn**

In [None]:
#Import Python Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from xgboost import XGBClassifier

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

%config InlineBackend.figure_format = 'retina'

# to display all columns and rows:
pd.set_option('display.max_columns', None); pd.set_option('display.max_rows', None);

In [None]:
# Load the Telco churn dataset; the first CSV column (customerID) becomes the row index.
DATA_PATH = "TelcoCustomerChurn.csv"
df = pd.read_csv(DATA_PATH, index_col=0)

# Preview the first five observations
df.head()

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Dimensions of the dataset as (number of rows, number of columns)
df.shape

(7043, 20)

In [None]:
# Per-column overview: column name, non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 7590-VHVEG to 3186-AJIEK
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null 

In [None]:
# Summary statistics for the numeric columns.
# NOTE(review): TotalCharges is absent from the recorded summary, presumably because
# it is still stored as text at this point (it is converted in the preprocessing step).
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


**PREPROCESSING**

In [None]:
# Convert 'TotalCharges' to a numeric dtype. Values that cannot be parsed as numbers
# are coerced to NaN and then replaced with 0.
# The result is assigned back to the column instead of calling
# `df['TotalCharges'].fillna(0, inplace=True)`: an in-place call on a column selection
# is a chained assignment, which is deprecated and does not update `df` when pandas
# Copy-on-Write is active.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

# Check for remaining missing values (every count should be 0)
print(df.isnull().sum())

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [None]:
# Encode the target column 'Churn' as integers: 'No' -> 0, 'Yes' -> 1.
# The encoder is fitted on the known labels rather than on the column itself, so
# `le.classes_` always holds the original labels, even when this cell is re-run.
le = LabelEncoder().fit(['No', 'Yes'])

# Encode only while the column still holds the raw labels. This keeps the cell
# idempotent: re-running it after 'Churn' is already numeric leaves the data untouched.
# An unexpected label (anything other than 'No'/'Yes') raises a ValueError here.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = le.transform(df['Churn'])

# Show the label -> code mapping (a label's code is its position in `le.classes_`)
print({str(label): int(code) for code, label in enumerate(le.classes_)})

[0 1]


In [None]:
# Separate the target column ('Churn') from the predictor columns
y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for testing; random_state=1 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Categorical variables: every column except the target 'Churn' that is either
# stored as text (object dtype) or has at most 11 distinct values.
# The dtype is checked with an equality test on `df[col].dtype`, and the target is
# excluded with `!=`; using `in` against a string would perform a substring test on
# the column *name*. The parentheses make the or/and grouping explicit.
categorical_variables = [
    col
    for col in df.columns
    if col != "Churn"
    and (df[col].dtype == "object" or df[col].nunique() <= 11)
]

categorical_variables

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [None]:
# Numeric variables: non-text columns with more than 11 distinct values.
# The customer identifier is excluded by an exact, case-insensitive name comparison
# (the CSV spells it 'customerID'); `col not in "CustomerId"` would only be a
# substring test against that literal and never match the real column name.
numeric_variables = [
    col
    for col in df.columns
    if df[col].dtype != "object"
    and df[col].nunique() > 11
    and col.lower() != "customerid"
]
numeric_variables

['tenure', 'MonthlyCharges', 'TotalCharges']

In [None]:
# Standardise the numeric features. The scaler learns its statistics from the
# training split only and is then applied unchanged to the test split.

# Numeric slice of the training data
X_train_numerical = X_train[numeric_variables]

# Fit on the training data
scaler = StandardScaler()
scaler.fit(X_train_numerical)

# Transform both splits. The resulting frames are built from plain arrays, so they
# carry a fresh 0..n-1 index (rows stay in the same order as X_train / X_test).
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train_numerical),
    columns=numeric_variables,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[numeric_variables]),
    columns=numeric_variables,
)

In [None]:
# One-hot encode the categorical features
from sklearn.preprocessing import OneHotEncoder

# `sparse_output=False` makes the encoder return a dense array. It replaces the old
# `sparse` argument, which was deprecated in scikit-learn 1.2 and removed in 1.4.
encoder = OneHotEncoder(sparse_output=False)

# Get only the categorical features
X_train_categorical = X_train[categorical_variables]
X_test_categorical = X_test[categorical_variables]

# Fit the encoder on the training data only, so the set of categories is learned
# without looking at the test split
encoder.fit(X_train_categorical)

# Names of the generated indicator columns
encoded_feature_names = encoder.get_feature_names_out(categorical_variables)

# Encode both splits and label the columns. The frames are built from plain arrays,
# so they carry a fresh 0..n-1 index (rows stay in the same order as X_train / X_test).
X_train_encoded = pd.DataFrame(
    encoder.transform(X_train_categorical),
    columns=encoded_feature_names,
)
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test_categorical),
    columns=encoded_feature_names,
)

In [None]:
# Assemble the final design matrices: scaled numeric columns first, followed by the
# one-hot encoded categorical columns, placed side by side.
train_parts = [X_train_scaled, X_train_encoded]
test_parts = [X_test_scaled, X_test_encoded]

X_train_final = pd.concat(train_parts, axis=1)
X_test_final = pd.concat(test_parts, axis=1)

In [None]:
# Random Forest and Extra Trees classifiers

# Hyperparameters shared by both ensembles. They are passed explicitly to the
# estimators below so that changing them here actually takes effect.
n_estimators = 100  # Number of trees in each ensemble
max_depth = None  # None lets every tree grow to full depth (can be adjusted)

# Create Random Forest Classifier (random_state fixes the seed for reproducibility)
clf_rf = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=1,
)

# Create Extra Trees Classifier with the same settings
clf_et = ExtraTreesClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=1,
)

# Train the models on the prepared data
clf_rf.fit(X_train_final, y_train)
clf_et.fit(X_train_final, y_train)

In [None]:
# Gradient-boosted models (XGBoost and LightGBM) and hold-out accuracy of all models
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define and train the XGBoost classifier (binary objective, fixed seed)
xgb_model = XGBClassifier(objective='binary:logistic', random_state=1)
xgb_model.fit(X_train_final, y_train)

# Define and train LightGBM through its native Booster API.
# The validation Dataset is given the true test labels so that the 'binary_logloss'
# metric is computed against real targets; it is used for monitoring only (no early
# stopping is configured).
lgb_train = lgb.Dataset(X_train_final, label=y_train)
lgb_eval = lgb.Dataset(X_test_final, label=y_test, reference=lgb_train)
params = {'objective': 'binary', 'metric': 'binary_logloss', 'random_seed': 1}
lgb_model = lgb.train(params, lgb_train, valid_sets=[lgb_eval])

# Make predictions on the testing set for all models
y_pred_rfc = clf_rf.predict(X_test_final)
y_pred_etc = clf_et.predict(X_test_final)
y_pred_xgb = xgb_model.predict(X_test_final)

# Unlike the scikit-learn style estimators above, Booster.predict returns the predicted
# probability of the positive class for a binary objective, not a class label.
# Threshold at 0.5 to obtain 0/1 labels that classification metrics can consume.
y_prob_lgb = lgb_model.predict(X_test_final)
y_pred_lgb = (y_prob_lgb >= 0.5).astype(int)

# Evaluate model performance on the held-out test set
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rfc))
print("Extra Trees Accuracy:", accuracy_score(y_test, y_pred_etc))
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("LightGBM Accuracy:", accuracy_score(y_test, y_pred_lgb))

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Random Forest Accuracy: 0.7913413768630234
Extra Trees Accuracy: 0.7672107877927609
XGBoost Accuracy: 0.7934705464868701


In [None]:
#print("LightGBM Accuracy:", accuracy_score(y_test, y_pred_lgb))