<a href="https://colab.research.google.com/github/Blessing-nwachukwu1/Hamoye-C/blob/main/BlessingHamoye_C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Load the Telco customer-churn CSV from the Colab working directory
csv_path = "/content/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


# **Data Preprocessing**

In [None]:
# Parse 'TotalCharges' as numbers; entries that cannot be parsed become NaN
# because of errors='coerce'
raw_total_charges = df['TotalCharges']
df['TotalCharges'] = pd.to_numeric(raw_total_charges, errors='coerce')

In [None]:
# Fill NaN values in 'TotalCharges' (e.g. those introduced by errors='coerce') with 0.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['TotalCharges']: that form is chained
# assignment, which raises a FutureWarning on pandas 2.x and leaves `df`
# unmodified once Copy-on-Write is enabled.
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [None]:
# Encode the 'Churn' target as integers: 'No' -> 0, 'Yes' -> 1.
# NOTE: Series.map turns any value that is not a key of the mapping into NaN,
# so running this cell a second time on the already-encoded column would
# replace every label with NaN.
churn_codes = {'No': 0, 'Yes': 1}
df['Churn'] = df['Churn'].map(churn_codes)

In [None]:
# Feature columns, grouped by how they are preprocessed later on.
# 'SeniorCitizen' is stored as int64 but is treated as a categorical flag
# and one-hot encoded together with the string-valued columns.
categorical = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]
# Continuous columns that get standardized
numerical = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]


# **Feature Engineering**

In [None]:
# Full feature list: categorical columns first, numerical columns after
features = [*categorical, *numerical]

In [None]:
# Target vector: the encoded 'Churn' column
y = df['Churn']
# Feature matrix: only the selected feature columns
X = df[features]

In [None]:
# Hold out 20% of the rows for testing (80-20 split); random_state=1 makes
# the shuffle reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Initialize the StandardScaler used to standardize the numerical features;
# it is created unfitted here and fitted on the training split afterwards
scaler = StandardScaler()


In [None]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test-set statistics are used
train_numeric = X_train[numerical]
test_numeric = X_test[numerical]
X_train_numerical_scaled = scaler.fit(train_numeric).transform(train_numeric)
X_test_numerical_scaled = scaler.transform(test_numeric)


In [None]:
# Wrap the scaled arrays in DataFrames again, restoring the numerical column
# names and the row index of the split each array came from
X_train_numerical_scaled = pd.DataFrame(
    data=X_train_numerical_scaled,
    index=X_train.index,
    columns=numerical,
)
X_test_numerical_scaled = pd.DataFrame(
    data=X_test_numerical_scaled,
    index=X_test.index,
    columns=numerical,
)


In [None]:
# Initialize the OneHotEncoder.
# sparse_output=False returns a dense array that can be wrapped in a DataFrame.
# handle_unknown='ignore' encodes a category that was not seen while fitting
# as an all-zero group instead of raising at transform time, so a level that
# happens to fall only into the test split does not break the pipeline.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [None]:
# Learn the category levels from the training split, then encode both splits
# with that same fitted encoder
train_categorical = X_train[categorical]
test_categorical = X_test[categorical]
X_train_categorical_encoded = encoder.fit_transform(train_categorical)
X_test_categorical_encoded = encoder.transform(test_categorical)

In [None]:
# Turn the encoded arrays back into DataFrames, labelled with the encoder's
# generated output column names and the row index of the originating split
encoded_categorical_columns = encoder.get_feature_names_out(categorical)
X_train_categorical_encoded = pd.DataFrame(
    data=X_train_categorical_encoded,
    index=X_train.index,
    columns=encoded_categorical_columns,
)
X_test_categorical_encoded = pd.DataFrame(
    data=X_test_categorical_encoded,
    index=X_test.index,
    columns=encoded_categorical_columns,
)


In [None]:
# Build the final design matrices: scaled numerical columns first, one-hot
# encoded categorical columns after, aligned on the shared row index
train_parts = [X_train_numerical_scaled, X_train_categorical_encoded]
test_parts = [X_test_numerical_scaled, X_test_categorical_encoded]
X_train_prepared = pd.concat(train_parts, axis="columns")
X_test_prepared = pd.concat(test_parts, axis="columns")


In [None]:
# Initialize the models with random_state=1 so every run is reproducible
rf_model = RandomForestClassifier(random_state=1)
et_model = ExtraTreesClassifier(random_state=1)
# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# parameter that current releases ignore (emitting a "parameters ... are not
# used" warning); the target is already encoded as 0/1, so no label encoding
# is required.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=1)
lgb_model = LGBMClassifier(random_state=1)


In [None]:
# Train every model on the prepared training data, in the order
# Random Forest -> Extra Trees -> XGBoost -> LightGBM
for estimator in (rf_model, et_model, xgb_model):
    estimator.fit(X_train_prepared, y_train)
# Kept as the cell's final expression so the cell's displayed result is unchanged
lgb_model.fit(X_train_prepared, y_train)


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001022 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [None]:
# Predict class labels for the held-out test set with each fitted model
fitted_models = (rf_model, et_model, xgb_model, lgb_model)
rf_pred, et_pred, xgb_pred, lgb_pred = (
    fitted.predict(X_test_prepared) for fitted in fitted_models
)


In [None]:
# Evaluate the Random Forest on the test set: overall accuracy, the per-class
# precision/recall/F1 report, and the confusion matrix
print("Random Forest Classifier")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=rf_pred))
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, rf_pred))

Random Forest Classifier
Accuracy: 0.7913413768630234
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1061
           1       0.58      0.53      0.56       348

    accuracy                           0.79      1409
   macro avg       0.72      0.71      0.71      1409
weighted avg       0.79      0.79      0.79      1409

[[929 132]
 [162 186]]


In [None]:
# Evaluate XGBoost on the test set: accuracy, classification report,
# confusion matrix
print("\nXGBoost Classifier")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=xgb_pred))
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, xgb_pred))


XGBoost Classifier
Accuracy: 0.7934705464868701
              precision    recall  f1-score   support

           0       0.86      0.87      0.86      1061
           1       0.59      0.56      0.57       348

    accuracy                           0.79      1409
   macro avg       0.72      0.71      0.72      1409
weighted avg       0.79      0.79      0.79      1409

[[924 137]
 [154 194]]


In [None]:
# Evaluate Extra Trees on the test set: accuracy, classification report,
# confusion matrix
print("\nExtra Trees Classifier")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=et_pred))
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, et_pred))



Extra Trees Classifier
Accuracy: 0.7672107877927609
              precision    recall  f1-score   support

           0       0.83      0.86      0.85      1061
           1       0.53      0.47      0.50       348

    accuracy                           0.77      1409
   macro avg       0.68      0.67      0.67      1409
weighted avg       0.76      0.77      0.76      1409

[[916 145]
 [183 165]]


In [None]:
# Evaluate LightGBM on the test set: accuracy, classification report,
# confusion matrix
print("\nLightGBM Classifier")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=lgb_pred))
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, lgb_pred))


LightGBM Classifier
Accuracy: 0.8034066713981547
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      1061
           1       0.61      0.59      0.60       348

    accuracy                           0.80      1409
   macro avg       0.74      0.73      0.73      1409
weighted avg       0.80      0.80      0.80      1409

[[928 133]
 [144 204]]
