In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score , recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

In [126]:
# load csv file
df=pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [127]:
# print information about the data contained in the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [128]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [129]:
# Convert TotalCharges column to float data type
df['TotalCharges'] = df['TotalCharges'].str.strip().replace('', np.nan)
df.dropna(subset=['TotalCharges'], inplace=True)
df['TotalCharges'] = df['TotalCharges'].astype(np.float64)


In [131]:
# Check if there are any missing values
df['TotalCharges'].isna().sum()

0

In [132]:
# Function to print unique values for non numeric columns
def print_unique_values(dfx) :
  D=[]
  dfx=dfx.drop(columns=['customerID','MonthlyCharges','TotalCharges','tenure'],inplace=False)
  for column in dfx.columns :
      unique_values=dfx[column].unique().tolist()
      D.append({column:unique_values})
  return D

D=print_unique_values(df)
D

[{'gender': ['Female', 'Male']},
 {'SeniorCitizen': [0, 1]},
 {'Partner': ['Yes', 'No']},
 {'Dependents': ['No', 'Yes']},
 {'PhoneService': ['No', 'Yes']},
 {'MultipleLines': ['No phone service', 'No', 'Yes']},
 {'InternetService': ['DSL', 'Fiber optic', 'No']},
 {'OnlineSecurity': ['No', 'Yes', 'No internet service']},
 {'OnlineBackup': ['Yes', 'No', 'No internet service']},
 {'DeviceProtection': ['No', 'Yes', 'No internet service']},
 {'TechSupport': ['No', 'Yes', 'No internet service']},
 {'StreamingTV': ['No', 'Yes', 'No internet service']},
 {'StreamingMovies': ['No', 'Yes', 'No internet service']},
 {'Contract': ['Month-to-month', 'One year', 'Two year']},
 {'PaperlessBilling': ['Yes', 'No']},
 {'PaymentMethod': ['Electronic check',
   'Mailed check',
   'Bank transfer (automatic)',
   'Credit card (automatic)']},
 {'Churn': ['No', 'Yes']}]

In [133]:
# Function to convert categorical columns with object data type to numeric representation
def mapping_columns_values(dataframe):
  dfx=dataframe
  mapping_yes_no={"No":0,"Yes":1}
  l=["No phone service","No internet service"]
  payement_methode_mapping={'Electronic check':1,'Mailed check':2,'Bank transfer (automatic)':3,'Credit card (automatic)':4}
  mapping_contract={'Month-to-month':1, 'One year':2, 'Two year':3}
  columns_with_more=['MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
  yes_no_columns=['PaperlessBilling','PhoneService','Partner','Dependents','Churn']
  for column in columns_with_more :
    for element in l:
      if element in dfx[column].unique().tolist() :
        dfx[column]=dfx[column].map({element:'No','No':'No','Yes':'Yes'})
        dfx[column]=dfx[column].map(mapping_yes_no)
  for column in yes_no_columns :
    dfx[column]=dfx[column].map(mapping_yes_no)
  dfx['PaymentMethod']=dfx['PaymentMethod'].map(payement_methode_mapping)
  dfx['gender']=dfx['gender'].map({'Female':0, 'Male':1})
  dfx['Contract']=dfx['Contract'].map(mapping_contract)
  dfx['InternetService']=dfx['InternetService'].map({'No':0,'Fiber optic':1,'DSL':2})
  return dfx

df1=mapping_columns_values(df)
df1.info()


<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   int64  
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   int64  
 4   Dependents        7032 non-null   int64  
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   int64  
 7   MultipleLines     7032 non-null   int64  
 8   InternetService   7032 non-null   int64  
 9   OnlineSecurity    7032 non-null   int64  
 10  OnlineBackup      7032 non-null   int64  
 11  DeviceProtection  7032 non-null   int64  
 12  TechSupport       7032 non-null   int64  
 13  StreamingTV       7032 non-null   int64  
 14  StreamingMovies   7032 non-null   int64  
 15  Contract          7032 non-null   int64  
 16  PaperlessBilling  7032 non-null   int64  
 17  

In [134]:
df1['InternetService'].value_counts()

Unnamed: 0_level_0,count
InternetService,Unnamed: 1_level_1
1,3096
2,2416
0,1520


In [135]:
# Check if there are any missing values among columns
df1.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [136]:
# check data type after transformation
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   int64  
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   int64  
 4   Dependents        7032 non-null   int64  
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   int64  
 7   MultipleLines     7032 non-null   int64  
 8   InternetService   7032 non-null   int64  
 9   OnlineSecurity    7032 non-null   int64  
 10  OnlineBackup      7032 non-null   int64  
 11  DeviceProtection  7032 non-null   int64  
 12  TechSupport       7032 non-null   int64  
 13  StreamingTV       7032 non-null   int64  
 14  StreamingMovies   7032 non-null   int64  
 15  Contract          7032 non-null   int64  
 16  PaperlessBilling  7032 non-null   int64  
 17  

In [146]:
# Train xgboost and randomforest models to predict churn and evaluate them
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Resolve unbalanced classes problem
minority_class= df1[df1['Churn']==1]
majority_class= df1[df1['Churn']==0]

majority_class_sampled = majority_class.sample(n=len(minority_class), random_state=42)
df1sampled = pd.concat([minority_class, majority_class_sampled])
df1sampled = df1sampled.sample(frac=1, random_state=42).reset_index(drop=True)

x = df1sampled.drop(columns=['Churn','customerID'])
y = df1sampled['Churn']
x_train ,x_test, y_train ,y_test = train_test_split(x, y, test_size=0.2)

#Training
rdf= RandomForestClassifier(n_estimators=85)
xgb= XGBClassifier()

rdf.fit(x_train,y_train)
xgb.fit(x_train,y_train)

#evaluation :
y_pred_rdf = rdf.predict(x_test)
y_pred_xgb = xgb.predict(x_test)

accuracy_rdf=accuracy_score(y_pred_rdf,y_test)
accuracy_xgb=accuracy_score(y_pred_xgb,y_test)

recall_rdf= recall_score(y_pred_rdf,y_test)
recall_xgb= recall_score(y_pred_xgb,y_test)

print('*************  Evaluation metrics *************')
print('RandomForest --- Accuracy : ',accuracy_rdf,' recall : ',recall_rdf)
print('XGBoost --- Accuracy : ',accuracy_xgb,' recall : ',recall_xgb)


*************  Evaluation metrics *************
RandomForest --- Accuracy :  0.75  recall :  0.7493472584856397
XGBoost --- Accuracy :  0.7098930481283422  recall :  0.7080103359173127


In [148]:
print(classification_report(y_pred_rdf,y_test))
print('****************************************************')
print(classification_report(y_pred_xgb,y_test))

              precision    recall  f1-score   support

           0       0.74      0.75      0.75       365
           1       0.76      0.75      0.75       383

    accuracy                           0.75       748
   macro avg       0.75      0.75      0.75       748
weighted avg       0.75      0.75      0.75       748

****************************************************
              precision    recall  f1-score   support

           0       0.69      0.71      0.70       361
           1       0.72      0.71      0.72       387

    accuracy                           0.71       748
   macro avg       0.71      0.71      0.71       748
weighted avg       0.71      0.71      0.71       748

