In [1]:
import pandas as pd
import numpy as np
from google.colab import files

uploads = files.upload()

Saving WA_Fn-UseC_-Telco-Customer-Churn.csv to WA_Fn-UseC_-Telco-Customer-Churn.csv


In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# chec for missing observation
df.isnull().sum()


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Our dataset has no missing observations

In [4]:
# df dimension
df.shape


(7043, 21)

In [5]:
# descriptive statistics
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SeniorCitizen,7043.0,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
tenure,7043.0,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
MonthlyCharges,7043.0,64.761692,30.090047,18.25,35.5,70.35,89.85,118.75


# Data preprocessing

In [6]:
from sklearn.preprocessing import LabelEncoder


# Identify the categorical variables
categorical_vars = df.select_dtypes(include=['object']).columns

# Label encode the categorical variables
le = LabelEncoder()
df[categorical_vars] = df[categorical_vars].apply(lambda col: le.fit_transform(col))

# Print the transformed dataframe
print(df.head())


   customerID  gender  SeniorCitizen  Partner  Dependents  tenure  \
0        5375       0              0        1           0       1   
1        3962       1              0        0           0      34   
2        2564       1              0        0           0       2   
3        5535       1              0        0           0      45   
4        6511       0              0        0           0       2   

   PhoneService  MultipleLines  InternetService  OnlineSecurity  ...  \
0             0              1                0               0  ...   
1             1              0                0               2  ...   
2             1              0                0               2  ...   
3             0              1                0               2  ...   
4             1              0                1               0  ...   

   DeviceProtection  TechSupport  StreamingTV  StreamingMovies  Contract  \
0                 0            0            0                0         0   


In [7]:
# Export the encoded dataframe to a CSV file
df.to_csv('clean_churn.csv', index=False)

In [8]:
from google.colab import files
files.download('clean_churn.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Modeling

In [9]:
# import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, accuracy_score

# separate the features (X) and target variable (y)
X = df.drop('Churn', axis=1)
y = df['Churn']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# instantiate a random forest classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# fit the model on the training data
rfc.fit(X_train, y_train)

# make predictions on the testing data
y_pred = rfc.predict(X_test)

# calculate evaluation metrics
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# display the evaluation metrics in a table
print('Evaluation Metrics')
print('------------------')
print(f'F1 Score: {f1:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Accuracy: {accuracy:.3f}')

# plot the confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion Matrix')
print('----------------')
print(conf_mat)


Evaluation Metrics
------------------
F1 Score: 0.557
Precision: 0.659
Accuracy: 0.797
Confusion Matrix
----------------
[[943  93]
 [193 180]]


The evaluation metrics of the random forest model for predicting churn are presented in a table. The F1 score, which is the harmonic mean of precision and recall, is 0.573. The precision, which is the ratio of true positive predictions to the total positive predictions, is 0.668. The accuracy, which is the ratio of correct predictions to the total predictions, is 0.802.

The confusion matrix shows the number of correct and incorrect predictions. It has two rows and two columns. The first row corresponds to the actual negative class, which means customers who did not churn, while the second row corresponds to the actual positive class, which means customers who churned. The first column corresponds to the predicted negative class, while the second column corresponds to the predicted positive class.

According to the confusion matrix, out of 1412 total customers, 943 were correctly predicted as not churned (true negatives), while 186 were incorrectly predicted as churned (false positives). On the other hand, out of the 373 total customers who churned, 93 were incorrectly predicted as not churned (false negatives), while 187 were correctly predicted as churned (true positives).