In [9]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_curve, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold


# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'data-445-wagner'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "telecom_churn.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [10]:
churn_data['Churn'].value_counts()/ churn_data.shape[0]

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [16]:
#defining variables
X = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
Y = churn_data['Churn']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.2, stratify = Y)

In [19]:
RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

RF_pred = RF_md.predict_proba(X_test)[:, 1]

#computing ROC curve
fpr, tpr, threshold = roc_curve(Y_test, RF_pred)

#creating data frame
cutoffs = pd.DataFrame({'False_Positive': fpr, 'True_Positive': tpr, 'Cutoff': threshold})
cutoffs.head(10)


cutoffs['True_Positive_Minus_1']= cutoffs['True_Positive']-1
cutoffs['Euclidean_dist'] = np.sqrt(cutoffs['False_Positive']**2 + cutoffs['True_Positive_Minus_1']**2)


cutoffs = cutoffs.sort_values(by = 'Euclidean_dist').reset_index(drop = True)
cutoffs.head(10)

#Changing likelyhoods to lables
RF_pred = np.where(RF_pred < cutoffs['Cutoff'][0],0, 1)

print(classification_report(Y_test, RF_pred))

              precision    recall  f1-score   support

           0       0.98      0.83      0.90       570
           1       0.47      0.89      0.61        97

    accuracy                           0.84       667
   macro avg       0.72      0.86      0.76       667
weighted avg       0.90      0.84      0.86       667



In [21]:
AD_md = AdaBoostClassifier(n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)

AD_pred = AD_md.predict_proba(X_test)[:, 1]

#computing ROC curve
fpr, tpr, threshold = roc_curve(Y_test, AD_pred)

#creating data frame
cutoffs = pd.DataFrame({'False_Positive': fpr, 'True_Positive': tpr, 'Cutoff': threshold})
cutoffs.head(10)


cutoffs['True_Positive_Minus_1']= cutoffs['True_Positive']-1
cutoffs['Euclidean_dist'] = np.sqrt(cutoffs['False_Positive']**2 + cutoffs['True_Positive_Minus_1']**2)


cutoffs = cutoffs.sort_values(by = 'Euclidean_dist').reset_index(drop = True)
cutoffs.head(10)

#Changing likelyhoods to lables
AD_pred = np.where(AD_pred < cutoffs['Cutoff'][0],0, 1)

print(classification_report(Y_test, AD_pred))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90       570
           1       0.48      0.86      0.61        97

    accuracy                           0.84       667
   macro avg       0.73      0.85      0.76       667
weighted avg       0.90      0.84      0.86       667



In [None]:
##The two models are very similar but the random forrest has a slightly higher recall score, therefore it is the model I would choose
#to predict customer churn rate