In [2]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.0-py3-none-any.whl (199 kB)
     |████████████████████████████████| 199 kB 14.4 MB/s            
  Downloading imbalanced_learn-0.8.1-py3-none-any.whl (189 kB)
     |████████████████████████████████| 189 kB 78.4 MB/s            
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.8.1 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, classification_report

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-445-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'telecom_churn.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## 1. Using the pandas library to read the csv data file and create a data-frame called churn_data

churn_data = pd.read_csv(file_content_stream)

churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [6]:
## 2. Using AccountWeeks, ContractRenewal, CustServCalls, MonthlyCharge, and DayMins as the predictor variables, 
## and Churn as the target variable, splitting the data into two data-frames (taking into account the proportion 
## of 0s and 1s): train (80%) and test (20%).


## Defining the input and target variables
X = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
Y = churn_data['Churn']

## Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [7]:
## 3. Creating a synthetic training dataset, called them X_over and Y_over by running over-sampling on the train dataset

X_over, Y_over = RandomOverSampler().fit_resample(X_train, Y_train)

In [8]:
## 4. Using X_over and Y_over datasets to build a random forest classification model with 500 trees
## and the maximum depth of each tree equal to 3

## Building the model
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_over, Y_over)

## Predicting on the test set
rf_preds = rf_md.predict_proba(X_test)[:, 1]

## Estimate the cutoff value with the ROC-curve
fpr, tpr, threshold = roc_curve(Y_test, rf_preds)

## Creating a data frame to store ROC-Curve results
cutoffs = pd.DataFrame({'False_Positive': fpr, 'True_Positive': tpr, 'Cutoff': threshold})

## Calculating the Euclidean distance between each point and our optimal model (fpr = 0, tpr = 1)
cutoffs['True_Positive_Minus_1'] = cutoffs['True_Positive'] - 1
cutoffs['Distance'] = np.sqrt(cutoffs['False_Positive']**2 + cutoffs['True_Positive_Minus_1']**2)

## Sorting the data frame based on Euclidean distance
cutoffs = cutoffs.sort_values('Distance', ascending = True).reset_index(drop = True)

## Changing likelihoods to labels
rf_preds = np.where(rf_preds < cutoffs['Cutoff'][0], 0, 1)

## Using the optimal cutoff value to create the classification report
print(classification_report(Y_test, rf_preds))

              precision    recall  f1-score   support

           0       0.95      0.84      0.89       570
           1       0.44      0.75      0.56        97

    accuracy                           0.82       667
   macro avg       0.70      0.79      0.72       667
weighted avg       0.88      0.82      0.84       667



In [10]:
## 5. Using X_over and Y_over datasets to build a ada-boost classification model with 500 trees,
## the maximum depth of each tree equal to 3, and learning rate equal to 0.01

## Building the model
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), 
                            n_estimators = 500, learning_rate = 0.01).fit(X_over, Y_over)

## Predicting on the test set
ada_preds = ada_md.predict_proba(X_test)[:, 1]

## Estimate the cutoff value with the ROC-curve
fpr, tpr, threshold = roc_curve(Y_test, ada_preds)

## Creating a data frame to store ROC-Curve results
cutoffs = pd.DataFrame({'False_Positive': fpr, 'True_Positive': tpr, 'Cutoff': threshold})

## Calculating the Euclidean distance between each point and our optimal model (0,1)
cutoffs['True_Positive_Minus_1'] = cutoffs['True_Positive'] - 1
cutoffs['Distance'] = np.sqrt(cutoffs['False_Positive']**2 + cutoffs['True_Positive_Minus_1']**2)

## Sorting the data frame based on Euclidean distance
cutoffs = cutoffs.sort_values('Distance', ascending = True).reset_index(drop = True)

## Changing likelihoods to labels
ada_preds_label = np.where(ada_preds < cutoffs['Cutoff'][0], 0, 1)

## Using the optimal cutoff value to create the classification report
print(classification_report(Y_test, ada_preds_label))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       570
           1       0.54      0.74      0.62        97

    accuracy                           0.87       667
   macro avg       0.75      0.82      0.77       667
weighted avg       0.89      0.87      0.88       667



In [None]:
## 6. Using the results from part 4 and 5, we would use the AdaBoost Classification model to 
## predict customer churn because it shows higher precision testing scores for the minority class
## and right around the same recall testing score.