In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, recall_score,\
accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import prepare

In [2]:
# Load the dataset through the project-local `prepare` module.
# NOTE(review): the recorded run printed "opening data from file", so this presumably
# reads a locally cached copy -- confirm in prepare.py.
df = prepare.wrangle()
# Partition the data into train / validate / test frames.
# NOTE(review): split proportions, seeding and any stratification are decided inside
# prepare.split_data and are not visible here -- confirm there.
train, validate, test = prepare.split_data(df)

opening data from file


In [3]:
# Class balance of the target in the training split (0 = did not churn, 1 = churned)
train['churn'].value_counts()

0    2897
1    1046
Name: churn, dtype: int64

Our baseline prediction will be that a customer will not churn (churn = 0), because not churning is the majority class in the training data (2,897 of 3,943 customers).

In [4]:
# Baseline "model": predict 0 (no churn) once for every row of the training split
baseline = [0 for _ in range(len(train))]

In [5]:
# Sanity check: there is exactly one baseline prediction per training row
len(train) == len(baseline)

True

In [6]:
# Fraction of training rows whose actual churn label equals the all-zero baseline
baseline_accuracy = (train['churn'] == baseline).mean()
print(f'The baseline prediction score accuracy is: {baseline_accuracy:.2%}')

The baseline prediction score accuracy is: 73.47%


In [8]:
# Show every column available in the training split, to pick model features from
train.columns

Index(['customer_id', 'gender', 'senior_citizen', 'partner', 'dependents',
       'tenure', 'phone_service', 'multiple_lines', 'online_security',
       'online_backup', 'device_protection', 'tech_support', 'streaming_tv',
       'streaming_movies', 'paperless_billing', 'monthly_charges',
       'total_charges', 'churn', 'contract_type', 'internet_service_type',
       'payment_type', 'gender_male', 'multiple_lines_no_phone_service',
       'multiple_lines_yes', 'online_security_no_internet_service',
       'online_security_yes', 'online_backup_no_internet_service',
       'online_backup_yes', 'device_protection_no_internet_service',
       'device_protection_yes', 'tech_support_no_internet_service',
       'tech_support_yes', 'streaming_tv_no_internet_service',
       'streaming_tv_yes', 'streaming_movies_no_internet_service',
       'streaming_movies_yes', 'contract_type_one_year',
       'contract_type_two_year', 'internet_service_type_fiber_optic',
       'internet_service_type_non

In [9]:
# feature columns fed to every ML model below (these are the model inputs,
# not hyperparameters); the order here is the column order the models see
X_cols = [
    'multiple_lines_no_phone_service',
    'dependents',
    'multiple_lines_yes',
    'online_security_yes',
    'online_backup_yes',
    'device_protection_yes',
    'tech_support_yes',
    'contract_type_one_year',
    'contract_type_two_year',
    'internet_service_type_fiber_optic',
    'internet_service_type_none',
    'payment_type_credit_card_automatic',
    'payment_type_electronic_check',
    'payment_type_mailed_check',
]
# name of the column the ML models are trained to predict
y_col = 'churn'

In [10]:
def model_prep(train, validate, test, features=None, target=None):
    '''
    This function will take in train, validate and test DataFrames and return 
    the feature tables and target columns to be used with the ML models.

    Parameters
    ----------
    train, validate, test : DataFrames that each contain the feature columns
        and the target column.
    features : optional iterable of column names to use as model inputs.
        Defaults to the notebook-level X_cols list.
    target : optional name of the column to predict.
        Defaults to the notebook-level y_col.

    Returns
    -------
    train_X, validate_X, test_X, train_y, validate_y, test_y -- in that order.

    Raises
    ------
    KeyError (from pandas) if a requested column is missing from a split.
    '''
    # fall back to the notebook-level selections so existing calls keep working
    if features is None:
        features = X_cols
    if target is None:
        target = y_col
    # pandas treats a tuple as a single key, so always index with a real list
    features = list(features)
    splits = (train, validate, test)
    # create new dataframes with the desired feature columns
    train_X, validate_X, test_X = (split[features] for split in splits)
    # pull the prediction target out of each split
    train_y, validate_y, test_y = (split[target] for split in splits)
    # return the new dataframes
    return train_X, validate_X, test_X, train_y, validate_y, test_y

In [11]:
# build the feature tables and target columns for each of the three splits
(train_X, validate_X, test_X,
 train_y, validate_y, test_y) = model_prep(train, validate, test)

In [12]:
def get_decision_tree(train_X, validate_X, train_y, validate_y, random_state=123):
    '''
    This function will use a decision tree machine learning model to predict 
    customer churn using the columns chosen during the exploration process.
    It fits the tree on the train data, prints the recall score on both the
    train and validate data, and returns the fitted model.

    Parameters
    ----------
    train_X, validate_X : feature tables for the train and validate splits.
    train_y, validate_y : the matching churn labels.
    random_state : int or None, default 123. Seed handed to
        DecisionTreeClassifier so that a fresh run of the notebook builds the
        same tree; pass None to get scikit-learn's unseeded behavior.

    Returns
    -------
    The fitted DecisionTreeClassifier.
    '''
    # make the decision tree object; scikit-learn permutes candidate features at
    # every split, so without a seed the fitted tree can differ from run to run
    dt = DecisionTreeClassifier(random_state=random_state)
    # fit the data to the dt object
    dt.fit(train_X, train_y)
    # predict with the dt object
    dt_preds = dt.predict(train_X)
    dt_val_preds = dt.predict(validate_X)
    # "Model Type" 
    # "evaluation metric" on train: "evaluation result" 
    # "evaluation metric" on validate: "evaluation result"
    print('Decision Tree Model')
    print(f'Recall score on train: {recall_score(train_y, dt_preds):.2%}')
    print(f'Recall score on validate: {recall_score(validate_y, dt_val_preds):.2%}')
    # return the decision tree model for use in other functions
    return dt

In [13]:
# fit the decision tree and report its recall on train and validate
dt = get_decision_tree(
    train_X=train_X,
    validate_X=validate_X,
    train_y=train_y,
    validate_y=validate_y,
)

Decision Tree Model
Recall score on train: 65.97%
Recall score on validate: 56.79%


In [14]:
def get_random_forest(train_X, validate_X, train_y, validate_y, random_state=123):
    '''
    This function will use a random forest machine learning model to predict 
    customer churn using the columns chosen during the exploration process.
    It fits the forest on the train data, prints the recall score on both the
    train and validate data, and returns the fitted model.

    Parameters
    ----------
    train_X, validate_X : feature tables for the train and validate splits.
    train_y, validate_y : the matching churn labels.
    random_state : int or None, default 123. Seed handed to
        RandomForestClassifier so that a fresh run of the notebook builds the
        same forest; pass None to get scikit-learn's unseeded behavior.

    Returns
    -------
    The fitted RandomForestClassifier.
    '''
    # make the random forest object; the bootstrap samples and per-split feature
    # subsets are random, so the seed is what makes the scores repeatable
    rf = RandomForestClassifier(random_state=random_state)
    # fit the data to the rf object
    rf.fit(train_X, train_y)
    # predict with the rf object
    rf_preds = rf.predict(train_X)
    rf_val_preds = rf.predict(validate_X)
    # "Model Type" 
    # "evaluation metric" on train: "evaluation result" 
    # "evaluation metric" on validate: "evaluation result"
    print('Random Forest Model')
    print(f'Recall score on train: {recall_score(train_y, rf_preds):.2%}')
    print(f'Recall score on validate: {recall_score(validate_y, rf_val_preds):.2%}')
    # return the random forest model for use in other functions
    return rf

In [15]:
# fit the random forest and report its recall on train and validate
rf = get_random_forest(
    train_X=train_X,
    validate_X=validate_X,
    train_y=train_y,
    validate_y=validate_y,
)

Random Forest Model
Recall score on train: 70.08%
Recall score on validate: 57.24%


In [16]:
def get_logistic_regression(train_X, validate_X, train_y, validate_y):
    '''
    Fit a logistic regression model on the train data, print its recall score
    on the train and validate data, and return the fitted model. The inputs are
    the feature tables and churn labels for the train and validate splits.
    '''
    # build the logistic regression estimator with its default settings
    model = LogisticRegression()
    # learn the coefficients from the training split only
    model.fit(train_X, train_y)
    # score both splits with the fitted model before anything is printed
    predicted_train = model.predict(train_X)
    predicted_validate = model.predict(validate_X)
    # report: model name, then recall on train, then recall on validate
    print('Logistic Regression Model')
    train_recall = recall_score(train_y, predicted_train)
    print(f'Recall score on train: {train_recall:.2%}')
    validate_recall = recall_score(validate_y, predicted_validate)
    print(f'Recall score on validate: {validate_recall:.2%}')
    # hand the fitted model back so later cells can reuse it
    return model

In [17]:
# fit the logistic regression and report its recall on train and validate
lr = get_logistic_regression(
    train_X=train_X,
    validate_X=validate_X,
    train_y=train_y,
    validate_y=validate_y,
)

Logistic Regression Model
Recall score on train: 51.05%
Recall score on validate: 54.12%


In [18]:
def get_rf_test(test_X, test_y, rf):
    '''
    Score an already fitted random forest model on the test data: predict churn
    for the test features and print the recall of those predictions against the
    true test labels. Nothing is returned.
    '''
    # predict churn for the test rows with the model that was passed in
    test_predictions = rf.predict(test_X)
    # report the model name followed by its recall on the test data
    print('Random Forest Model')
    test_recall = recall_score(test_y, test_predictions)
    print(f'Recall score on test: {test_recall:.2%}')

In [19]:
# evaluate the chosen random forest model on the held-out test split
get_rf_test(test_X=test_X, test_y=test_y, rf=rf)

Random Forest Model
Recall score on test: 55.61%


In [23]:
# create a df with predictions from the test data
predictions = pd.concat([
    # get the customer_ids, and reset the index to ordered 0-1408, dropping the old index
    pd.DataFrame(test.customer_id).reset_index().drop(columns='index'),
    # get the churn prediction from the final random forest ml model
    pd.DataFrame(rf.predict(test_X)).rename(columns={0:'prediction_of_churn'}),
    # get the churn probability predicitons from the final random forest model
    pd.DataFrame(rf.predict_proba(
        # drop the no_churn probalities since it is not needed
        test_X)).drop(columns=0).rename(
        # rename the column
        columns={1:'probability_of_churn'})
],axis=1)
predictions

Unnamed: 0,customer_id,prediction_of_churn,probability_of_churn
0,2851-MMUTZ,0,0.292968
1,4257-GAESD,0,0.128659
2,2688-BHGOG,1,0.696733
3,7005-CYUIL,0,0.032667
4,2228-BZDEE,0,0.027736
...,...,...,...
1404,5480-TBGPH,0,0.285667
1405,1098-TDVUQ,0,0.296536
1406,2486-WYVVE,0,0.181667
1407,5076-YVXCM,1,0.613517


In [22]:
# write the predictions to a csv file; left commented out so that re-running the notebook does not overwrite the existing file
# predictions.to_csv('predictions.csv')