In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from explore import tts

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import sklearn.preprocessing

import warnings
# NOTE(review): with no category or module argument this suppresses every
# warning for the rest of the session. Any solver convergence warnings raised
# while fitting the models below would be hidden too — consider narrowing the
# filter to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw customer churn records. The path is relative, so the CSV must
# sit in the notebook's working directory.
df=pd.read_csv('Customer-Churn-Records.csv')

In [3]:
# Remove the identifier columns and convert the remaining headers to
# snake_case ('Exited' becomes the 'churn' target).
df = (
    df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
      .rename(columns={
          'CreditScore': 'credit_score',
          'Geography': 'geography',
          'Gender': 'gender',
          'Age': 'age',
          'Tenure': 'tenure',
          'Balance': 'balance',
          'NumOfProducts': 'num_products',
          'HasCrCard': 'has_cr_card',
          'IsActiveMember': 'is_active_member',
          'EstimatedSalary': 'estimated_salary',
          'Exited': 'churn',
          'Complain': 'complain',
          'Satisfaction Score': 'satisfaction_score',
          'Card Type': 'card_type',
          'Point Earned': 'point_earned',
      })
)

In [4]:
df.head()

Unnamed: 0,credit_score,geography,gender,age,tenure,balance,num_products,has_cr_card,is_active_member,estimated_salary,churn,complain,satisfaction_score,card_type,point_earned
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


In [5]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column so it acts as the reference category.
df = pd.get_dummies(
    data=df,
    columns=['geography', 'gender', 'num_products', 'card_type', 'satisfaction_score'],
    drop_first=True,
)

In [6]:
df.head()

Unnamed: 0,credit_score,age,tenure,balance,has_cr_card,is_active_member,estimated_salary,churn,complain,point_earned,...,num_products_2,num_products_3,num_products_4,card_type_GOLD,card_type_PLATINUM,card_type_SILVER,satisfaction_score_2,satisfaction_score_3,satisfaction_score_4,satisfaction_score_5
0,619,42,2,0.0,1,1,101348.88,1,1,464,...,0,0,0,0,0,0,1,0,0,0
1,608,41,1,83807.86,0,1,112542.58,0,1,456,...,0,0,0,0,0,0,0,1,0,0
2,502,42,8,159660.8,1,0,113931.57,1,1,377,...,0,1,0,0,0,0,0,1,0,0
3,699,39,1,0.0,0,0,93826.63,0,0,350,...,1,0,0,1,0,0,0,0,0,1
4,850,43,2,125510.82,1,1,79084.1,0,0,425,...,0,0,0,1,0,0,0,0,0,1


In [7]:
df.shape

(10000, 23)

In [8]:
# Split with the project helper `explore.tts`. The shapes shown in the next
# cell (7200 / 1800 / 1000 rows out of 10000) correspond to a
# 72% / 18% / 10% train / validate / test split.
# NOTE(review): whether the split is stratified or seeded is not visible from
# this notebook — confirm in explore.py.
train, val, test=tts(df)

In [9]:
train.shape, val.shape, test.shape

((7200, 23), (1800, 23), (1000, 23))

In [10]:
def modeling_prep(path='Customer-Churn-Records.csv'):
    '''
    Load the customer churn CSV and return modeling-ready splits.

    Steps, in order:
      1. read the CSV at `path`
      2. drop the identifier columns (RowNumber, CustomerId, Surname)
      3. rename the remaining columns to snake_case ('Exited' -> 'churn')
      4. one-hot encode the categorical columns, dropping the first level
      5. split the encoded frame with `tts`

    Parameters
    ----------
    path : str or path-like, default 'Customer-Churn-Records.csv'
        Location of the raw CSV. The default keeps the original behaviour of
        reading the file from the current working directory.

    Returns
    -------
    tuple
        `(train, val, test)` as produced by `tts`.
    '''
    snake_case_names = {
        'CreditScore': 'credit_score',
        'Geography': 'geography', 'Gender': 'gender',
        'Age': 'age', 'Tenure': 'tenure', 'Balance': 'balance',
        'NumOfProducts': 'num_products', 'HasCrCard': 'has_cr_card',
        'IsActiveMember': 'is_active_member',
        'EstimatedSalary': 'estimated_salary', 'Exited': 'churn',
        'Complain': 'complain', 'Satisfaction Score': 'satisfaction_score',
        'Card Type': 'card_type', 'Point Earned': 'point_earned',
    }
    categorical_columns = ['geography', 'gender', 'num_products',
                           'card_type', 'satisfaction_score']

    # Chained, non-inplace calls: each step yields a new frame, so nothing is
    # mutated behind the caller's back.
    df = (
        pd.read_csv(path)
        .drop(columns=['RowNumber', 'CustomerId', 'Surname'])
        .rename(columns=snake_case_names)
    )
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

    train, val, test = tts(df)
    return train, val, test

In [11]:
train, val, test=modeling_prep()

In [12]:
train.shape, val.shape, test.shape

((7200, 23), (1800, 23), (1000, 23))

In [13]:
train['churn'].value_counts()

0    5745
1    1455
Name: churn, dtype: int64

In [14]:
def get_baseline(train):
    '''
    Print the baseline accuracy that a model needs to beat.

    The baseline is the accuracy obtained by always predicting the most
    frequent class of the `churn` column in `train`.

    Parameters
    ----------
    train : pandas.DataFrame
        Training split; must contain a `churn` column. It is only read,
        never modified.

    Returns
    -------
    None
        The result is reported through `print`.

    Raises
    ------
    ValueError
        If `train` has no rows, since no baseline can be computed.
    '''
    y_train = train['churn']
    if y_train.empty:
        raise ValueError('Cannot compute a baseline from an empty training set.')

    # Use the observed majority class instead of assuming it is 0, and compare
    # directly so no temporary column has to be added to the caller's frame.
    majority_class = y_train.mode().iloc[0]
    baseline = float((y_train == majority_class).mean())

    # Scale to a percentage before rounding: rounding first and multiplying
    # afterwards can leak float noise such as 56.99999999999999 into the text.
    print(f'The baseline our model needs to beat is: {round(baseline * 100, 2)}%')

In [15]:
get_baseline(train)

The baseline our model needs to beat is: 79.79%


In [18]:
# Separate the predictors from the `churn` target for the train and validate splits.
x_train, y_train = train.drop(columns=['churn']), train['churn']
x_val, y_val = val.drop(columns=['churn']), val['churn']

In [24]:
# Grid-search decision trees over min_samples_leaf (1-19, outer loop) and
# max_depth (1-19, inner loop), recording train and validate accuracy for each
# of the 361 combinations.
grid_rows = []
for leaf_size in range(1, 20):
    for depth in range(1, 20):
        tree = DecisionTreeClassifier(
            max_depth=depth, min_samples_leaf=leaf_size, random_state=8675309
        ).fit(x_train, y_train)
        grid_rows.append({
            'model': 'DecisionTreeClassifier',
            'train_accuracy': tree.score(x_train, y_train),
            'validate_accuracy': tree.score(x_val, y_val),
            'max_depth': depth,
            'min_sample_leaf': leaf_size,
        })

# Rank the combinations by the signed train-minus-validate gap, most negative first.
results = pd.DataFrame(data=grid_rows)
results['difference'] = results['train_accuracy'] - results['validate_accuracy']
results = results.sort_values('difference', ascending=True)

In [58]:
# Show the rows on either side of the sign change in `difference`.
# NOTE(review): 298:300 is a hard-coded position in the difference-sorted
# frame. It only lands on the rows nearest 0 for this exact grid and data, and
# silently selects other rows if either changes. Selecting by
# results['difference'].abs() would remove the dependency on row position.
results[298:300]

Unnamed: 0,model,train_accuracy,validate_accuracy,max_depth,min_sample_leaf,difference
124,DecisionTreeClassifier,0.99875,0.998889,11,7,-0.000139
21,DecisionTreeClassifier,0.998472,0.998333,3,2,0.000139


In [59]:
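# The two rows above have the train/validate accuracy gap closest to 0 in the decision-tree grid.
# NOTE(review): ~99.9% accuracy on both splits, against ~79% for logistic regression, may point to
# a feature that leaks the target — worth checking (e.g. `complain`) before trusting these scores.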

In [16]:
# TODO: tune the random forest hyperparameters (max_depth, min_samples_leaf) the same way as the decision tree.

In [None]:
# TODO: get the KNeighborsClassifier model working and add it to the model comparison.

In [25]:
def models(train, val):
    '''
    Fit a fixed set of classifiers and tabulate their accuracy.

    Each estimator is trained on `train` (all columns except `churn` as
    features, `churn` as target) and scored on both `train` and `val`.

    Parameters
    ----------
    train : pandas.DataFrame
        Training split containing a `churn` column.
    val : pandas.DataFrame
        Validation split with the same columns as `train`.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns `model`, `train_accuracy`,
        `validate_accuracy` and `difference` (train minus validate), sorted
        by `difference` from largest to smallest.
    '''
    target = 'churn'
    x_train, y_train = train.drop(columns=[target]), train[target]
    x_val, y_val = val.drop(columns=[target]), val[target]

    # (label, unfitted estimator) pairs, evaluated in this order.
    # TODO: add KNeighborsClassifier(n_neighbors=5, weights='uniform') fitted on
    # MinMaxScaler-transformed features once it is working.
    candidates = [
        ('LogisticRegression (lbfgs)',
         LogisticRegression(C=.5, random_state=8675309, intercept_scaling=1, solver='lbfgs')),
        ('LogisticRegression (liblinear)',
         LogisticRegression(C=1, random_state=8675309, solver='liblinear')),
        ('DecisionTreeClassifier',
         DecisionTreeClassifier(max_depth=11, min_samples_leaf=7, random_state=8675309)),
        ('RandomForestClassifier',
         RandomForestClassifier(max_depth=9, min_samples_leaf=10, random_state=8675309)),
    ]

    scores = []
    for label, estimator in candidates:
        estimator.fit(x_train, y_train)
        scores.append({
            'model': label,
            'train_accuracy': estimator.score(x_train, y_train),
            'validate_accuracy': estimator.score(x_val, y_val),
        })

    summary = pd.DataFrame(data=scores)
    summary['difference'] = summary['train_accuracy'] - summary['validate_accuracy']
    return summary.sort_values('difference', ascending=False)

In [26]:
models(train,val)

Unnamed: 0,model,train_accuracy,validate_accuracy,difference
0,LogisticRegression (lbfgs),0.791111,0.782222,0.008889
1,LogisticRegression (liblinear),0.793611,0.785,0.008611
2,DecisionTreeClassifier,0.998472,0.999444,-0.000972
3,RandomForestClassifier,0.998472,0.999444,-0.000972
