In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [77]:
df = pd.read_csv('../../data/Customer Churn Data.csv')
df['international plan'] = (df['international plan'] == 'yes').astype(int)
df['voice mail plan'] = (df['voice mail plan'] == 'yes').astype(int)

ohe = OneHotEncoder(sparse = False)
ohe_states = pd.DataFrame(ohe.fit_transform(pd.DataFrame(df['state'])), columns = ohe.get_feature_names())
df = pd.concat([df, ohe_states], axis = 1)
df = df.drop(['state'], axis = 1)

df['daily rev'] = (df['total day charge'] + df['total eve charge'] + df['total night charge'] + df['total intl charge']) / df['account length']

In [78]:
y = df['churn']
X = df.copy()
X.drop(['churn', 'area code', 'phone number'], axis = 1, inplace = True)

In [79]:
def metrics(y_true, y_pred):
    print('Accuracy: ' + str(accuracy_score(y_true, y_pred)))
    print('Precision: ' + str(precision_score(y_true, y_pred)))
    print('Recall: ' + str(recall_score(y_true, y_pred)))
    print('F1: ' + str(f1_score(y_true, y_pred)))
    
def model(X_train, X_test, y_train, y_test):
    estimators = [('rf', RandomForestClassifier()),
                  ('log', LogisticRegression(solver = 'liblinear')),
                  ('grad', GradientBoostingClassifier())]
    stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 5)
    stack.fit(X_train, y_train)
    metrics(y_test, stack.predict(X_test))
    return stack

X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state = 42)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, random_state = 420)

ss = StandardScaler()
X_train1 = ss.fit_transform(X_train1)
X_test1 = ss.transform(X_test1)

model = model(X_train1, X_test1, y_train1, y_test1);

Accuracy: 0.9584
Precision: 0.9493670886075949
Recall: 0.7731958762886598
F1: 0.8522727272727272


In [7]:
for i in model.estimators_:
    metrics(y_test1, i.predict(X_test1))

Accuracy: 0.9504
Precision: 0.9583333333333334
Recall: 0.711340206185567
F1: 0.8165680473372781
Accuracy: 0.8448
Precision: 0.5
Recall: 0.17525773195876287
F1: 0.2595419847328244
Accuracy: 0.9568
Precision: 0.9487179487179487
Recall: 0.7628865979381443
F1: 0.8457142857142856


In [12]:
features_importances = {}
for i in model.estimators_:
    try:
        features = pd.DataFrame(i.feature_importances_).transpose()
        features.columns = X.columns
        features_importances[i] = features
    except:
        features_importances[i] = ':('


In [14]:
features_importances[model.estimators_[2]].transpose()[0].sort_values(ascending = False).head(17)

total day charge          0.187968
international plan        0.171371
customer service calls    0.146247
total day minutes         0.110133
total eve minutes         0.078664
total intl calls          0.060388
total eve charge          0.057893
voice mail plan           0.047395
number vmail messages     0.040193
total intl charge         0.023149
total intl minutes        0.023033
total night minutes       0.017999
total night charge        0.017479
total day calls           0.009325
total eve calls           0.004263
total night calls         0.002567
account length            0.001933
Name: 0, dtype: float64

In [9]:
model.estimators_

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            