In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

BONUS FEATURES

In [2]:
# Read the survey extract from disk, then keep only the rows that have a value
# in every column (dropna with its defaults discards any row with a missing
# entry).
df = pd.read_csv('ESSdata_Thinkful.csv')
df = df.dropna()



In [3]:
# Target: the 'partner' column shifted down by one.
y = df['partner'] - 1

# Predictors: every column except 'partner', 'cntry' and 'idno'.
is_feature = ~df.columns.isin(['partner', 'cntry', 'idno'])
X = df.loc[:, is_feature]

# Expand the categorical 'cntry' column into one indicator column per distinct
# value and append those indicators to the predictors.
country_dummies = pd.get_dummies(df['cntry'])
X = pd.concat([X, country_dummies], axis=1)

# Positional 90/10 split: the first 90% of the rows train the model and the
# remaining rows are held out for testing.
# NOTE(review): rows are taken in file order with no shuffling -- confirm the
# file is not sorted (e.g. by 'cntry'); if it is, the held-out rows are not a
# representative sample.
offset = int(len(X) * 0.9)

X_train = X.iloc[:offset]
y_train = y.iloc[:offset]

X_test = X.iloc[offset:]
y_test = y.iloc[offset:]

Since we're now working with a binary outcome, we've switched to a classifier.  A classifier's loss function can't be built from the residuals the way a regressor's is.  Our options are "deviance" (spelled "log_loss" from scikit-learn 1.1 onward) or "exponential".  Deviance is the loss used for logistic regression, and we'll try that here.

In [4]:
# We'll make 500 iterations and use 2-deep trees.
#
# The loss is deliberately left at the estimator's default, which is the
# logistic (binomial deviance) loss.  scikit-learn called it 'deviance' until
# release 1.1 renamed it to 'log_loss', and release 1.3 rejects the old name,
# so passing the string explicitly ties this cell to one range of versions.
#
# random_state pins the estimator's internal randomness so that re-running the
# notebook reproduces the same tables.
params = {'n_estimators': 500,
          'max_depth': 2,
          'random_state': 42}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables (rows: actual class, columns: predicted class, plus totals).
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# The rates are derived from the label vectors rather than looked up in the
# tables: a lookup by label or position breaks (or reads a margin cell) when
# the model predicts a single class, because the crosstab then has no column
# for the other class.  Every rate is a fraction of all rows in the set.
train_accuracy = (y_train == predict_train).mean()
train_tI_errors = ((y_train == 0) & (predict_train == 1)).mean()   # actual 0, predicted 1
train_tII_errors = ((y_train == 1) & (predict_train == 0)).mean()  # actual 1, predicted 0

test_accuracy = (y_test == predict_test).mean()
test_tI_errors = ((y_test == 0) & (predict_test == 1)).mean()
test_tII_errors = ((y_test == 1) & (predict_test == 0)).mean()

print(
    'training_confusion: \n',
     table_train,
    '\nTraining set accuracy: {}\n'.format(train_accuracy),
    'Percent Type I errors: {}\n'.format(train_tI_errors),
    'Percent Type II errors: {}\n\n'.format(train_tII_errors),
    
    'test_confusion: \n' ,
     table_test,
    '\nTest set accuracy: {}\n'.format(test_accuracy),
    'Percent Type I errors: {}\n'.format(test_tI_errors),
    'Percent Type II errors: {}'.format(test_tII_errors)
)


training_confusion: 
 col_0     0.0   1.0   All
partner                  
0.0      4167   341  4508
1.0      1291  1533  2824
All      5458  1874  7332 
Training set accuracy: 0.7774140752864157
 Percent Type I errors: 0.04650845608292417
 Percent Type II errors: 0.17607746863066012

 test_confusion: 
 col_0    0.0  1.0  All
partner               
0.0      454   51  505
1.0      151  159  310
All      605  210  815 
Test set accuracy: 0.7521472392638037
 Percent Type I errors: 0.06257668711656442
 Percent Type II errors: 0.18527607361963191


In [6]:
# Best improvement was from changing parameters:
# more and deeper trees, a smaller learning rate, and a minimum leaf size.
#
# The loss is left at the estimator's default, which is the logistic
# (binomial deviance) loss.  scikit-learn called it 'deviance' until release
# 1.1 renamed it to 'log_loss', and release 1.3 rejects the old name, so
# passing the string explicitly ties this cell to one range of versions.
#
# random_state pins the estimator's internal randomness so that re-running the
# notebook reproduces the same tables.
params = {'n_estimators': 1000,
          'max_depth': 3,
          'learning_rate': .017,
          'subsample': 1,
          'min_samples_leaf': 92,
          'random_state': 42}


clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables (rows: actual class, columns: predicted class, plus totals).
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

# The rates are derived from the label vectors rather than looked up in the
# tables: a lookup by label or position breaks (or reads a margin cell) when
# the model predicts a single class, because the crosstab then has no column
# for the other class.  Every rate is a fraction of all rows in the set.
train_accuracy = (y_train == predict_train).mean()
train_tI_errors = ((y_train == 0) & (predict_train == 1)).mean()   # actual 0, predicted 1
train_tII_errors = ((y_train == 1) & (predict_train == 0)).mean()  # actual 1, predicted 0

test_accuracy = (y_test == predict_test).mean()
test_tI_errors = ((y_test == 0) & (predict_test == 1)).mean()
test_tII_errors = ((y_test == 1) & (predict_test == 0)).mean()

print(
    'training_confusion: \n',
     table_train,
    '\nTraining set accuracy: {}\n'.format(train_accuracy),
    'Percent Type I errors: {}\n'.format(train_tI_errors),
    'Percent Type II errors: {}\n\n'.format(train_tII_errors),
    
    'test_confusion: \n' ,
     table_test,
    '\nTest set accuracy: {}\n'.format(test_accuracy),
    'Percent Type I errors: {}\n'.format(test_tI_errors),
    'Percent Type II errors: {}'.format(test_tII_errors)
)

training_confusion: 
 col_0     0.0   1.0   All
partner                  
0.0      4171   337  4508
1.0      1307  1517  2824
All      5478  1854  7332 
Training set accuracy: 0.7757774140752864
 Percent Type I errors: 0.04596290234588107
 Percent Type II errors: 0.1782596835788325

 test_confusion: 
 col_0    0.0  1.0  All
partner               
0.0      462   43  505
1.0      151  159  310
All      613  202  815 
Test set accuracy: 0.7619631901840491
 Percent Type I errors: 0.05276073619631902
 Percent Type II errors: 0.18527607361963191
