### Using Gradient Boosting to Improve Scores:

In [329]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn.metrics import mean_squared_error

In [330]:
# Importing the data and then dropping some of the unnecessary columns:

df = pd.read_csv((
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/ESS_practice_data/ESSdata_Thinkful.csv")).dropna()
df = pd.DataFrame(df.drop(['cntry', 'idno', 'year'], axis = 1))

In [331]:
df['partner'] = (df.partner.values) - 1
df['ageb'] = df['agea']

In [332]:
# More realistic/meaningful feature engineering:

# Grouped by age:

grp_1 = np.arange(17, 23, 1)
grp_2 = np.arange(23, 33, 1)
grp_3 = np.arange(33, 43, 1)
grp_4 = np.arange(43, 48, 1)
grp_5 = np.arange(48, 55, 1)
grp_6 = np.arange(54, 68, 1)
grp_7 = np.arange(68, 74, 1)
grp_8 = np.arange(74, 80, 1)
grp_9 = np.arange(80, 85, 1)
grp_10 = np.arange(85, 115, 1)

df.ageb.replace(grp_1, 1, inplace = True)
df.ageb.replace(grp_2, 2, inplace = True)
df.ageb.replace(grp_3, 3, inplace = True)
df.ageb.replace(grp_4, 4, inplace = True)
df.ageb.replace(grp_5, 5, inplace = True)
df.ageb.replace(grp_6, 6, inplace = True)
df.ageb.replace(grp_7, 7, inplace = True)
df.ageb.replace(grp_8, 8, inplace = True)
df.ageb.replace(grp_9, 9, inplace = True)
df.ageb.replace(grp_10, 10, inplace = True)

In [333]:
# Making another column on Happiness:

df['happy_group'] = df['happy']
df['happy_group'] = df.happy_group.replace({'happy_group':{'0':3, '1':3, '2':3,  
                                       '3':2, '4':2, '5':2, '6':2,
                                       '7':1, '8':1, '9':1, '10':1}})

In [481]:
df.groupby('agea').agg('count')

Unnamed: 0_level_0,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,partner,ageb,happy_group,avg_happiness,happiness_factor,avg_helpful,helpful_factor
agea,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
15.0,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23
16.0,118,118,118,118,118,118,118,118,118,118,118,118,118,118,118
17.0,143,143,143,143,143,143,143,143,143,143,143,143,143,143,143
18.0,138,138,138,138,138,138,138,138,138,138,138,138,138,138,138
19.0,126,126,126,126,126,126,126,126,126,126,126,126,126,126,126
20.0,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
21.0,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109
22.0,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129
23.0,135,135,135,135,135,135,135,135,135,135,135,135,135,135,135
24.0,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114


In [334]:
# Behold the happiness factor:

df['avg_happiness'] = df.groupby('agea').transform('mean')['happy']
df['happiness_factor'] = df.happy - df.avg_happiness

### Model 1: 

In [335]:
# Importing the packages:

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix

plusminus = u"\u00B1"

In [336]:
# Selecting Data and splitting for train/test:

X = df.drop('partner', 1)
y = df['partner']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25)

In [337]:
# Fitting the model and defining our score variables:

gb1 = ensemble.GradientBoostingClassifier(n_estimators = 200, max_depth = 2)
gb1.fit(X_train, y_train)

y_pred_ = gb1.predict(X_test)

gb1_confusion = confusion_matrix(y_test, y_pred_, labels = [0, 1])
gb1_cv = cross_val_score(gb1, X, y, cv = 5)
gb1_train_score = gb1.score(X_train, y_train)
gb1_test_score = gb1.score(X_test, y_test)

In [338]:
print("The training score is: {:.2%}".format(gb1_train_score))
print("\nThe mean cross-validation score is: {:.2%} {}{:.2%}\n".format(gb1_cv.mean(), plusminus, gb1_cv.std()))
print("The cv scores are: \n{}".format(gb1_cv))
print("\nThe test score is: {:.2%}".format(gb1_test_score))

The training score is: 77.12%

The mean cross-validation score is: 75.27% ±1.32%

The cv scores are: 
[0.76687117 0.73558282 0.76932515 0.74462861 0.74692875]

The test score is: 74.72%


In [339]:
gb1_confusion_df = pd.DataFrame(gb1_confusion)

FP = gb1_confusion_df.loc[0, 1]
FN = gb1_confusion_df.loc[1, 0]
NegT = gb1_confusion_df.iloc[0].sum()
PosT = gb1_confusion_df.iloc[1].sum()

gb1_confusion_df

Unnamed: 0,0,1
0,1139,116
1,399,383


In [340]:
print('False Positive (Type I Error): {}\n({:.2%})\n'.format(FP, (FP / NegT)))
print('False Negative (Type II Error): {}\n({:.2%})\n'.format(FN, (FN / PosT)))

False Positive (Type I Error): 116
(9.24%)

False Negative (Type II Error): 399
(51.02%)



In [341]:
feat_imp = pd.DataFrame(gb1.feature_importances_.round(2), index = X_train.columns, columns = ["Importances"])
display(feat_imp.sort_values('Importances', ascending = False))

Unnamed: 0,Importances
agea,0.71
happiness_factor,0.16
sclmeet,0.04
tvtot,0.02
gndr,0.02
ppltrst,0.01
pplfair,0.01
pplhlp,0.01
sclact,0.01
ageb,0.01


> While our training score is still higher than our test score, we are nearing our goal of 85% consistency with as low of a standard deviation as possible.  The interesting thing is that our engineered feature is second most important.

> For our next model, we will try tweaking our estimator and depth count.

### Model 2:

> Here we are tweaking things a bit more.  We increased the number of estimators while keeping the max depth of each estimator to 2.  The learning rate was raised considerably (from a default of .1 to a .7, which gives each tree a little more say in the prediction compared to the initial model).  We took a subsample of the dataset to try and limit overfitting as well. This is more overfit than the model above, so wel will continue tweaking things (perhaps lowering the number of estimators and increasing the max-depth by one level).

In [475]:
# Fitting the model and defining our score variables:

gb2 = ensemble.GradientBoostingClassifier(n_estimators = 150, 
                                          max_depth = 8, learning_rate = .7, subsample = .7, presort = True)
gb2.fit(X_train, y_train)

y_pred_ = gb2.predict(X_test)

gb2_confusion = confusion_matrix(y_test, y_pred_, labels = [0, 1])
gb2_cv = cross_val_score(gb2, X, y, cv = 5)
gb2_train_score = gb2.score(X_train, y_train)
gb2_test_score = gb2.score(X_test, y_test)

In [476]:
print("The training score is: {:.2%}".format(gb2_train_score))
print("\nThe mean cross-validation score is: {:.2%} {}{:.2%}\n".format(gb2_cv.mean(), plusminus, gb2_cv.std()))
print("The cv scores are: \n{}".format(gb2_cv))
print("\nThe test score is: {:.2%}".format(gb2_test_score))

The training score is: 99.53%

The mean cross-validation score is: 66.96% ±2.11%

The cv scores are: 
[0.69509202 0.64171779 0.64907975 0.68876611 0.67321867]

The test score is: 67.55%


In [477]:
gb2_confusion_df = pd.DataFrame(gb2_confusion)

FP = gb2_confusion_df.loc[0, 1]
FN = gb2_confusion_df.loc[1, 0]
NegT = gb2_confusion_df.iloc[0].sum()
PosT = gb2_confusion_df.iloc[1].sum()

gb2_confusion_df

Unnamed: 0,0,1
0,925,330
1,331,451


In [478]:
print('False Positive (Type I Error): {}\n({:.2%})\n'.format(FP, (FP / NegT)))
print('False Negative (Type II Error): {}\n({:.2%})\n'.format(FN, (FN / PosT)))

False Positive (Type I Error): 330
(26.29%)

False Negative (Type II Error): 331
(42.33%)



In [479]:
feat_imp = pd.DataFrame(gb2.feature_importances_.round(2), index = X_train.columns, columns = ["Importances"])
display(feat_imp.sort_values('Importances', ascending = False))

Unnamed: 0,Importances
agea,0.18
happiness_factor,0.17
pplhlp,0.1
pplfair,0.09
sclmeet,0.09
tvtot,0.08
ppltrst,0.08
avg_happiness,0.08
sclact,0.05
gndr,0.04


### Model 3:

In [347]:
df[0:2]

Unnamed: 0,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,agea,partner,ageb,happy_group,avg_happiness,happiness_factor
0,3.0,3.0,10.0,5.0,8.0,5.0,4.0,2.0,60.0,0.0,6.0,8.0,7.571429,0.428571
1,6.0,5.0,7.0,5.0,9.0,3.0,2.0,2.0,59.0,0.0,6.0,9.0,7.626087,1.373913


In [356]:
df2 = pd.DataFrame(df)

In [357]:
df2['avg_helpful'] = df2.groupby('agea').transform('mean')['pplhlp']
df2['helpful_factor'] = df2.pplhlp - df2.avg_helpful

In [381]:
X2 = df2.drop(['partner', 'ageb', 'happy_group', 'gndr', 'tvtot', 'pplhlp', 'happy', 'sclact'], 1)
y2 = df2['partner']

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = .25)

In [462]:
# Fitting the model and defining our score variables:

gb3 = ensemble.GradientBoostingClassifier(n_estimators = 175, 
                                          max_depth = 2, learning_rate = .75, subsample = .2, presort = True)
gb3.fit(X2_train, y2_train)

y2_pred_ = gb3.predict(X2_test)

gb3_confusion = confusion_matrix(y2_test, y2_pred_, labels = [0, 1])
gb3_cv = cross_val_score(gb3, X2, y2, cv = 5)
gb3_train_score = gb3.score(X2_train, y2_train)
gb3_test_score = gb3.score(X2_test, y2_test)

In [463]:
print("The training score is: {:.2%}".format(gb3_train_score))
print("\nThe mean cross-validation score is: {:.2%} {}{:.2%}\n".format(gb3_cv.mean(), plusminus, gb3_cv.std()))
print("The cv scores are: \n{}".format(gb3_cv))
print("\nThe test score is: {:.2%}".format(gb3_test_score))

The training score is: 71.78%

The mean cross-validation score is: 70.07% ±1.39%

The cv scores are: 
[0.7202454  0.69509202 0.6803681  0.71147944 0.6965602 ]

The test score is: 69.71%


In [464]:
gb3_confusion_df = pd.DataFrame(gb3_confusion)

FP = gb3_confusion_df.loc[0, 1]
FN = gb3_confusion_df.loc[1, 0]
NegT = gb3_confusion_df.iloc[0].sum()
PosT = gb3_confusion_df.iloc[1].sum()

gb3_confusion_df

Unnamed: 0,0,1
0,969,265
1,352,451


In [465]:
print('False Positive (Type I Error): {}\n({:.2%})\n'.format(FP, (FP / NegT)))
print('False Negative (Type II Error): {}\n({:.2%})\n'.format(FN, (FN / PosT)))

False Positive (Type I Error): 265
(21.47%)

False Negative (Type II Error): 352
(43.84%)



In [466]:
feat_imp = pd.DataFrame(gb3.feature_importances_.round(2), index = X2_train.columns, columns = ["Importances"])
display(feat_imp.sort_values('Importances', ascending = False))

Unnamed: 0,Importances
helpful_factor,0.44
pplfair,0.15
agea,0.13
happiness_factor,0.1
ppltrst,0.08
avg_happiness,0.05
sclmeet,0.02
avg_helpful,0.02


In [469]:
df2.iloc[3:4, 4:6]

Index(['tvtot', 'ppltrst', 'pplfair', 'pplhlp', 'happy', 'sclmeet', 'sclact',
       'gndr', 'agea', 'partner', 'ageb', 'happy_group', 'avg_happiness',
       'happiness_factor', 'avg_helpful', 'helpful_factor'],
      dtype='object')