In [1]:
#Importing all of our libraries in one shot:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, ExtraTreesRegressor, BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostRegressor
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.svm import SVC, SVR 
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator

In [2]:
#Restoring the `df` and `preds` variables that were previously saved with %store:

%store -r df
%store -r preds

In [3]:
#Previewing the first rows of the restored frame:
df.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,wage,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,4.025352,12.754289,5,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3.332205,11.474455,9,0,0,45,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3.496508,12.433686,13,7688,0,50,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3.258097,12.090325,13,0,0,45,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,3.688879,10.947204,14,14084,0,55,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
#Separating the target column ('wage') from the feature columns:

y = df['wage']
X = df.drop(columns = ['wage'])

In [5]:
#Instantiating StandardScaler (it is fitted on X in the next cell):

ss = StandardScaler()

In [6]:
#Fitting the scaler on X and displaying the scaled values.
#NOTE: the returned array is only shown as cell output -- it is not assigned to anything, and X is not reassigned here.

ss.fit_transform(X)

array([[ 1.21020085,  1.20274566, -1.95761158, ...,  0.34861264,
        -0.03508772, -0.02147849],
       [-0.70529478, -0.83610079, -0.41289103, ...,  0.34861264,
        -0.03508772, -0.02147849],
       [-0.25124723,  0.69200717,  1.13182952, ...,  0.34861264,
        -0.03508772, -0.02147849],
       ...,
       [-0.42402074,  1.35674103, -0.02671089, ...,  0.34861264,
        -0.03508772, -0.02147849],
       [-0.08864292,  1.37444309,  1.13182952, ...,  0.34861264,
        -0.03508772, -0.02147849],
       [ 0.28036788,  0.50479495,  1.13182952, ...,  0.34861264,
        -0.03508772, -0.02147849]])

In [7]:
#Applying the already-fitted scaler to preds; the result is only displayed, not stored:
ss.transform(preds)

array([[-1.01847589,  0.52974894, -1.1852513 , ...,  0.34861264,
        -0.03508772, -0.02147849],
       [ 0.13862009, -0.94595582, -0.41289103, ...,  0.34861264,
        -0.03508772, -0.02147849],
       [-0.70529478,  1.16037578,  0.74564938, ...,  0.34861264,
        -0.03508772, -0.02147849],
       ...,
       [ 0.13862009,  1.3307421 ,  1.13182952, ...,  0.34861264,
        -0.03508772, -0.02147849],
       [ 0.54375528, -1.05463804,  1.13182952, ...,  0.34861264,
        -0.03508772, -0.02147849],
       [-0.08864292,  0.18045919,  1.13182952, ...,  0.34861264,
        -0.03508772, -0.02147849]])

In [8]:
#We're going to create a custom Base Estimator that can switch between different classifiers:

class ClfSwitcher(BaseEstimator):
    """Thin wrapper that forwards every call to a swappable inner estimator.

    The wrapped model is stored as the ``estimator`` init parameter, so a
    parameter grid can both replace the model (``estimator``) and tune its
    hyper-parameters (``estimator__<name>``) in the same search.

    Parameters
    ----------
    estimator : estimator object, default=SGDClassifier()
        The model that ``fit``/``predict``/``predict_proba``/``score`` are
        delegated to.
        NOTE: the default instance is created once, when the class is defined,
        so wrappers built without an explicit ``estimator`` share that object.
        Pass an estimator explicitly when using several wrappers side by side.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        # Stored unchanged so get_params()/set_params() can see and replace it.
        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator on ``X``/``y`` and return ``self``.

        Extra keyword arguments are forwarded to the wrapped estimator's
        ``fit`` instead of being silently discarded.
        """
        self.estimator.fit(X, y, **kwargs)
        return self


    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for ``X`` (``y`` is ignored)."""
        return self.estimator.predict(X)


    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba(X)``.

        Only usable when the wrapped estimator itself provides ``predict_proba``.
        """
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        """Return the wrapped estimator's own ``score(X, y)``."""
        return self.estimator.score(X, y)

In [9]:
#Parameter grids for the pipeline's 'model' step. Each dict swaps in one classifier
#(through 'model__estimator') and lists the hyper-parameter values to try for it:
pipe_params = [
    {
        # liblinear handles both 'l1' and 'l2'; LogisticRegression's default solver
        # rejects 'l1', which would make that candidate fail in every fold.
        'model__estimator': [LogisticRegression(solver = 'liblinear')],
        'model__estimator__penalty': ['l1', 'l2'],
    },
    {
        'model__estimator': [RandomForestClassifier()],
        'model__estimator__min_samples_split': [2, 3, 4, 5],
        'model__estimator__min_samples_leaf': [13, 14, 15, 16, 17],
    },
    {
        'model__estimator': [AdaBoostClassifier()],
        'model__estimator__n_estimators': [70, 80, 90],
        'model__estimator__learning_rate': [1, 2],
        'model__estimator__algorithm': ['SAMME', 'SAMME.R'],
    },
    {
        'model__estimator': [KNeighborsClassifier()],
        'model__estimator__p': [1, 2],
        'model__estimator__leaf_size': [30, 20, 10],
    },
    {
        'model__estimator': [BaggingClassifier()],
        'model__estimator__n_estimators': [50, 100, 200, 300],
        'model__estimator__bootstrap': [True, False],
    },
    {
        # NOTE(review): 'degree' only affects the 'poly' kernel. With SVC's default
        # kernel the three degree values fit identical models -- confirm whether
        # kernel = 'poly' was intended, or drop 'degree' to save fits.
        'model__estimator': [SVC()],
        'model__estimator__degree': [2, 3, 4],
        'model__estimator__C': [10, 15, 20, 100, 150, 200],
    },
    {
        # No hyper-parameters searched for this model.
        'model__estimator': [GaussianNB()],
    },
]

In [10]:
#I'm going to write a function that will do our model analysis for us with one line of code. This will be much easier than going through and 
#changing everything manually:

def model_analyze(X, y, param_grid = None, cv = 5, n_jobs = 4, random_state = 42):
    """Grid-search the candidate models on a train split and report the results.

    The data is split with ``train_test_split`` (default proportions, seeded by
    ``random_state``). A pipeline of ``StandardScaler`` followed by
    ``ClfSwitcher`` is then tuned with ``GridSearchCV`` on the training part.
    Keeping the scaler inside the pipeline means it is fitted on the training
    folds only, and the models actually receive standardized features.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` and the pipeline.
    y : target values aligned with ``X``.
    param_grid : list of dict or dict, optional
        Grid passed to ``GridSearchCV``. Keys must address the ``'model'`` step
        (``model__...``). Defaults to the notebook-level ``pipe_params``.
    cv : int, default=5
        Cross-validation setting passed to ``GridSearchCV``.
    n_jobs : int, default=4
        Number of parallel workers passed to ``GridSearchCV``.
    random_state : int, default=42
        Seed for the train/test split.

    Returns
    -------
    GridSearchCV
        The fitted search, so the best model can be reused afterwards
        (e.g. ``.predict`` on new rows, ``.best_estimator_``, ``.cv_results_``).
    """
    if param_grid is None:
        # Fall back to the grid defined at notebook level.
        param_grid = pipe_params

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = random_state)

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', ClfSwitcher()),
    ])

    gs = GridSearchCV(pipe, param_grid, cv = cv, verbose = 3, n_jobs = n_jobs)
    gs.fit(X_train, y_train)

    # gs.score() evaluates the refitted best pipeline; best_score_ is its mean CV score.
    print(f'Training Data Score: {gs.score(X_train, y_train)}')
    print(f'Testing Data Score: {gs.score(X_test, y_test)}')
    print(f'Cross Val Score: {gs.best_score_}')
    print(f'Best Params: {gs.best_params_}')

    return gs

In [11]:
#Running the full grid search on our data and printing the scores:
model_analyze(X, y)

Fitting 5 folds for each of 67 candidates, totalling 335 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    9.9s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 335 out of 335 | elapsed:  1.2min finished


Training Data Score: 0.8624718179954909
Testing Data Score: 0.8598647818070068
Cross Val Score: 0.8491492223623371
Best Params: {'model__estimator': AdaBoostClassifier(learning_rate=1, n_estimators=70), 'model__estimator__algorithm': 'SAMME.R', 'model__estimator__learning_rate': 1, 'model__estimator__n_estimators': 70}
