In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Classifiers
from sklearn.svm import SVR
# from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
# from mlxtend.Regressor import StackingCVRegressor #
from mlxtend.regressor import StackingCVRegressor
import xgboost as xgb

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

# Used to ignore warnings generated from StackingCVClassifier
import warnings
warnings.simplefilter('ignore')

from sklearn.model_selection import train_test_split

In [2]:

from sklearn.linear_model import LinearRegression

rf = RandomForestRegressor(random_state = 1000, n_jobs = -1)
lr = LinearRegression(n_jobs = -1)

In [3]:
df = pd.read_csv('datasets_52721_99691_student-mat.csv')

X = df.drop('G3', axis=1)
y = df[['G3']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2
172,GP,M,17,U,LE3,T,4,4,teacher,other,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,4,4,4,1,3,5,0,13,11
119,GP,M,15,U,GT3,T,3,4,other,other,reputation,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,4,3,1,2,4,6,14,13
389,MS,F,18,U,GT3,T,1,1,other,other,course,mother,2,2,1,no,no,no,yes,yes,yes,no,no,1,1,1,1,1,5,0,6,5
198,GP,F,17,U,GT3,T,4,4,services,teacher,home,mother,2,1,1,no,yes,no,no,yes,yes,yes,no,4,2,4,2,3,2,24,18,18
139,GP,F,15,U,GT3,T,4,4,teacher,teacher,course,mother,2,1,0,no,no,no,yes,yes,yes,yes,no,4,3,2,1,1,5,0,16,16


In [21]:
n_estimators = [10,20,50]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [5,15,20]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [1,3, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 3, 5]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
params = {
               'stackingcvregressor__randomforestregressor__n_estimators': n_estimators,  #randomforestregressor__bootstrap
               'stackingcvregressor__randomforestregressor__max_features': max_features,
               'stackingcvregressor__randomforestregressor__max_depth': max_depth,
               'stackingcvregressor__randomforestregressor__min_samples_split': min_samples_split,
               'stackingcvregressor__randomforestregressor__min_samples_leaf': min_samples_leaf,
               'stackingcvregressor__randomforestregressor__bootstrap': bootstrap,
#               "xgbclassifier__n_estimators": [10, 50, 100, 500],
#                 "xgbclassifier__learning_rate": [0.1, 0.5, 1],
         }


# params = {
#                'randomforestregressor__n_estimators': n_estimators,  #randomforestregressor__bootstrap
#                'randomforestregressor__max_features': max_features,
#                'randomforestregressor__max_depth': max_depth,
#                'randomforestregressor__min_samples_split': min_samples_split,
#                'randomforestregressor__min_samples_leaf': min_samples_leaf,
#                'randomforestregressor__bootstrap': bootstrap,
# # #               "xgbclassifier__n_estimators": [10, 50, 100, 500],
# # #                 "xgbclassifier__learning_rate": [0.1, 0.5, 1],
#          }

In [22]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

num_features = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_features = X_train.select_dtypes(include=['object']).columns

preprocess = make_column_transformer(
    (StandardScaler(), num_features),(OneHotEncoder(handle_unknown='ignore'),cat_features))

stacking_model = StackingCVRegressor(regressors=(rf,lr), 
                            meta_regressor=lr,
                            random_state=42)
# model = make_pipeline(
#     preprocess,
#     LogisticRegression())

# pipe_stack = make_pipeline(StandardScaler(), rf)
pipe_stack = Pipeline([('preprocessor', preprocess),('stackingcvregressor',stacking_model)])

grid = GridSearchCV(estimator=pipe_stack, 
                    param_grid=params, 
                    cv=3,
                    refit=True)
grid.fit(X_train, y_train)

KeyboardInterrupt: 

In [18]:
pipe_stack.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                    Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
          'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2'],
         dtype='object')),
                                   ('onehotencoder',
                                    OneHotEncoder(handle_unknown='ignore'),
                                    Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
          'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities',
          'nursery', 'higher', 'internet', 'romantic'],
         dtype='object'))])),
  ('stacking_model',
   StackingCVRegressor(meta_regressor=LinearRegression(n_jobs=-1), random_state=42,
                       regressors=(RandomForestRegressor(n_jobs=-1,
                                                         random_state=1000),
        

In [12]:
grid.predict(X_test)

array([ 8.94871795, 11.55      ,  0.        ,  8.94871795,  8.94871795,
       12.7       , 18.1       ,  7.09090909,  8.73333333, 13.76      ,
       15.23809524,  7.09090909, 13.88888889, 11.55      , 14.6       ,
        8.94871795,  0.        , 10.63157895, 15.23809524,  8.73333333,
       14.6       , 16.        , 15.23809524,  7.09090909,  8.73333333,
       17.9       , 10.63157895,  8.73333333, 18.1       , 11.6       ,
        8.94871795,  8.94871795, 15.23809524, 12.7       ,  0.        ,
        2.        ,  0.        , 15.23809524, 11.55      ,  8.94871795,
        7.09090909, 10.63157895, 13.88888889,  8.94871795, 15.23809524,
        0.        , 11.55      , 14.6       , 13.42857143, 15.23809524,
       13.46      , 15.23809524, 10.63157895,  8.94871795,  0.        ,
       13.42857143, 10.25      ,  0.        , 15.23809524, 16.        ,
       13.46      ,  8.94871795,  8.94871795,  2.        ,  7.09090909,
       17.9       ,  8.94871795, 10.25      , 10.25      , 15.23

In [13]:
y_test

Unnamed: 0,G3
78,10
371,12
248,5
55,10
390,9
223,13
42,18
234,6
316,0
116,14


In [16]:
print ("score = %3.2f" %(grid.score(X_test,y_test)))
print (grid.best_params_)

score = 0.67
{'randomforestregressor__bootstrap': False, 'randomforestregressor__max_depth': 5, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 3, 'randomforestregressor__min_samples_split': 3, 'randomforestregressor__n_estimators': 10}


In [9]:
cv_keys = ('mean_test_score', 'std_test_score', 'params')

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'