In [10]:
import pandas as pd
# import numpy as np

from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# from sklearn.metrics import accuracy_score

In [2]:
wine = pd.read_pickle('../../data/02_intermediate/df_robust_scaled.pkl')
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.238095,0.72,-0.787879,-0.428571,-0.15,-0.214286,-0.1,0.469799,1.052632,-0.333333,-0.5,0
1,-0.047619,1.44,-0.787879,0.571429,0.95,0.785714,0.725,0.022371,-0.578947,0.333333,-0.25,0
2,-0.047619,0.96,-0.666667,0.142857,0.65,0.071429,0.4,0.111857,-0.263158,0.166667,-0.25,0
3,1.571429,-0.96,0.909091,-0.428571,-0.2,0.214286,0.55,0.559284,-0.789474,-0.222222,-0.25,1
4,-0.238095,0.72,-0.787879,-0.428571,-0.15,-0.214286,-0.1,0.469799,1.052632,-0.333333,-0.5,0


## Set predictors and predicted

In [3]:
X = wine.drop(['quality'], axis=1)
y = wine['quality']

## Train test split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

## Scale Everything by several scaling options

In [12]:
std_scaler = StandardScaler()
std_X_train = std_scaler.fit_transform(X_train)
std_X_test = std_scaler.transform(X_test)

rob_scaler = RobustScaler()
rob_X_train = rob_scaler.fit_transform(X_train)
rob_X_test = rob_scaler.transform(X_test)

# poly_scaler = PolynomialFeatures(degree=2)
# poly_X = poly_scaler(X)
# poly_X_test = poly_scaler.transform(X_test)

mM_scaler = MinMaxScaler()
mM_X_train = mM_scaler.fit_transform(X_train)
mM_X_test = mM_scaler.transform(X_test)

## Testing accuracy from Logistic model on std_X

In [14]:
pca = PCA()
log_mod = LogisticRegression(solver='lbfgs')

pipe = Pipeline(steps = [('pca', pca), ('log_mod', log_mod)])

param_grid = {
    'pca__n_components' : [3, 5, 7, 9],
    'log_mod__penalty' : ['l2'],
    'log_mod__C' : [.001, .01, .1, 1, 10, 100, 1000],
}

gs = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy")
gs.fit(std_X_train, y_train)
gs.score(std_X_test, y_test)
print(gs.best_params_)

{'log_mod__C': 0.1, 'log_mod__penalty': 'l2', 'pca__n_components': 9}


In [24]:
def param_grid_tuner(param_grid, best_params):
    for param, value in best_params.items():
        if type(value)==int:
            num_options = len(param_grid[param])
            if num_options==1:
                continue
            index = param_grid[param].index(value)
            if index==0:
                param_grid[param] = [value, int((value+param_grid[param][index+1])/2), param_grid[param][index+1]]
                continue
            elif index==num_options-1:
                param_grid[param] = [param_grid[param][index-1], int((value+param_grid[param][index-1])/2), value]
                continue
            else:
                param_grid[param] = [param_grid[param][index-1], int((value+param_grid[param][index-1])/2), value, 
                                     int((value+param_grid[param][index+1])/2), param_grid[param][index+1]]
        if type(value)==float:
            num_options = len(param_grid[param])
            if num_options==1:
                continue
            index = param_grid[param].index(value)
            if index==0:
                param_grid[param] = [value, (value+param_grid[param][index+1])/2, param_grid[param][index+1]]
                continue
            elif index==num_options-1:
                param_grid[param] = [param_grid[param][index-1], (value+param_grid[param][index-1])/2, value]
                continue
            else:
                param_grid[param] = [param_grid[param][index-1], (value+param_grid[param][index-1])/2, value, 
                                     (value+param_grid[param][index+1])/2, param_grid[param][index+1]]            
        

    return param_grid            

In [30]:
param_grid = {
    'pca__n_components' : [1, 6, 11],
    'log_mod__penalty' : ['l2'],
    'log_mod__C' : [.001, .01, .1, 1.0, 10.0, 100.0, 1000.0],
}

for i in range(4):
    gs = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy")
    gs.fit(std_X_train, y_train)
    print(gs.score(std_X_test, y_test))
    param_grid = param_grid_tuner(param_grid, gs.best_params_)
    print(gs.best_params_)
    print(param_grid)

0.7225
{'log_mod__C': 1.0, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [6, 8, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [0.1, 0.55, 1.0, 5.5, 10.0]}
0.725
{'log_mod__C': 0.55, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [8, 9, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [0.1, 0.325, 0.55, 0.775, 1.0]}
0.7275
{'log_mod__C': 0.1, 'log_mod__penalty': 'l2', 'pca__n_components': 9}
{'pca__n_components': [8, 8, 9, 10, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [0.1, 0.21250000000000002, 0.325]}
0.7275
{'log_mod__C': 0.1, 'log_mod__penalty': 'l2', 'pca__n_components': 9}
{'pca__n_components': [8, 8, 9, 9, 10], 'log_mod__penalty': ['l2'], 'log_mod__C': [0.1, 0.15625, 0.21250000000000002]}


## Testing accuracy from Logistic model on rob_X

In [32]:
param_grid = {
    'pca__n_components' : [1, 6, 11],
    'log_mod__penalty' : ['l2'],
    'log_mod__C' : [.001, .01, .1, 1.0, 10.0, 100.0, 1000.0],
}

for i in range(5):
    gs = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy")
    gs.fit(rob_X_train, y_train)
    print(gs.score(rob_X_test, y_test))
    param_grid = param_grid_tuner(param_grid, gs.best_params_)
    print(gs.best_params_)
    print(param_grid)

0.7225
{'log_mod__C': 0.1, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [6, 8, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [0.01, 0.055, 0.1, 0.55, 1.0]}
0.7225
{'log_mod__C': 0.1, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [8, 9, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [0.055, 0.0775, 0.1, 0.325, 0.55]}
0.7225
{'log_mod__C': 0.1, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [9, 10, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [0.0775, 0.08875, 0.1, 0.21250000000000002, 0.325]}
0.72
{'log_mod__C': 0.08875, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [10, 10, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [0.0775, 0.083125, 0.08875, 0.094375, 0.1]}
0.72
{'log_mod__C': 0.083125, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [10, 10, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [0.0775, 0.08031250000000001, 0.083125, 0.0859375, 0.08875

## Testing accuracy from Logistic model on mM_X

In [33]:
param_grid = {
    'pca__n_components' : [1, 6, 11],
    'log_mod__penalty' : ['l2'],
    'log_mod__C' : [.001, .01, .1, 1.0, 10.0, 100.0, 1000.0],
}

for i in range(5):
    gs = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy")
    gs.fit(mM_X_train, y_train)
    print(gs.score(mM_X_test, y_test))
    param_grid = param_grid_tuner(param_grid, gs.best_params_)
    print(gs.best_params_)
    print(param_grid)

0.7175
{'log_mod__C': 1.0, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [6, 8, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [0.1, 0.55, 1.0, 5.5, 10.0]}
0.7275
{'log_mod__C': 5.5, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [8, 9, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [1.0, 3.25, 5.5, 7.75, 10.0]}
0.7275
{'log_mod__C': 5.5, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [9, 10, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [3.25, 4.375, 5.5, 6.625, 7.75]}
0.7275
{'log_mod__C': 5.5, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [10, 10, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [4.375, 4.9375, 5.5, 6.0625, 6.625]}
0.7275
{'log_mod__C': 5.5, 'log_mod__penalty': 'l2', 'pca__n_components': 11}
{'pca__n_components': [10, 10, 11], 'log_mod__penalty': ['l2'], 'log_mod__C': [4.9375, 5.21875, 5.5, 5.78125, 6.0625]}


In [87]:
pca = PCA(n_components=7)
principal_comps = pca.fit_transform(mM_X_train)
# pca.explained_variance_ratio_
log_reg = LogisticRegression(solver='lbfgs', C=0.1)
log_reg.fit(principal_comps, mM_y_train)
print(log_reg.score(principal_comps, mM_y_train),
log_reg.score(pca.transform(mM_X_test), mM_y_test))

0.712 0.656


## Testing accuracy from RadnomForest model on std_X

In [90]:
pca = PCA()
rand_forest = RandomForestClassifier()

pipe = Pipeline(steps = [('pca', pca), ('rand_forest', rand_forest)])

param_grid = {
    'pca__n_components' : [3, 4, 5],
    'rand_forest__n_estimators' : [5, 6, 7],
    'rand_forest__criterion': ['gini'],
    'rand_forest__max_depth': [2]
}

gs = GridSearchCV(pipe, param_grid, cv = 5, scoring = "accuracy")
gs.fit(mM_X_train, mM_y_train)
gs.score(mM_X_test, mM_y_test)
print(gs.best_params_)

{'pca__n_components': 5, 'rand_forest__criterion': 'gini', 'rand_forest__max_depth': 2, 'rand_forest__n_estimators': 6}




In [None]:
pca = PCA(n_components=5)


In [None]:
!