In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn import ensemble, naive_bayes, tree


from config import config

In [2]:

reduced_data = pd.read_excel('./output/reduced_data.xlsx')


X_train, X_test, y_train, y_test = train_test_split(
    reduced_data.drop(config.TARGET, axis=1), 
    reduced_data[config.TARGET], 
    test_size=0.20, 
    stratify=reduced_data[config.TARGET], 
    random_state=config.RANDOM_STATE)

In [3]:
# Internal modules
from config import config

def BaseLineModels(models, X, y):
    
    row_index = 0
    cv_split = ShuffleSplit(n_splits = 10, test_size = .3, train_size = .7, random_state = 42)
    baseline_output = pd.DataFrame(columns=['model', 'mean_train_acc', 'mean_test_acc', 'parameters'])
    
    for model in [models[key]['model'] for key in models]:
        baseline_output.loc[row_index, 'model'] = model.__class__.__name__
        cross_validation_result = cross_validate(model, X, y, cv = cv_split, return_train_score=True, scoring='f1')
        model_parameters = model.fit(X, y).get_params()
        
        baseline_output.loc[row_index, 'mean_train_acc'] = cross_validation_result['train_score'].mean()
        baseline_output.loc[row_index, 'mean_test_acc'] = cross_validation_result['test_score'].mean()
        baseline_output.loc[row_index, 'parameters'] = [model_parameters]
        row_index+=1
        
        
        
    baseline_output.sort_values(by='mean_test_acc', ascending=False, inplace=True)
    
    row_index = 0
    tuned_output = pd.DataFrame(columns=['model', 'mean_train_acc_tuned', 'mean_test_acc_tuned', 'parameters_tuned'])
    
    for model in [models[key] for key in models]:
        tuned_output.loc[row_index, 'model'] = model['model'].__class__.__name__
        tuned_model = RandomizedSearchCV(model['model'], param_distributions=model['param_grid'], scoring = 'f1', cv = cv_split, return_train_score=True)
        tuned_model.fit(X, y)

        tuned_output.loc[row_index, 'mean_train_acc_tuned'] = tuned_model.cv_results_['mean_train_score'][tuned_model.best_index_]
        tuned_output.loc[row_index, 'mean_test_acc_tuned'] = tuned_model.cv_results_['mean_test_score'][tuned_model.best_index_]
        tuned_output.loc[row_index, 'parameters_tuned'] = [tuned_model.best_params_]
        row_index+=1

    

    output = baseline_output.join(tuned_output.set_index('model'), on='model')
    output.sort_values(by='mean_test_acc_tuned', ascending=False, inplace=True)
    
    return output #baseline_output, tuned_output

In [4]:
scaler = StandardScaler()
output = BaseLineModels(config.MODELS, scaler.fit_transform(X_train), y_train)

Traceback (most recent call last):
  File "C:\Users\Calle\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Calle\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1787, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Calle\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

Traceback (most recent call last):
  File "C:\Users\Calle\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Calle\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1787, in fit
    solver = _check_solver(self.solve

In [5]:
output

Unnamed: 0,model,mean_train_acc,mean_test_acc,parameters,mean_train_acc_tuned,mean_test_acc_tuned,parameters_tuned
1,LogisticRegressionCV,0.579227,0.566728,"[{'Cs': 10, 'class_weight': None, 'cv': None, ...",0.660454,0.664652,"[{'random_state': 42, 'penalty': 'l2', 'max_it..."
3,GaussianNB,0.630829,0.627016,"[{'priors': None, 'var_smoothing': 1e-09}]",0.630829,0.627016,[{}]
6,RandomForestClassifier,0.995055,0.602756,"[{'bootstrap': True, 'ccp_alpha': 0.0, 'class_...",0.697248,0.619469,"[{'random_state': 42, 'n_estimators': 30, 'min..."
2,BernoulliNB,0.603541,0.609561,"[{'alpha': 1.0, 'binarize': 0.0, 'class_prior'...",0.603541,0.609561,[{}]
5,AdaBoostClassifier,0.667769,0.605457,"[{'algorithm': 'SAMME.R', 'base_estimator': No...",0.667769,0.605457,[{'base_estimator': None}]
4,DecisionTreeClassifier,0.995132,0.535068,"[{'ccp_alpha': 0.0, 'class_weight': None, 'cri...",0.655849,0.575803,"[{'splitter': 'random', 'random_state': 42, 'm..."
0,KNeighborsClassifier,0.701791,0.559939,"[{'algorithm': 'auto', 'leaf_size': 30, 'metri...",0.995132,0.561724,"[{'weights': 'distance', 'p': 1, 'n_neighbors'..."


In [None]:
tuned_output

In [None]:
output = baseline_output.join(tuned_output.set_index('model'), on='model')

In [None]:
output.sort_values(by='mean_test_acc_tuned', ascending=False, inplace=True)
output

In [None]:
cv_split = ShuffleSplit(n_splits = 10, test_size = .3, train_size = .7, random_state = 42)
row_index = 0
tuned_output = pd.DataFrame(columns=['model', 'mean_train_acc_tuned', 'mean_test_acc_tuned', 'parameters_tuned'])

for model in [config.MODELS[key] for key in config.MODELS]:
    print(model['model'])
    tuned_output.loc[row_index, 'model'] = model['model'].__class__.__name__
    #if len(config.MODELS[model]['param_grid'])!=0:
    print('randomized search')
    tuned_model = RandomizedSearchCV(model['model'], param_distributions=model['param_grid'], scoring = 'f1', cv = cv_split, return_train_score=True)
    print('fitting model')
    print(tuned_model.fit(scaler.fit_transform(X_train), y_train))
    tuned_model.fit(scaler.fit_transform(X_train), y_train)

    #tuned_output.loc[row_index, 'mean_train_acc_tuned'] = tuned_model.cv_results_['mean_train_score'][tuned_model.best_index_]
    #tuned_output.loc[row_index, 'mean_test_acc_tuned'] = tuned_model.cv_results_['mean_test_score'][tuned_model.best_index_]
    #tuned_output.loc[row_index, 'parameters_tuned'] = [tuned_model.best_params_]
    row_index+=1

In [None]:
tuned_output

In [None]:
for c in np.arange(-4., 6.):
...     lr = LogisticRegression(penalty='l1', C=10.**c,
...                             solver='liblinear',
...                             multi_class='ovr', random_state=0)



In [7]:
for c in np.arange(-4., 6.):
    print(10.**c)

0.0001
0.001
0.01
0.1
1.0
10.0
100.0
1000.0
10000.0
100000.0


In [14]:
list(10**np.arange(-4., 6.))

[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0]

[0, 0, 0, 0, 1, 10, 100, 1000, 10000, 100000]

In [16]:
    'LogisticRegression': {
        'model': LogisticRegression(),
        'param_grid': {
            'penalty': ['l2'],
            'C': [int(val) for val in list(10**np.arange(-4., 6.))],
            'class_weight': [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}],
            'solver': [ 'newton-cg', 'sag', 'lbfgs'],
            'max_iter': [1000],
            'random_state': [RANDOM_STATE]
        }
    }

SyntaxError: illegal target for annotation (<ipython-input-16-1e4123b2391f>, line 1)

In [46]:
 params =   {
            'penalty': ['l2'],
            'C': list(10.**np.arange(-4., 6.)),
            'class_weight': [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}],
            'solver': ['lbfgs'],
            'max_iter': [1000],
            'random_state': [42]
        }

ape = RandomizedSearchCV(LogisticRegression(), 
                         param_distributions=params, 
                         scoring = 'f1', 
                         cv = cv_split, 
                         return_train_score=True)

goat = ape.fit(X_train, y_train)

In [38]:
'newton-cg'

In [21]:
from sklearn.linear_model import LogisticRegression


In [23]:
cv_split = ShuffleSplit(n_splits = 10, test_size = .3, train_size = .7, random_state = 42)

In [34]:
list(10.**np.arange(-4., 6.))

[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0]

In [47]:
goat

RandomizedSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size=0.3, train_size=0.7),
                   estimator=LogisticRegression(),
                   param_distributions={'C': [0.0001, 0.001, 0.01, 0.1, 1.0,
                                              10.0, 100.0, 1000.0, 10000.0,
                                              100000.0],
                                        'class_weight': [{0: 0.5, 1: 0.5},
                                                         {0: 0.6, 1: 0.4},
                                                         {0: 0.4, 1: 0.6},
                                                         {0: 0.3, 1: 0.7}],
                                        'max_iter': [1000], 'penalty': ['l2'],
                                        'random_state': [42],
                                        'solver': ['lbfgs']},
                   return_train_score=True, scoring='f1')