In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn import ensemble, naive_bayes, tree
from sklearn.linear_model import LogisticRegression
import ast
from sklearn.linear_model import LogisticRegression
import joblib



from config import config

In [2]:

# Read the spreadsheet written to ./output/reduced_data.xlsx.
reduced_data = pd.read_excel('./output/reduced_data.xlsx')

# Separate the column named by config.TARGET from the remaining columns, then
# hold out 20% of the rows, stratified on that column.
features = reduced_data.drop(columns=config.TARGET)
target = reduced_data[config.TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    stratify=target,
    random_state=config.RANDOM_STATE,
)

In [38]:
# Internal modules
from config import config

def BaseLineModels(models, X, y, random_state=42):
    """Score every candidate model as configured and after a randomized search.

    Each estimator is evaluated twice on the same ``ShuffleSplit`` folds
    (10 splits, 70% train / 30% test): once with its current parameters via
    ``cross_validate`` and once through ``RandomizedSearchCV`` over its
    ``param_grid``.

    Parameters
    ----------
    models : mapping
        ``{key: {'model': estimator, 'param_grid': distributions}}``. The
        ``param_grid`` value is handed to ``RandomizedSearchCV`` as
        ``param_distributions``.
    X, y
        Training data, passed unchanged to scikit-learn.
    random_state : int, default 42
        Seed shared by the fold generator and the randomized search so that
        repeated runs report the same scores and the same best parameters.

    Returns
    -------
    output : pandas.DataFrame
        One row per entry in ``models``, sorted by ``mean_test_acc_tuned``
        (descending); the index is the entry's position in ``models``.
        The ``*_acc`` columns hold F1 scores (``scoring='f1'``); the column
        names are kept for compatibility with existing consumers.
        ``parameters`` / ``parameters_tuned`` cells are one-element lists
        wrapping the parameter dict.
    best_model : estimator
        The estimator object taken from ``models`` for the top row. It is
        updated in place with ``best_parameters`` and then fitted on
        ``X, y`` so its fitted state matches the tuned parameters.
    best_parameters : dict
        Best parameter setting found by the search for ``best_model``.
    """
    scoring = 'f1'
    cv_split = ShuffleSplit(n_splits=10, test_size=.3, train_size=.7, random_state=random_state)
    columns = [
        'model', 'mean_train_acc', 'mean_test_acc', 'parameters',
        'mean_train_acc_tuned', 'mean_test_acc_tuned', 'parameters_tuned',
    ]

    # Remember the mapping key of each row so the winner can be looked up by key
    # rather than by class name (keys need not equal class names, and two entries
    # may share a class).
    keys = []
    records = []
    for key in models:
        spec = models[key]
        estimator = spec['model']

        # cross_validate and RandomizedSearchCV both work on clones, so the
        # estimator itself does not need to be fitted to report its parameters.
        baseline = cross_validate(estimator, X, y, cv=cv_split, return_train_score=True, scoring=scoring)

        search = RandomizedSearchCV(
            estimator,
            param_distributions=spec['param_grid'],
            scoring=scoring,
            cv=cv_split,
            return_train_score=True,
            random_state=random_state,
        )
        search.fit(X, y)
        best_index = search.best_index_

        keys.append(key)
        records.append({
            'model': estimator.__class__.__name__,
            'mean_train_acc': baseline['train_score'].mean(),
            'mean_test_acc': baseline['test_score'].mean(),
            'parameters': [estimator.get_params()],
            'mean_train_acc_tuned': search.cv_results_['mean_train_score'][best_index],
            'mean_test_acc_tuned': search.cv_results_['mean_test_score'][best_index],
            'parameters_tuned': [search.best_params_],
        })

    # Build the frame once instead of growing it cell by cell.
    output = pd.DataFrame(records, columns=columns).sort_values(by='mean_test_acc_tuned', ascending=False)

    # The index label of the top row is that entry's position in `models`.
    best_key = keys[output.index[0]]
    best_parameters = output.iloc[0]['parameters_tuned'][0]
    best_model = models[best_key]['model'].set_params(**best_parameters)
    # Refit so the returned estimator's learned state matches the tuned parameters.
    best_model.fit(X, y)

    return output, best_model, best_parameters

In [39]:
# Standardize the training features, then run the baseline-vs-tuned comparison on them.
# NOTE(review): the scaler is fitted on all of X_train before BaseLineModels splits it
# into folds, so each validation fold shares scaling statistics with its training fold.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
output, model, params = BaseLineModels(config.MODELS, X_train_scaled, y_train)



In [40]:
# Display the comparison table returned by BaseLineModels.
output

Unnamed: 0,model,mean_train_acc,mean_test_acc,parameters,mean_train_acc_tuned,mean_test_acc_tuned,parameters_tuned
1,LogisticRegression,0.666439,0.667745,"[{'C': 100000.0, 'class_weight': {1: 0.7, 0: 0...",0.666439,0.667745,"[{'solver': 'lbfgs', 'random_state': 42, 'pena..."
3,GaussianNB,0.636489,0.634479,"[{'priors': None, 'var_smoothing': 1e-09}]",0.636489,0.634479,[{}]
2,BernoulliNB,0.621782,0.631832,"[{'alpha': 1.0, 'binarize': 0.0, 'class_prior'...",0.621782,0.631832,[{}]
5,AdaBoostClassifier,0.669815,0.630257,"[{'algorithm': 'SAMME.R', 'base_estimator': No...",0.669815,0.630257,[{'base_estimator': None}]
6,RandomForestClassifier,0.998153,0.622005,"[{'bootstrap': True, 'ccp_alpha': 0.0, 'class_...",0.918611,0.623807,"[{'random_state': 42, 'n_estimators': 35, 'min..."
4,DecisionTreeClassifier,0.99815,0.550138,"[{'ccp_alpha': 0.0, 'class_weight': None, 'cri...",0.635293,0.610777,"[{'splitter': 'best', 'random_state': 42, 'max..."
0,KNeighborsClassifier,0.699501,0.570849,"[{'algorithm': 'auto', 'leaf_size': 30, 'metri...",0.99815,0.580118,"[{'weights': 'distance', 'p': 1, 'n_neighbors'..."


In [13]:


# Rebuild the model described by the first row of the saved performance sheet.
df = pd.read_excel('model_performances.xlsx')
top_row = df.iloc[0]

best_model = config.MODELS[top_row['model']]['model']
# The 'parameters_tuned' cell comes back from the sheet as text; literal_eval
# turns it into Python objects (indexed with [0] below to get the dict).
best_parameters = ast.literal_eval(top_row['parameters_tuned'])

best_model = best_model.set_params(**best_parameters[0])

In [16]:
# First row's 'parameters_tuned' text with any leading/trailing square brackets removed.
df.iloc[0]['parameters_tuned'].strip('[]')

"{'solver': 'lbfgs', 'random_state': 42, 'penalty': 'l2', 'max_iter': 1000, 'class_weight': {1: 0.7, 0: 0.3}, 'C': 100000.0}"

In [None]:
# Parse the first row's tuned parameters after removing the surrounding brackets.
# Unlike the cell that parses the unstripped text, this leaves best_parameters
# holding the parsed value directly rather than a one-element list.
stripped_parameters = df.iloc[0]['parameters_tuned'].strip('[]')
best_parameters = ast.literal_eval(stripped_parameters)
best_parameters

In [None]:
# Show the first row's 'parameters_tuned' text without its outer square brackets.
df.iloc[0]['parameters_tuned'].strip('[]')

In [None]:
# Serialize best_model with joblib to 'model_output.pkl' (path relative to the working directory).
joblib.dump(best_model, 'model_output.pkl')

In [None]:
# Parse the first row's 'parameters_tuned' text as-is (brackets included) and show the result.
tuned_cell = df.iloc[0]['parameters_tuned']
best_parameters = ast.literal_eval(tuned_cell)
best_parameters

In [None]:
# Load the object stored in 'model_output.pkl' (the file name the joblib.dump cell writes to).
ape_lord = joblib.load('model_output.pkl')

In [None]:
# Display the repr of the object loaded from 'model_output.pkl'.
ape_lord

In [None]:
def save_model(*, model_to_save, file_name='model_output.pkl'):
    """Write ``model_to_save`` to disk with joblib and return the path used.

    Parameters
    ----------
    model_to_save
        Object to serialize (keyword-only).
    file_name : str, default 'model_output.pkl'
        Destination path. The default is the file name the notebook's other
        ``joblib.dump`` / ``joblib.load`` cells use.

    Returns
    -------
    str
        The path the model was written to.
    """
    save_model_name = file_name
    joblib.dump(model_to_save, save_model_name)
    return save_model_name

In [None]:
# NOTE(review): neither `X` nor `self` is assigned at top level in the visible cells, so this
# line looks pasted from a class method and would raise NameError here — confirm or remove.
X.to_excel(f"{self.output_folder}/reduced_data.xlsx", index=False)

In [None]:
# Inspect the configured output directory (a dangling `/` operator is a SyntaxError).
config.OUTPUT_DIR

In [None]:
# Inspect the configured value of PERFORMANCE_OUTPUT_FILE.
config.PERFORMANCE_OUTPUT_FILE

In [None]:
# Save best_model inside the configured output directory. The file name matches the one
# used by the notebook's other joblib.dump / joblib.load cells.
joblib.dump(best_model, f"{config.OUTPUT_DIR}/model_output.pkl")

In [None]:
# Estimator registered in config.MODELS under the model name found in df's first row.
config.MODELS[df.iloc[0]['model']]['model']

In [None]:
# Look up the config.MODELS entry for the model named in df's first row. The key must be
# the name string: a whole row (a pandas Series) is unhashable and cannot be used as a key.
config.MODELS[df.iloc[0]['model']]

In [None]:
# Value of the 'model' column in the first row of df.
df.iloc[0]['model']

In [None]:
# Display the table loaded from 'model_performances.xlsx'.
df

In [None]:
# Write df to './output/goat.xlsx', omitting the index column.
df.to_excel('./output/goat.xlsx', index=False)