In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
sns.color_palette("muted")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings('ignore')
import pickle as pkl
from collections import defaultdict
#%matplotlib inline

In [22]:
# ---- Global inputs ----

# Number of leading feature-matrix columns withheld from the logistic-regression
# models: those models are fit and scored on X[:, logit_num:], so the columns to
# exclude must be the first `logit_num` columns of the feature matrix.
logit_num = 4
# Keys of `pipelines` / `hyperparameters` / `datapointers` that are actually fit.
model_names = ['l1','l2']

# ---- Model building ----

# Pipeline dictionary: every model is preceded by feature standardisation.
pipelines = {
    # The solver is named explicitly: liblinear supports the L1 penalty, while
    # scikit-learn's current default solver (lbfgs) raises an error for it.
    'l1' : make_pipeline(StandardScaler(), LogisticRegression( penalty = 'l1', solver = 'liblinear', random_state=125)),
    'l2' : make_pipeline(StandardScaler(), LogisticRegression( penalty = 'l2', random_state=125)),
    'rf' : make_pipeline(StandardScaler(), RandomForestClassifier(random_state=125)),
    'gb' : make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=125)),
    # Both SVC entries share the same estimator; the kernel is chosen through
    # the corresponding hyperparameter grid below.
    'linsvc' : make_pipeline(StandardScaler(), SVC(random_state=125,probability=True)),
    'rbfsvc' : make_pipeline(StandardScaler(), SVC(random_state=125,probability=True))
}

# Logistic Regression hyperparameters (linearly spaced inverse regularisation strengths)
l1_hyperparameters = {
    'logisticregression__C' : np.linspace(1e-3, 1e2, 500)
}

l2_hyperparameters = {
    'logisticregression__C' : np.linspace(1e-3, 1e2, 500)
}

# Random Forest hyperparameters
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [100, 300, 500],
    # 'auto' was dropped from this list: for classifiers it was an alias of
    # 'sqrt' (so it only duplicated fits) and recent scikit-learn rejects it.
    'randomforestclassifier__max_features': ['sqrt', 0.33],
    'randomforestclassifier__max_depth': [1, 2, 3, 4, 5]
}

# Boosted Tree hyperparameters
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [100, 300, 500],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.5, 1],
    'gradientboostingclassifier__max_depth': [1, 2, 3, 4, 5]
}

# Support Vector Classifier hyperparameters (linear kernel)
linsvc_hyperparameters = {
    'svc__C' : [1e-5, 1e-3, 1e-1, 1e1],
    'svc__kernel' : ['linear']
}

# Support Vector Classifier hyperparameters (RBF kernel)
rbfsvc_hyperparameters = {
    'svc__C': [1e-5, 1e-3, 1e-1, 1e1],
    'svc__gamma' : [1e-5, 1e-3, 1e-1, 1e1],
    'svc__kernel' : ['rbf']
}

# Create hyperparameters dictionary (same keys as `pipelines`)
hyperparameters = {
    'l1' : l1_hyperparameters, 
    'l2' : l2_hyperparameters,
    'rf' : rf_hyperparameters,
    'gb' : gb_hyperparameters,
    'linsvc' : linsvc_hyperparameters,
    'rbfsvc' : rbfsvc_hyperparameters
}

# Create data pointing dictionary: 'logistic' models see only the columns from
# index `logit_num` onward; every other model sees the full feature matrix.
# Only the value 'logistic' is ever tested for, but the other label is spelled
# identically everywhere so it can be compared against safely.
datapointers = {
    'l1' : 'logistic',
    'l2' : 'logistic',
    'rf' : 'not_logistic',
    'gb' : 'not_logistic',
    'linsvc' : 'not_logistic',
    'rbfsvc' : 'not_logistic'
}

def model_scoring_auc(X_in, y_in, model, datapointer, n_skip=None):
    """Return the area under the ROC curve of a fitted model on (X_in, y_in).

    Parameters
    ----------
    X_in : 2-D array
        Full-width feature matrix (not yet sliced for logistic models).
    y_in : 1-D array
        Binary labels aligned with the rows of ``X_in``.
    model : fitted estimator
        Must expose ``predict_proba``.
    datapointer : str
        ``'logistic'`` means the model was trained without the leading
        columns, so the same columns are dropped before predicting. Any other
        value scores on the full matrix.
    n_skip : int, optional
        Number of leading columns to drop for ``'logistic'`` models. Defaults
        to the module-level ``logit_num`` so existing four-argument calls
        behave as before.

    Returns
    -------
    float
        Area under the ROC curve.
    """
    if datapointer == 'logistic':
        if n_skip is None:
            n_skip = logit_num
        X_in = X_in[:, n_skip:]
    # Keep only the second predict_proba column, i.e. the probability of the
    # positive class (1) when the labels are 0/1.
    positive_proba = np.asarray(model.predict_proba(X_in))[:, 1]
    # Calculate ROC curve; the thresholds are not needed for the area.
    fpr, tpr, _ = roc_curve(y_in, positive_proba)
    # Calculate AUROC
    return auc(fpr, tpr)


def model_fitting(df, y, logit_num, model_names, pipelines, hyperparameters, datapointers, randstate, stratify_col=9):
    """Grid-search each requested model on one train/test split and score it.

    Parameters
    ----------
    df : 2-D array-like
        Feature matrix. It is converted with ``np.asarray`` so that both
        arrays and DataFrames can be sliced positionally.
    y : 1-D array
        Target labels aligned with the rows of ``df``.
    logit_num : int
        Number of leading columns withheld from models whose datapointer is
        ``'logistic'``; used for both fitting and scoring.
    model_names : iterable of str
        Keys of the three dictionaries below selecting the models to fit.
    pipelines : dict
        Model name -> unfitted estimator/pipeline.
    hyperparameters : dict
        Model name -> parameter grid for ``GridSearchCV``.
    datapointers : dict
        Model name -> ``'logistic'`` or any other label.
    randstate : int
        Seed for the train/test split.
    stratify_col : int or None, optional
        Index of the column of the feature matrix used to stratify the split
        (default 9, the value previously hard-coded). ``None`` disables
        stratification.

    Returns
    -------
    (dict, dict)
        ``fitted_models[name]`` is the fitted ``GridSearchCV`` object and
        ``fitted_scores[name]`` is ``[train_auc, test_auc]``.
    """
    # Work on the matrix that was passed in rather than on a notebook global.
    X = np.asarray(df)

    fitted_models = {}
    fitted_scores = {}

    # Hold out 25% of the rows for testing.
    # NOTE(review): the split is stratified on a feature column, not on y;
    # confirm that this is the intended stratification variable.
    strata = None if stratify_col is None else X[:, stratify_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=randstate, stratify=strata)

    # Loop through model pipelines, tuning each one and saving it to fitted_models
    for name in model_names:
        # 10-fold grid search selected on log loss; refit=True retrains the
        # best configuration on the whole training split.
        model = GridSearchCV(pipelines[name], hyperparameters[name], scoring = 'neg_log_loss', cv=10, refit=True)

        # Logistic models never see the first `logit_num` columns.
        if datapointers[name] == 'logistic':
            model.fit(X_train[:,logit_num:], y_train)
        else:
            model.fit(X_train, y_train)
        fitted_models[name] = model

        # Score with the same column offset that was used for fitting.
        train_score = model_scoring_auc(X_train, y_train, model, datapointers[name], n_skip=logit_num)
        test_score = model_scoring_auc(X_test, y_test, model, datapointers[name], n_skip=logit_num)
        fitted_scores[name] = [train_score,test_score]

    return fitted_models, fitted_scores

        


In [None]:

# Load the modelling table, discard the 'Unnamed: 0' column and replace
# missing values with 0.
df = pd.read_csv('../data/fitting_data.csv').drop(['Unnamed: 0'], axis=1).fillna(0)

# 'music' is the target; popping it leaves only the feature columns in df.
y = df.pop('music').values
X = df.values

# Repeat the whole tuning procedure on 100 different train/test splits,
# using the loop index as the split seed.
models_iterate, scores_iterate = {}, {}
for i in range(100):
    fitted, scored = model_fitting(X,y,logit_num,model_names,pipelines,hyperparameters,datapointers,i)
    models_iterate[i] = fitted
    scores_iterate[i] = scored
    # Each line printed is {model name: [train AUROC, test AUROC]}.
    print(scored)

{'l1': [0.7911265232045631, 0.7697101449275363], 'l2': [0.7908024371273009, 0.7671014492753623]}
{'l1': [0.7767009110292692, 0.789906103286385], 'l2': [0.7768462880403177, 0.7910798122065728]}
{'l1': [0.7916420361247948, 0.7770232031692135], 'l2': [0.7910344827586206, 0.7770232031692134]}
{'l1': [0.7925110982435822, 0.7325825825825826], 'l2': [0.7928327864633598, 0.7373873873873874]}
{'l1': [0.792640566160239, 0.7779700115340255], 'l2': [0.7925756395273341, 0.7759515570934256]}
{'l1': [0.8087323124636557, 0.7241784037558685], 'l2': [0.8079892744071849, 0.7262323943661971]}
{'l1': [0.8001385130782115, 0.7331745086360929], 'l2': [0.8001546192500967, 0.7313877307921381]}
{'l1': [0.7691371250486949, 0.8347750865051905], 'l2': [0.7681307622386703, 0.8393886966551327]}
{'l1': [0.8093333333333332, 0.7072330654420207], 'l2': [0.8090081300813008, 0.7152698048220436]}
{'l1': [0.8038861788617886, 0.7223019517795638], 'l2': [0.8025853658536586, 0.7276119402985075]}


In [None]:
# Persist the fitted grid searches and their scores for later analysis.
# (`pkl` is the pickle alias imported in the notebook's first cell.)
for out_path, payload in (
    ('../data/100models.pkl', models_iterate),
    ('../data/100models_scores.pkl', scores_iterate),
):
    with open(out_path, 'wb') as picklefile:
        pkl.dump(payload, picklefile)