In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
# set_palette installs the palette as the default colour cycle; a bare
# color_palette(...) call only returns the colours and has no lasting effect.
sns.set_palette("muted")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings('ignore')
import pickle as pkl
from collections import defaultdict
#%matplotlib inline

In [2]:
# logit_num: number of leading feature-matrix columns withheld from the
# logistic regression models; these must be the first `logit_num` columns of X

# global inputs
logit_num = 6
model_names = ['l2']

# ---------------------------------------------------------------------------
# Model Building
# ---------------------------------------------------------------------------

# Pipeline dictionary: every candidate model is preceded by feature standardisation.
pipelines = {
    # The solver is named explicitly because the default solver of recent
    # scikit-learn releases ('lbfgs') does not support the L1 penalty.
    'l1' : make_pipeline(StandardScaler(), LogisticRegression( penalty = 'l1', solver = 'liblinear', random_state=125)),
    'l2' : make_pipeline(StandardScaler(), LogisticRegression( penalty = 'l2', random_state=125)),
    'rf' : make_pipeline(StandardScaler(), RandomForestClassifier(random_state=125)),
    'gb' : make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=125)),
    # Both SVC entries start from the same estimator; the kernel is chosen by
    # the 'svc__kernel' entry of the matching hyperparameter grid.
    # probability=True is required so that predict_proba is available.
    'linsvc' : make_pipeline(StandardScaler(), SVC(random_state=125,probability=True)),
    'rbfsvc' : make_pipeline(StandardScaler(), SVC(random_state=125,probability=True))
}

# Logistic Regression grids: 500 evenly spaced values of the inverse
# regularisation strength C between 0.01 and 10 (each grid owns its own array).
l1_hyperparameters = dict([
    ('logisticregression__C', np.linspace(0.01, 10.0, num=500)),
])

l2_hyperparameters = dict([
    ('logisticregression__C', np.linspace(0.01, 10.0, num=500)),
])

# Random Forest hyperparameters
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [100, 300, 500],
    # 'auto' is intentionally absent: for RandomForestClassifier it was an alias
    # of 'sqrt' (so it only duplicated a grid candidate) and newer scikit-learn
    # releases no longer accept it.
    'randomforestclassifier__max_features': ['sqrt', 0.33],
    'randomforestclassifier__max_depth': [1, 2, 3, 4, 5]
}

# Gradient-boosted tree grid: ensemble size x learning rate x tree depth.
gb_hyperparameters = dict([
    ('gradientboostingclassifier__n_estimators', [100, 300, 500]),
    ('gradientboostingclassifier__learning_rate', [0.01, 0.1, 0.5, 1]),
    ('gradientboostingclassifier__max_depth', list(range(1, 6))),
])

# Linear-kernel SVC grid: only the penalty parameter C is searched.
linsvc_hyperparameters = dict([
    ('svc__C', [0.00001, 0.001, 0.1, 10.0]),
    ('svc__kernel', ['linear']),
])

# RBF-kernel SVC grid: C crossed with the kernel coefficient gamma.
rbfsvc_hyperparameters = dict([
    ('svc__C', [0.00001, 0.001, 0.1, 10.0]),
    ('svc__gamma', [0.00001, 0.001, 0.1, 10.0]),
    ('svc__kernel', ['rbf']),
])
# Look-up table: model name -> search grid handed to GridSearchCV.
hyperparameters = dict(
    l1=l1_hyperparameters,
    l2=l2_hyperparameters,
    rf=rf_hyperparameters,
    gb=gb_hyperparameters,
    linsvc=linsvc_hyperparameters,
    rbfsvc=rbfsvc_hyperparameters,
)
# Create data pointing dictionary.
# 'logistic' models are fitted and scored on X[:, logit_num:]; every other
# model sees the full feature matrix. The non-logistic tag is spelled the same
# way for all models so that comparisons against it behave uniformly.
datapointers = {
    'l1' : 'logistic',
    'l2' : 'logistic',
    'rf' : 'not_logistic',
    'gb' : 'not_logistic',
    'linsvc' : 'not_logistic',
    'rbfsvc' : 'not_logistic'
}

def model_scoring_auc(X_in, y_in, model, datapointer, n_excluded=None):
    """Return the area under the ROC curve of ``model`` on ``(X_in, y_in)``.

    Parameters
    ----------
    X_in : 2-D array
        Full feature matrix, including the leading columns that logistic
        models do not see.
    y_in : array-like
        True labels, passed to ``roc_curve``.
    model : fitted estimator
        Must expose ``predict_proba``.
    datapointer : str
        When equal to ``'logistic'`` the first ``n_excluded`` columns of
        ``X_in`` are dropped before predicting; any other value uses ``X_in``
        unchanged.
    n_excluded : int, optional
        Number of leading columns to drop for logistic models. Defaults to the
        module-level ``logit_num`` so existing four-argument calls keep their
        behaviour.

    Returns
    -------
    float
        Area under the ROC curve.
    """
    if n_excluded is None:
        n_excluded = logit_num
    features = X_in[:, n_excluded:] if datapointer == 'logistic' else X_in
    # Keep only the second predict_proba column, i.e. the prediction for the
    # positive class (1).
    positive_proba = np.asarray(model.predict_proba(features))[:, 1]
    fpr, tpr, _ = roc_curve(y_in, positive_proba)
    return auc(fpr, tpr)


def model_fitting(X, y, logit_num, model_names, pipelines, hyperparameters, datapointers, randstate,stratcolumn):
    """Tune and fit each requested model on one stratified train/test split.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; for logistic models the first ``logit_num`` columns
        are excluded.
    y : array-like
        Target labels.
    logit_num : int
        Number of leading columns of ``X`` withheld from logistic models.
    model_names : iterable of str
        Keys of the models to fit.
    pipelines, hyperparameters, datapointers : dict
        Per-model pipeline, search grid and data tag, keyed by model name.
    randstate : int
        Seed for the train/test split.
    stratcolumn : array-like
        Values the split is stratified on.

    Returns
    -------
    fitted_models : dict
        Model name -> fitted ``GridSearchCV`` object.
    fitted_scores : dict
        Model name -> ``[train_auc, test_auc]``.
    """
    fitted_models = {}
    fitted_scores = {}

    # Hold out 25% of the rows for testing, stratified on stratcolumn.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=randstate, stratify=stratcolumn)

    # Loop through model pipelines, tuning each one and saving it to fitted_models
    for name in model_names:
        datapointer = datapointers[name]

        # 10-fold grid search selected by log loss; the best model is refit on X_train.
        model = GridSearchCV(pipelines[name], hyperparameters[name], scoring = 'neg_log_loss', cv=10, refit=True)

        if datapointer == 'logistic':
            model.fit(X_train[:,logit_num:], y_train)
        else:
            model.fit(X_train, y_train)
        fitted_models[name] = model

        # Score with the same column offset that was used for fitting, rather
        # than whatever the module-level logit_num happens to be.
        train_score = model_scoring_auc(X_train, y_train, model, datapointer, logit_num)
        test_score = model_scoring_auc(X_test, y_test, model, datapointer, logit_num)
        fitted_scores[name] = [train_score,test_score]

    return fitted_models, fitted_scores

        


In [3]:

# Load the fitting table, discard the 'Unnamed: 0' column and zero-fill gaps.
df = pd.read_csv('../data/fitting_data.csv')
df = df.drop(columns=['Unnamed: 0']).fillna(0)

# Separate the target and the stratification key; what remains is the feature matrix.
y = df.pop('music').values
stratification_columns = df.pop('stratification_column').values
X = df.values

# Repeat the tune/fit/score cycle over 1000 different train/test splits
# (split seeds 12000 .. 12999), reporting progress every tenth split.
models_iterate = {}
scores_iterate = {}
for i in range(1000):
    fitted, scored = model_fitting(
        X, y, logit_num, model_names, pipelines, hyperparameters,
        datapointers, i + 12000, stratification_columns,
    )
    models_iterate[i] = fitted
    scores_iterate[i] = scored
    if not i % 10:
        print('step',i,'model',scores_iterate[i])





step 0 model {'l2': [0.7877501125908769, 0.7927927927927928]}
step 10 model {'l2': [0.7647604426533616, 0.8680379746835443]}
step 20 model {'l2': [0.7855702119485177, 0.7648530331457161]}
step 30 model {'l2': [0.7713283989092327, 0.8292964244521338]}
step 40 model {'l2': [0.7951104411089991, 0.7346938775510204]}
step 50 model {'l2': [0.7972320146233189, 0.7548433048433049]}
step 60 model {'l2': [0.8161529940705023, 0.7046883933676387]}
step 70 model {'l2': [0.7863516299445948, 0.7861822513400833]}
step 80 model {'l2': [0.8065347956652305, 0.7062678062678063]}
step 90 model {'l2': [0.8067280592514912, 0.7015306122448979]}
step 100 model {'l2': [0.781647005444646, 0.7684057971014493]}
step 110 model {'l2': [0.7563741668284476, 0.8206997084548106]}
step 120 model {'l2': [0.7851114547094447, 0.7902025014889816]}
step 130 model {'l2': [0.8130695674496213, 0.6961444308445532]}
step 140 model {'l2': [0.7868860415453309, 0.7596209912536444]}
step 150 model {'l2': [0.7956774193548386, 0.7452718

In [4]:
import pickle as pkl

# Persist the fitted grid searches and their [train, test] AUC pairs.
for out_path, payload in (
    ('../data/pp_1000_models.pkl', models_iterate),
    ('../data/pp_1000_models_scores.pkl', scores_iterate),
):
    with open(out_path, 'wb') as picklefile:
        pkl.dump(payload, picklefile)