In [1]:
#import libs
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import pickle

In [2]:
#open x abd y df's
with open('x_df', 'rb') as x:
    x_df = pickle.load(x)

In [3]:
with open('y_df', 'rb') as y:
    y_df = pickle.load(y)


In [4]:
#split x and y data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(x_df,
                                  y_df,
                                  test_size=0.2,
                                  random_state=1)

In [5]:
#scoring metric to optimize
score = 'roc_auc'

# Model 1 SVM

In [6]:
#setup grid search
grid_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]

#run grid search to find best gamma
clf = GridSearchCV(SVC(probability=True),
                   grid_parameters,
                   cv=3,
                   scoring=score,
                   n_jobs = -1)

#fit SVM
#NOTE: This will take a long time!!!!!!
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=True, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [8]:
#Generate predicted probabilites
clf_probs = clf.predict_proba(X_test)
print('AUC: ', roc_auc_score(y_test, clf_probs[:,1]))
print('Accuracy: ', clf.score(X_test, y_test))

AUC:  0.9896505009024282
Accuracy:  0.9896516700401162


In [9]:
#Pickle model 1
with open('clf', 'wb') as c:
    pickle.dump(clf, c, pickle.HIGHEST_PROTOCOL)

In [7]:
# Model 2: Logistic Regression w/ F

In [10]:
#Setup recursive feature reduction w/ cross validation
clf2 = RFECV(LogisticRegression(),
      scoring = score,
      n_jobs = -1,
      cv = 3,
      step = 5)

In [16]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [17]:
clf2.fit(X_train, y_train)


RFECV(cv=3,
      estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                   fit_intercept=True, intercept_scaling=1,
                                   l1_ratio=None, max_iter=100,
                                   multi_class='warn', n_jobs=None,
                                   penalty='l2', random_state=None,
                                   solver='warn', tol=0.0001, verbose=0,
                                   warm_start=False),
      min_features_to_select=1, n_jobs=-1, scoring='roc_auc', step=5,
      verbose=0)

In [12]:
#Generate predicted probabilites
clf2_probs = clf2.predict_proba(X_test)
print('AUC: ', roc_auc_score(y_test, clf2_probs[:,1]))
print('Accuracy: ', clf2.score(X_test, y_test))

AUC:  0.906445017646672
Accuracy:  0.890875


In [14]:
#Pickle model 2
with open('clf2', 'wb') as c:
    pickle.dump(clf2, c, pickle.HIGHEST_PROTOCOL)

In [15]:
clf2_probs[:,1]

array([0.09011894, 0.06408473, 0.00887336, ..., 0.02788517, 0.00452491,
       0.16623803])