In [1]:
#import all required libraries
#Data Analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as matplot

#model developemnt libraries
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline

# Import RandomForestClassifier and GradientBoostingClassifer
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,BaggingClassifier)
# Function for splitting training and test set
from sklearn.model_selection import train_test_split

# Function for creating model pipelines
from sklearn.pipeline import  make_pipeline

# Pickle for saving model files
import pickle

# For standardization
from sklearn.preprocessing  import StandardScaler

# Helper for cross-validation
from sklearn.model_selection import GridSearchCV

# Classification metrics (added later)
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

import time
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from sklearn.linear_model import LogisticRegression,Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier

In [2]:

data = pd.read_csv("train.csv")
l = data.columns.tolist()
l = [i.strip() for i in l]
data.columns = l

y = data['signal']
X = data.drop(['signal'],axis=1)
X_train, X_test, y_train, y_test  = train_test_split(X, y,
                                                     test_size = 0.25,
                                                     random_state = 1234, 
                                                     stratify = data['signal'])
# Print number of observations in X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test))
print(len(y_train), len(y_test))

159496 53166
159496 53166


In [3]:
# Pipeline dictionary
pipelines = {
#     'l1': make_pipeline(StandardScaler(),
#                         LogisticRegression(penalty='l2',random_state=123)),
    'bag': make_pipeline(StandardScaler(),
                        BaggingClassifier()),
    'gb': make_pipeline(StandardScaler(),
                        GradientBoostingClassifier(random_state=123))
}

In [4]:
pipelines['bag'].get_params()

{'memory': None,
 'steps': [('standardscaler', StandardScaler()),
  ('baggingclassifier', BaggingClassifier())],
 'verbose': False,
 'standardscaler': StandardScaler(),
 'baggingclassifier': BaggingClassifier(),
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True,
 'baggingclassifier__base_estimator': None,
 'baggingclassifier__bootstrap': True,
 'baggingclassifier__bootstrap_features': False,
 'baggingclassifier__max_features': 1.0,
 'baggingclassifier__max_samples': 1.0,
 'baggingclassifier__n_estimators': 10,
 'baggingclassifier__n_jobs': None,
 'baggingclassifier__oob_score': False,
 'baggingclassifier__random_state': None,
 'baggingclassifier__verbose': 0,
 'baggingclassifier__warm_start': False}

In [40]:
pipelines['bag'].get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'baggingclassifier', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'baggingclassifier__base_estimator', 'baggingclassifier__bootstrap', 'baggingclassifier__bootstrap_features', 'baggingclassifier__max_features', 'baggingclassifier__max_samples', 'baggingclassifier__n_estimators', 'baggingclassifier__n_jobs', 'baggingclassifier__oob_score', 'baggingclassifier__random_state', 'baggingclassifier__verbose', 'baggingclassifier__warm_start'])

In [5]:
# Logistic Regression hyperparameters

bag_hyperparameters = {
    'baggingclassifier__max_samples' : [0.05, 0.1, 0.2, 0.5,1.0],
    'baggingclassifier__bootstrap_features': [True,False],
    'baggingclassifier__bootstrap': [True, False],
    'baggingclassifier__max_features': [1, 2, 4],
    'baggingclassifier__base_estimator':[None,
                           DummyClassifier(),
                           Perceptron(tol=1e-3),
                           DecisionTreeClassifier(),
                           SVC(gamma="scale")],
    'baggingclassifier__max_samples' : [0,3,0.6, 0.8, 1.0]
}

# Boosted Tree hyperparameters
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [300,320,350,400],
    'gradientboostingclassifier__max_depth': [5,6,7],
    'gradientboostingclassifier__learning_rate':[0.1,0.05,0.01]
}

# Create hyperparameters dictionary
hyperparameters = {
                  'bag': bag_hyperparameters,
                  'gb': gb_hyperparameters}

In [None]:
# Create empty dictionary called fitted_models
fitted_models = {}
start_time = time.time()

# Loop through model pipelines, tuning each one and saving it to fitted_models
for name, pipeline in pipelines.items():
    # Create cross-validation object from pipeline and hyperparameters
    model = GridSearchCV(pipeline, hyperparameters[name], cv = 2, n_jobs=2,verbose=True)
    
    # Fit model on X_train, y_train
    model.fit(X_train, y_train)
    
    # Store model in fitted_models[name] 
    fitted_models[name] = model
    
    # Print '{name} has been fitted'
    print('{0} has been fitted'.format(name))
print("--- %s seconds ---" % (time.time() - start_time))

Fitting 2 folds for each of 300 candidates, totalling 600 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   46.9s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  2.6min
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  5.6min
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed: 574.7min finished


bag has been fitted
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 98.0min
[Parallel(n_jobs=2)]: Done  72 out of  72 | elapsed: 158.9min finished


In [None]:
# Display best_score_ for each fitted model
for name,model in fitted_models.items():
    print(name, "Score: ", model.best_score_)

In [None]:
for name,model in fitted_models.items():
    with open('model_'+name+'BaggingGB.pkl', 'wb') as f:
        pickle.dump(model.best_estimator_, f)

In [None]:
#Predict PROBABILITIES
pred = fitted_models['MLP'].predict_proba(X_test)

# Get just the prediction for the positive class (1)
pred = [p[1] for p in pred]

In [None]:
# Calculate ROC curve from y_test and pred
fpr, tpr, thresholds = roc_curve(y_test, pred)
# Initialize figure
fig = plt.figure(figsize=(8,8))
plt.title('Receiver Operating Characteristic')

# Plot ROC curve
plt.plot(fpr, tpr, label='l1')

# Diagonal 45 degree line
plt.plot([0,1],[0,1], 'k--')

# Axes limits and labels
plt.xlim(-0.1,1.1)
plt.ylim(-0.1,1.1)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Code here

for name, model in fitted_models.items():
    pred = model.predict_proba(X_test)
    pred = [p[1] for p in pred]
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    print(name, auc(fpr, tpr))