In [3]:
#import all required libraries
#Data Analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as matplot

#model development libraries
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline

# Import RandomForestClassifier and GradientBoostingClassifier
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
# Function for splitting training and test set
from sklearn.model_selection import train_test_split

# Function for creating model pipelines
from sklearn.pipeline import  make_pipeline

# Pickle for saving model files
import pickle

# For standardization
from sklearn.preprocessing  import StandardScaler

# Helper for cross-validation
from sklearn.model_selection import GridSearchCV

# Classification metrics (added later)
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

import time
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
import seaborn as sns

In [5]:
# Read the training set from disk.
data = pd.read_csv("train.csv")
# Strip surrounding whitespace from every column label so columns can be
# addressed by their clean names.
l = [column_name.strip() for column_name in data.columns.tolist()]
data.columns = l
# Preview two feature columns next to the `signal` column.
data.loc[:, ['Kst_892_0_cosThetaH', "B_IPCHI2_OWNPV", "signal"]]

Unnamed: 0,Kst_892_0_cosThetaH,B_IPCHI2_OWNPV,signal
0,-0.575502,2.662533,1.0
1,-0.615941,0.092746,1.0
2,0.249383,2.442423,1.0
3,0.591884,6.337556,0.0
4,0.655850,7.632751,0.0
...,...,...,...
212657,-0.397384,0.422886,1.0
212658,0.522046,6.838541,0.0
212659,-0.176330,2.256855,1.0
212660,0.948246,3.385165,0.0


In [7]:
# Feature matrix: every column except `signal`.
X = data.drop(columns=['signal'])
# Target vector: the `signal` column on its own.
y = data['signal']

In [8]:
# Hold out 25% of the rows for testing. The split is seeded for repeatability
# and stratified on the target (y is data['signal']) so both partitions keep
# the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1234,
    stratify=y,
)

# Report the number of observations in each partition (features, then targets)
print(len(X_train), len(X_test))
print(len(y_train), len(y_test))

159496 53166
159496 53166


In [9]:
# Model pipelines keyed by a short name. Every pipeline standardises the
# features first and then fits one classifier; each estimator is seeded.
pipelines = {
    key: make_pipeline(StandardScaler(), estimator)
    for key, estimator in (
        ('l2', LogisticRegression(penalty='l2', random_state=123)),
        ('rf', RandomForestClassifier(random_state=123)),
        ('gb', GradientBoostingClassifier(random_state=123)),
    )
}

In [None]:
# Hyperparameter grids for the grid search. Keys follow the
# "<pipeline step name>__<parameter>" convention, where the step name is the
# one make_pipeline derives from the estimator class (lower-cased class name).

# Logistic Regression hyperparameters
# NOTE(review): np.linspace spaces C linearly between 1e-3 and 1e3, so only the
# first candidate is below ~111; if a log-spaced sweep was intended, use
# np.logspace(-3, 3, 10) instead -- confirm before changing.
l2_hyperparameters = {
    'logisticregression__C': np.linspace(1e-3, 1e3, 10)
}

# Random Forest hyperparameters
# 'auto' was dropped from max_features: for classifiers it was an alias of
# 'sqrt' (a duplicate candidate) and newer scikit-learn releases reject it.
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [100, 200, 300],
    'randomforestclassifier__max_features': ['sqrt', 0.33]
}

# Boosted Tree hyperparameters
# The learning_rate key used to appear twice in this literal; Python keeps only
# the last value for a repeated key, so the grid that was actually searched is
# the one retained here.
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [100, 200, 250, 300],
    'gradientboostingclassifier__learning_rate': [0.1, 0.05, 0.01],
    'gradientboostingclassifier__max_depth': [1, 3, 5]
}

# Create hyperparameters dictionary (same keys as the `pipelines` dictionary)
hyperparameters = {
    'l2': l2_hyperparameters,
    'rf': rf_hyperparameters,
    'gb': gb_hyperparameters
}

In [14]:
# Tune every pipeline with a cross-validated grid search and keep the fitted
# search objects in fitted_models, keyed by pipeline name.
# Requires `pipelines`, `hyperparameters`, `X_train` and `y_train` to have been
# defined by earlier cells (run the hyperparameter cell first).
fitted_models = {}
start_time = time.time()

# Loop through model pipelines, tuning each one and saving it to fitted_models
for name, pipeline in pipelines.items():
    # Create cross-validation object from pipeline and hyperparameters
    # (10 folds; n_jobs=-1 asks for all available cores)
    model = GridSearchCV(pipeline, hyperparameters[name], cv = 10, n_jobs=-1)

    # Fit model on X_train, y_train
    model.fit(X_train, y_train)

    # Store model in fitted_models[name]
    fitted_models[name] = model

    # Print '{name} has been fitted'
    print('{0} has been fitted'.format(name))

# Subtract first, then convert seconds to hours. Without the parentheses only
# start_time was divided by 3600, so the printed figure was not an elapsed time.
print("--- %s hours ---" % ((time.time() - start_time) / 3600))

NameError: name 'hyperparameters' is not defined

In [None]:
# Report the best cross-validated score found by each fitted grid search
for label, search in fitted_models.items():
    print(label, "Score: ", search.best_score_)

In [15]:
# Persist the best estimator of every fitted search to its own pickle file
for label, search in fitted_models.items():
    target_path = f'model_{label}_finalNoButter.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(search.best_estimator_, handle)

In [None]:
# Predict class probabilities on the test set with the 'gb' model and keep
# only the second column of each row, i.e. the positive class (1)
pred = [row[1] for row in fitted_models['gb'].predict_proba(X_test)]

In [None]:
# Plot the ROC curve for the probabilities currently held in `pred`.

# Calculate ROC curve from y_test and pred
fpr, tpr, thresholds = roc_curve(y_test, pred)


# Initialize figure
fig = plt.figure(figsize=(8,8))
plt.title('Receiver Operating Characteristic')

# Plot ROC curve. `pred` is produced from fitted_models['gb'], so the curve is
# labelled 'gb' (it was previously mislabelled 'l1').
plt.plot(fpr, tpr, label='gb')

# Diagonal 45 degree line (reference line with equal TPR and FPR)
plt.plot([0,1],[0,1], 'k--')

# Axes limits and labels
plt.xlim(-0.1,1.1)
plt.ylim(-0.1,1.1)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

# Draw the legend; without it the curve label is never shown
plt.legend(loc='lower right')
plt.show()

In [None]:
# Print the test-set area under the ROC curve for every fitted model
for label, search in fitted_models.items():
    # Second column of predict_proba = probability of the positive class
    pred = [row[1] for row in search.predict_proba(X_test)]
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    print(label, auc(fpr, tpr))

In [1]:
# Raw text holding whitespace/newline-separated numeric values.
# A string literal in single double-quotes cannot span several lines (that is a
# SyntaxError), so it is written as a triple-quoted string with the same content.
a = """1706851.1147917302 2124563.9233724214 2984126.1997318044 4505573.86514001
6442603.727979784
11714672.725343287
19634497.16478676
16341417.657488761
30247932.58866336
298681354.46312535
110956882751.84926
44141803924591.74
2.1401634802033456e+16
1.3801414674407589e+17
1.1281761215651294e+17
9.366791393012419e+18
6.419436039641725e+17
2.0799238648167085e+17
2.1778509510788995e+18
3.697150985106024e+19
3.945398724862883e+18
2.2408206353390477e+19
2.7076871262948954e+19
2.0436189549296864e+20
7.401114113469545e+19
1.1885615281093183e+21"""

1.1885615281093183e+21