# Steps for all of us

Choose a dataset that you want to use. 

You may do whatever steps you think necessary for building the best classifier.

Take the data you chose and do whatever massaging you think is necessary: standardizing, scaling, feature engineering/ transforming, feature selection, etc.  

Build a classifier however you see fit. You may want build one and tweak the paramters manually or use some sort of grid search to look through all possible parameters. 

Remember: The same model built on the massaged data may perform better than if the data was untouched. It may be more conveniant to chose a standared massaging pipeline and tweak a model to that data.

After you massage your data, follow these steps:

if you want to balance your target (which you should) follow along these lines:

### build your features data and target data

- X = df.drop(columns = "whatever your target name is")
- y = df"whatever your target name is"

### split the data into training and testing

- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2019)

### create the oversampled data to train on 

- oversampler = SMOTE(random_state = 2019)
- X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

### Put the oversampled data back into a dataframe

- X_train_oversampled = pd.DataFrame(X_train_oversampled, columns = X_train.columns)
- y_train_oversampled = pd.Series(y_train_oversampled)

### Build your classifier here. As an example:

- xgb_clf = xgb.XGBClassifier(max_depth=5, n_estimators=100, colsample_bytree=0.3, learning_rate=0.1, n_jobs=-1)

 
### Fit to the oversampled data; this will train the classifier on the oversampled data

- xgb_clf.fit(X_train_oversampled, y_train_oversampled)

### Use 5-fold cross validation to see how well the classfier you built is doing on test data. 
Some points: you have to substitute your classifer name in the cross_val_score function 

- kfold = KFold(n_splits=5, random_state=2019)
- results = cross_val_score(xgb_clf, X_test, y_test, cv=kfold, scoring = 'f1')


## It may be best to keep all of your models you built; have a log of them to see their scores and keep a record of your process of building your data. 



In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from Modules import *
sns.set()
%matplotlib inline
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV,GridSearchCV
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestClassifier
from time import time
from sklearn.externals import joblib
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import RFE, SelectKBest
from sklearn.decomposition import PCA


### read in the full sequential data

In [2]:
#read in the data; data formated weird with extra column, 
#use only data we need
data = pd.read_csv('Data/Final_trimmed_sequential_data.csv')
df = data.iloc[:, 1:]
y = df['Y']

df.head().T

Unnamed: 0,0,1,2,3,4
AGE,24.0,26.0,34.0,37.0,57.0
Y,1.0,1.0,0.0,0.0,0.0
SEX_Female,1.0,1.0,1.0,1.0,0.0
SEX_Male,0.0,0.0,0.0,0.0,1.0
EDUCATION_Graduate School,0.0,0.0,0.0,0.0,0.0
EDUCATION_Other,0.0,0.0,0.0,0.0,0.0
EDUCATION_University,1.0,1.0,1.0,1.0,1.0
MARRIAGE_Married,1.0,0.0,0.0,1.0,1.0
MARRIAGE_Non-married,0.0,1.0,1.0,0.0,0.0
PERCENT_OF_LIMIT_BAL1,0.19565,0.02235,0.308011,0.8998,0.13234


In [5]:
#make pipeline



#split into training and testing
Train_data, test_data = train_test_split(df, test_size = 0.2, random_state = 2019)

#set the target and predictor labels
target = 'Y'
predictors = [x for x in df.columns if x not in [target] ]

#build the classifier
clf = LogisticRegression()

#set up the imbalanced data handling procedure
oversampler = SMOTE(random_state = 2019)

#creat the pipeline
pipeline = Pipeline([('smote', oversampler),('clf', clf)])

#cross validate results
kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(pipeline, df[predictors], df[target], cv=kfold, scoring = 'accuracy')
print(f"5-fold cross-validation results: {np.mean(results)}")

5-fold cross-validation results: 0.7822333333333333


In [6]:
#make helper function

def evaluate_clf(data, model, imbalance='SMOTE', params=None, show_accuracy=True, show_auc=True, show_f1=True):
    

    #split into training and testing
    Train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 2019)

    #set the target and predictor labels
    target = 'Y'
    predictors = [x for x in data.columns if x not in [target] ]

    #build the classifier
    clf = model

    #set up the imbalanced data handling procedure
    if imbalance=='SMOTE':
        oversampler = SMOTE(random_state = 2019)
    else:
        oversampler = RandomOverSampler(random_state=2019)
    #creat the pipeline
    pipeline = Pipeline([('smote', oversampler),('clf', clf)])
    pipeline.fit(Train_data[predictors], Train_data[target])
    y_pred = pipeline.predict(test_data[predictors])
    
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format(accuracy_score(test_data[target],y_pred)),"\n")
        
    if show_auc:
        print ("AUC:{0:.3f}".format(roc_auc_score(test_data[target],y_pred)),"\n")
        
    if show_f1:
        print ("F1:{0:.3f}".format(f1_score(test_data[target],y_pred)),"\n")
    
    print(classification_report(test_data[target], y_pred))

evaluate_clf(df, LogisticRegression())

Accuracy:0.782 

AUC:0.706 

F1:0.530 

              precision    recall  f1-score   support

           0       0.88      0.84      0.86      4710
           1       0.49      0.57      0.53      1290

   micro avg       0.78      0.78      0.78      6000
   macro avg       0.69      0.71      0.69      6000
weighted avg       0.79      0.78      0.79      6000



In [3]:
#helper function for pipeline
def evaluate_pipeline(data, pipeline, params=None, show_accuracy=True, show_auc=True, show_f1=True):
    

    #split into training and testing
    Train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 2019)

    #set the target and predictor labels
    target = 'Y'
    predictors = [x for x in data.columns if x not in [target] ]
    
    #fit the pipeline
    pipe = pipeline
    pipe.fit(Train_data[predictors], Train_data[target])
    y_pred = pipeline.predict(test_data[predictors])
    
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format(accuracy_score(test_data[target],y_pred)),"\n")
        
    if show_auc:
        print ("AUC:{0:.3f}".format(roc_auc_score(test_data[target],y_pred)),"\n")
        
    if show_f1:
        print ("F1:{0:.3f}".format(f1_score(test_data[target],y_pred)),"\n")
    
    print(classification_report(test_data[target], y_pred))

In [7]:
#create RandomSearchCV helper function
def evaluate_random_grid(data, model, grid_params, score, n_iter=10,
                         cv = 5):

    #split into training and testing
    Train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 2019)
    
    #set the target and predictor labels
    target = 'Y'
    predictors = [x for x in data.columns if x not in [target] ]

    #build the classifier
    grid = RandomizedSearchCV(estimator = model, param_distributions = grid_params, n_iter=n_iter, 
                            cv = cv, n_jobs=-1, scoring = score)
    
    #fit the random search
    start = time()
    cv_results = grid.fit(Train_data[predictors], Train_data[target])
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter))
    print()
    
    print(f"RandomizedSearchCV grid model {model} with parameters {cv_results.best_params_} had a best score of {cv_results.best_score_}")
    



In [8]:
#create GridSearchCV helper
def evaluate_gridsearch(data, model, grid_params, score, imbalance='SMOTE',
                         cv = 5):

    #split into training and testing
    Train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 2019)
    
    #set the target and predictor labels
    target = 'Y'
    predictors = [x for x in data.columns if x not in [target] ]

    #build the classifier
    grid = GridSearchCV(estimator = model, param_grid = grid_params, 
                            cv = cv, n_jobs=-1, scoring = score)
    
    #apply class balancing
    data_oversampled_X, data_oversampled_y = oversampler.fit_resample(data[predictors], data[target])
    
    #fit the random search
    start = time()
    cv_results = grid.fit(Train_data[predictors], Train_data[target])
    print("GridSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter))
    print()
    
    print(f"GridSearchCV grid model {model} with parameters {cv_results.best_params_}  had a best score of {cv_results.best_score_}")
    
    



In [9]:
# Test procudure
clf = LogisticRegression()
param_grid = {
    'C': [1,5],
    'penalty': ['l1', 'l2']
          }

pipe = Pipeline([
    ('smote', SMOTE(random_state=2019)),
    ('clf', clf)])

evaluate_random_grid(df, pipe, grid_params=param_grid, n_iter=4, score = 'accuracy')

ValueError: Invalid parameter penalty for estimator Pipeline(memory=None,
     steps=[('smote', SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
   out_step='deprecated', random_state=2019, ratio=None,
   sampling_strategy='auto', svm_estimator='deprecated')), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [14]:
evaluate_clf(df, LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))

Accuracy:0.782 

AUC:0.705 

F1:0.529 

              precision    recall  f1-score   support

           0       0.88      0.84      0.86      4710
           1       0.49      0.57      0.53      1290

   micro avg       0.78      0.78      0.78      6000
   macro avg       0.69      0.70      0.69      6000
weighted avg       0.79      0.78      0.79      6000



In [9]:
#testing a pipeline object in the helper functions
clf = LogisticRegression()
pipe = Pipeline([
    ('smote', SMOTE(random_state=2019)),
    ('scaler', StandardScaler()),
    ('clf', clf)])

evaluate_pipeline(data=df, pipeline=pipe)

Accuracy:0.781 

AUC:0.705 

F1:0.529 

              precision    recall  f1-score   support

           0       0.88      0.84      0.86      4710
           1       0.49      0.57      0.53      1290

   micro avg       0.78      0.78      0.78      6000
   macro avg       0.69      0.70      0.69      6000
weighted avg       0.79      0.78      0.79      6000



  Xt = transform.transform(Xt)


In [11]:
#testing same code but with a random search
clf = LogisticRegression()
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', clf)])

param_grid = {
    'clf__C': [0.5,1, 1.5],
    'clf__penalty': ['l1', 'l2']
             }
evaluate_random_grid(df, model=pipe, cv=5, grid_params=param_grid,score='accuracy')



RandomizedSearchCV took 28.00 seconds for 10 candidates parameter settings.

RandomizedSearchCV grid model Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]) with parameters {'clf__penalty': 'l1', 'clf__C': 0.5} had a best score of 0.6576356788221195


In [None]:
from sklearn.svm import SVC
# This dataset is way to high-dimensional. Better do PCA:
pca = PCA(n_components=3)

# Maybe some original features where good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and Univariate selection:
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Model:
svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:
pipeline = Pipeline([("features", combined_features), ("svm", svm)])
param_grid = dict(features__pca__n_components=[1, 2, 3],
                  features__univ_select__k=[1, 2],
                  svm__C=[0.1, 1, 10])

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, n_jobs=-1 )
grid_search.fit(df[predictors], y)
print(grid_search.best_estimator_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [13]:
# attempting with a random forest

clfRF = RandomForestClassifier()
evaluate_clf(data=df, model=clfRF)

Accuracy:0.801 

AUC:0.642 

F1:0.440 

              precision    recall  f1-score   support

           0       0.84      0.92      0.88      4710
           1       0.56      0.36      0.44      1290

   micro avg       0.80      0.80      0.80      6000
   macro avg       0.70      0.64      0.66      6000
weighted avg       0.78      0.80      0.78      6000



In [19]:
#testing the seperate fit to training, and predicting

clf4 = LogisticRegression()

pipeline4 = Pipeline([('smote', oversampler), ('clf', clf4)])
pipeline4.fit(Train_data[predictors], Train_data[target])

y_pred = pipeline4.predict(test_data[predictors])

print(f"accuracy score: {accuracy_score(test_data[target], y_pred)}")

accuracy score: 0.6963333333333334


In [20]:
#make pipeline with randomforest

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

#split into training and testing
Train_data, test_data = train_test_split(df, test_size = 0.2, random_state = 2019)

target = 'Y'
predictors = [x for x in Train_data.columns if x not in [target] ]

clfRF = RandomForestClassifier()

oversampler = SMOTE(random_state = 2019)
pipeline = Pipeline([('smote', oversampler),('clf', clfRF)])

#cross validate results
kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(pipeline, Train_data[predictors], Train_data[target], cv=kfold, scoring = 'accuracy')
print(f"5-fold cross-validation results: {np.mean(results)}")

5-fold cross-validation results: 0.7969999999999999


In [12]:
#compare the XGBoost models

from sklearn.externals import joblib 
from sklearn.naive_bayes import GaussianNB
model = joblib.load('Models/xgboost3.dat')
#pipeline5 =  Pipeline([('smote', oversampler),('clf', model)])
kfold = KFold(n_splits=5, random_state=2019)
results = cross_val_score(model, test_data[predictors], test_data[target], cv=kfold, scoring = 'accuracy')
print(f"5-fold cross-validation results: {np.mean(results)}")




5-fold cross-validation results: 0.7866666666666667
