In [1]:
import warnings
warnings.filterwarnings("ignore", message="is_sparse is deprecated and will be removed in a future version.")
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.utils.validation")
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import json
import math
from sklearn.ensemble import RandomForestClassifier as rf
import sklearn.linear_model as lm
from sklearn.tree import DecisionTreeClassifier as tree
from sklearn.neighbors import KNeighborsClassifier as knn
from xgboost import XGBClassifier as xgb
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB as gnb
import sklearn.model_selection as ms
import sklearn.metrics as sm
import pickle

%autosave 5

Autosaving every 5 seconds


In [2]:
def retrieveModelsBasedOnModelType(modelType):
    """Return a new, unfitted classifier for the given ``modelType`` key.

    Parameters
    ----------
    modelType : str
        One of 'log', 'naiveBayes', 'tree', 'forest', 'knn', 'xgboost', 'svm'.

    Returns
    -------
    estimator
        A freshly constructed estimator carrying the fixed constructor
        arguments listed in the table below.

    Raises
    ------
    ValueError
        If ``modelType`` is not one of the supported keys. ``ValueError`` is a
        subclass of ``Exception``, so callers that catch ``Exception`` still work.
    """
    # Each entry is a zero-argument factory so that only the requested
    # estimator is ever constructed.
    builders = {
        'log': lambda: lm.LogisticRegression(random_state=51,penalty='l2',C=0.001),
        'naiveBayes': lambda: gnb(),
        'tree': lambda: tree(random_state=51),
        'forest': lambda: rf(random_state=51, oob_score=True),
        'knn': lambda: knn(),
        'xgboost': lambda: xgb(random_state=51,reg_alpha=1000,reg_lambda=1000),
        'svm': lambda: SVC(random_state=51),
    }
    # Non-string (possibly unhashable) keys are rejected up front so they get
    # the same error as any other unsupported value.
    builder = builders.get(modelType) if isinstance(modelType, str) else None
    if builder is None:
        raise ValueError("modelType Value not considered. Please choose from ['log','naiveBayes','tree','forest','knn','xgboost','svm']")
    return builder()

In [3]:
def fitModelWithGridSearch(searchParams,XTrain,yTrain,modelType):
    """Run a cross-validated grid search for one model type and return it fitted.

    Parameters
    ----------
    searchParams : dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    XTrain, yTrain
        Training features and labels handed to ``GridSearchCV.fit``.
    modelType : str
        Key understood by ``retrieveModelsBasedOnModelType``.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    baseEstimator = retrieveModelsBasedOnModelType(modelType)
    # Five shuffled stratified folds with a fixed seed.
    folds = ms.StratifiedKFold(n_splits=5,random_state=51, shuffle=True)
    search = ms.GridSearchCV(
        baseEstimator,
        param_grid=searchParams,
        scoring='f1',
        cv=folds,
        n_jobs=-1,
    )
    search.fit(XTrain,yTrain)
    return search

In [4]:
def processPredictData(predictFileName):
    """Scale and encode an external CSV and write the result next to it.

    Reads ``../Data/External/{predictFileName}.csv``, passes the frame through
    ``scaleTestData`` and then ``transformDF``, and writes the outcome to
    ``../Data/External/{predictFileName}Final.csv`` without the index column.
    Returns ``None``.
    """
    # NOTE(review): scaleTestData and transformDF are neither defined nor
    # imported in the visible part of this notebook — confirm they are
    # available before calling this function.
    rawFrame = pd.read_csv(f"../Data/External/{predictFileName}.csv")
    finalFrame = transformDF(scaleTestData(rawFrame))
    outputPath = f'../Data/External/{predictFileName}Final.csv'
    finalFrame.to_csv(outputPath,index=False)

In [5]:
def loadData(dataType,baseName):
    """Load one processed split and separate the ``y`` column from the rest.

    Parameters
    ----------
    dataType : str
        ``"train"`` reads ``{baseName}Train.csv``; any other value reads
        ``{baseName}Test.csv``.
    baseName : str
        File-name prefix under ``../Data/Processed/``.

    Returns
    -------
    (X, y)
        ``X`` is the DataFrame without the ``y`` column; ``y`` is that column
        flattened to a 1-D array.
    """
    csvPath = (
        f"../Data/Processed/{baseName}Train.csv"
        if dataType == "train"
        else f"../Data/Processed/{baseName}Test.csv"
    )
    frame = pd.read_csv(csvPath)
    # Select as a one-column frame, then flatten to 1-D.
    target = frame[["y"]].values.ravel()
    features = frame.drop(columns="y")
    return features,target

In [6]:
def getTreeFeatureRange(baseName):
    """Return ``[2/8, 5/8]`` of the non-OHE column count of the 'Under' training set.

    Loads the ``{baseName}Under`` training split and the pickled list at
    ``../Data/Interim/{baseName}UnderOheColumns.pkl``, subtracts the length of
    that list from the number of training columns, and returns two integers:
    two eighths and five eighths of the difference, each truncated by ``int``.

    Parameters
    ----------
    baseName : str
        Dataset prefix; ``'Under'`` is appended to it.

    Returns
    -------
    list of int
        ``[twoEights, fiveEights]``.
    """
    fullName = baseName+'Under'
    XTrainOriginal,_ = loadData("train",fullName)
    # NOTE(review): pickle.load can execute arbitrary code; only read trusted files.
    # The context manager closes the handle, which the bare open() call leaked.
    with open(f'../Data/Interim/{fullName}OheColumns.pkl','rb') as oheFile:
        oheColumns = pickle.load(oheFile)
    nTreeCols = XTrainOriginal.shape[1] - len(oheColumns)
    twoEights = int(2*nTreeCols/8)
    fiveEights = int(5*nTreeCols/8)
    return [twoEights,fiveEights]

In [7]:
def printScore(trueY,predictY,dataSetType):
    """Print a labelled classification report and return the F1 score.

    Parameters
    ----------
    trueY, predictY
        Ground-truth and predicted labels, forwarded to ``sklearn.metrics``.
    dataSetType
        Label printed in the ``"<label> report"`` header line.

    Returns
    -------
    The value of ``sm.f1_score(trueY, predictY)``.
    """
    # Score first so a failure here happens before anything is printed.
    f1Value = sm.f1_score(trueY,predictY)
    header = f"{dataSetType} report"
    print(header)
    report = sm.classification_report(trueY,predictY)
    print(report)
    return f1Value

In [8]:
def saveModel(model,modelName):
    """Pickle ``model`` to ``../Models/{modelName}.pkl``.

    Parameters
    ----------
    model
        Any picklable object.
    modelName : str
        File name without extension, written under ``../Models/``.

    Returns
    -------
    None
    """
    # The context manager flushes and closes the file deterministically; the
    # bare open() call relied on garbage collection to do so.
    with open(f"../Models/{modelName}.pkl", 'wb') as modelFile:
        pickle.dump(model, modelFile)

In [9]:
def main():
    """Grid-search seven model types on four balanced datasets and keep the good ones.

    For each balancing variant ('Under', 'Over', 'Smote', 'NearMiss') this:

    1. loads two pickled column lists from ``../Data/Interim/`` and the
       train/test CSVs through ``loadData``;
    2. builds two feature views: one with the "significant categorical"
       columns dropped (tagged ``'onehot'``) and one with the OHE columns
       dropped (tagged ``'label'``);
    3. fits a grid search per model type via ``fitModelWithGridSearch``;
    4. prints the best parameters plus train/test classification reports;
    5. pickles every search whose test F1 exceeds the threshold.

    Finally the names and test scores of the saved models are written to
    ``../Models/goodModelsDictionary.json``. Returns ``None``.
    """
    baseName = "TermDeposit"
    np.random.seed(51)
    # A model is saved and listed only when its test F1 is strictly above this.
    minTestF1 = 0.81

    logParams = {
        'max_iter': [1000, 1500, 2000]
    }

    bayesParams = {
        "var_smoothing": [1e-3,1e-6,1e-9]
    }

    treeParams = {
        'max_depth': np.arange(5, 10, 1)
    }
    forestParams = {
        'n_estimators': np.arange(5, 10, 1)
    }
    xgbParams = {
        'n_estimators': [100, 150, 200],
        'subsample': [0.8, 0.9, 1]
    }
    svmParams = {
        "kernel": ["linear","rbf","poly","sigmoid"],
        "gamma": ["auto","scale"],
        "max_iter": [1000, 1500, 2000],
        "class_weight": ["balanced"],
        "probability": [True]
    }

    knnParams = {
        "n_neighbors": np.arange(2, 10, 1)
    }

    goodModels = []
    goodModelScores = []

    for balanceType in ["Under","Over","Smote","NearMiss"]:
        fullName = baseName + balanceType
        # NOTE(review): pickle.load can execute arbitrary code; only read trusted files.
        # Context managers close the handles that the bare open() calls leaked.
        with open(f'../Data/Interim/{fullName}SignificantCategoricalCols.pkl','rb') as catFile:
            categoricalColumns = pickle.load(catFile)
        with open(f'../Data/Interim/{fullName}OheColumns.pkl','rb') as oheFile:
            oheColumns = pickle.load(oheFile)

        XTrainOriginal,yTrain = loadData("train",fullName)
        XTestOriginal,yTest = loadData("test",fullName)
        # 'onehot' view: everything except the categorical columns.
        XTrainOHE = XTrainOriginal.drop(categoricalColumns,axis=1)
        XTestOHE = XTestOriginal.drop(categoricalColumns,axis=1)
        # 'label' view: everything except the OHE columns.
        XTrainLE = XTrainOriginal.drop(oheColumns,axis=1)
        XTestLE = XTestOriginal.drop(oheColumns,axis=1)

        # (name, fitted grid search, which feature view it was trained on)
        estimators = [
            ("logModel",fitModelWithGridSearch(logParams,XTrainOHE,yTrain,'log'),'onehot'),
            ("naiveBayes",fitModelWithGridSearch(bayesParams,XTrainOHE,yTrain,'naiveBayes'),'onehot'),
            ("tree",fitModelWithGridSearch(treeParams,XTrainLE,yTrain,'tree'),'label'),
            ("forest",fitModelWithGridSearch(forestParams,XTrainLE,yTrain,'forest'),'label'),
            ("knn",fitModelWithGridSearch(knnParams,XTrainOHE,yTrain,'knn'),'onehot'),
            ("xgboost",fitModelWithGridSearch(xgbParams,XTrainLE,yTrain,'xgboost'),'label'),
            ("svm",fitModelWithGridSearch(svmParams,XTrainOHE,yTrain,'svm'),'onehot')
        ]

        for estName, mod, encoding in estimators:
            modName = f'{estName}{balanceType}sample'
            displayName = f'{estName} {balanceType}sample'
            # Evaluate on the same feature view the model was fitted on.
            if encoding == 'onehot':
                XTrain,XTest = XTrainOHE,XTestOHE
            else:
                XTrain,XTest = XTrainLE,XTestLE

            predictTrainY = mod.predict(XTrain)
            predictTestY = mod.predict(XTest)
            print(displayName)
            print(mod.best_estimator_.get_params())
            # The training report is printed for inspection; only the test
            # score decides whether the model is kept.
            printScore(yTrain,predictTrainY,"Training")
            testScore = printScore(yTest,predictTestY,"Testing")
            if testScore > minTestF1:
                saveModel(mod,modName)
                goodModels.append(modName)
                goodModelScores.append(testScore)

    goodModelsDictionary = {
        "goodModels": goodModels,
        "goodModelScores": goodModelScores
    }

    with open('../Models/goodModelsDictionary.json', 'w') as fp:
        json.dump(goodModelsDictionary, fp)

In [10]:
# Entry point: fits and evaluates every model / balancing combination defined in
# main(), writing the kept models and goodModelsDictionary.json under ../Models.
main()

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


logModel Undersample
{'C': 0.001, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 51, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Training report
              precision    recall  f1-score   support

           0       0.81      0.86      0.83      2896
           1       0.85      0.79      0.82      2896

    accuracy                           0.83      5792
   macro avg       0.83      0.83      0.83      5792
weighted avg       0.83      0.83      0.83      5792

Testing report
              precision    recall  f1-score   support

           0       0.81      0.87      0.84      2896
           1       0.86      0.79      0.82      2896

    accuracy                           0.83      5792
   macro avg       0.83      0.83      0.83      5792
weighted avg       0.83      0.83      0.83      5792

naiveBayes Under

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


logModel Oversample
{'C': 0.001, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 51, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Training report
              precision    recall  f1-score   support

           0       0.82      0.87      0.85     29652
           1       0.86      0.81      0.84     29714

    accuracy                           0.84     59366
   macro avg       0.84      0.84      0.84     59366
weighted avg       0.84      0.84      0.84     59366

Testing report
              precision    recall  f1-score   support

           0       0.82      0.87      0.85      7452
           1       0.86      0.81      0.84      7390

    accuracy                           0.84     14842
   macro avg       0.84      0.84      0.84     14842
weighted avg       0.84      0.84      0.84     14842

naiveBayes Oversa

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


logModel Smotesample
{'C': 0.001, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 51, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Training report
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     29652
           1       0.88      0.88      0.88     29714

    accuracy                           0.88     59366
   macro avg       0.88      0.88      0.88     59366
weighted avg       0.88      0.88      0.88     59366

Testing report
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      7452
           1       0.88      0.88      0.88      7390

    accuracy                           0.88     14842
   macro avg       0.88      0.88      0.88     14842
weighted avg       0.88      0.88      0.88     14842

naiveBayes Smote

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


logModel NearMisssample
{'C': 0.001, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 51, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Training report
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      2896
           1       0.80      0.69      0.74      2896

    accuracy                           0.76      5792
   macro avg       0.77      0.76      0.76      5792
weighted avg       0.77      0.76      0.76      5792

Testing report
              precision    recall  f1-score   support

           0       0.71      0.74      0.72      2896
           1       0.72      0.69      0.71      2896

    accuracy                           0.71      5792
   macro avg       0.71      0.71      0.71      5792
weighted avg       0.71      0.71      0.71      5792

naiveBayes Ne