In [None]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#from os.path import exists
import json
#import scipy.stats as stats
import math
#from sklearn.model_selection import train_test_split
#import seaborn as sb
from sklearn.ensemble import RandomForestClassifier as rf
#from sklearn.pipeline import make_pipeline
#from sklearn.preprocessing import StandardScaler
import sklearn.linear_model as lm
from sklearn.tree import DecisionTreeClassifier as tree
from sklearn.neighbors import KNeighborsClassifier as knn
from xgboost import XGBClassifier as xgb
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB as gnb
#from sklearn.ensemble import VotingClassifier
import sklearn.model_selection as ms
import sklearn.metrics as sm
import pickle
#from sklearn import preprocessing
import warnings

# Suppress every Python warning for the rest of the session. This is a blanket
# filter, so warnings emitted by the modelling libraries during fitting are
# hidden as well; remove it when debugging.
warnings.filterwarnings("ignore")
# Notebook magic (only valid inside Jupyter/IPython, not plain Python):
# sets the notebook autosave interval; the argument is in seconds.
%autosave 5

In [None]:
def retrieveModelsBasedOnModelType(modelType):
    """Build a fresh, unfitted base estimator for the given model key.

    Parameters
    ----------
    modelType : str
        One of 'log', 'naiveBayes', 'tree', 'forest', 'knn', 'xgboost', 'svm'.

    Returns
    -------
    estimator
        A new classifier instance configured with the fixed settings below
        (seeded with ``random_state=51`` wherever a seed is passed).

    Raises
    ------
    ValueError
        If ``modelType`` is not one of the supported keys. ``ValueError`` is a
        subclass of ``Exception``, so callers catching ``Exception`` still work.
    """
    # Each entry is a zero-argument factory so that only the requested
    # estimator is actually constructed.
    modelFactories = {
        'log': lambda: lm.LogisticRegression(random_state=51,penalty='l2'),
        'naiveBayes': lambda: gnb(),
        'tree': lambda: tree(random_state=51),
        'forest': lambda: rf(random_state=51, oob_score=True),
        'knn': lambda: knn(),
        'xgboost': lambda: xgb(random_state=51,reg_alpha=1000,reg_lambda=1000),
        'svm': lambda: SVC(random_state=51),
    }
    try:
        buildModel = modelFactories[modelType]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments (e.g. a list), which the
        # original if/elif chain also rejected as "not considered".
        raise ValueError("modelType Value not considered. Please choose from ['log','naiveBayes','tree','forest','knn','xgboost','svm']") from None
    return buildModel()

In [None]:
def fitModelWithGridSearch(searchParams,XTrain,yTrain,modelType):
    """Run an exhaustive grid search for one model type and return the fitted search.

    Parameters
    ----------
    searchParams : dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    XTrain, yTrain
        Training features and labels passed straight to ``fit``.
    modelType : str
        Key understood by ``retrieveModelsBasedOnModelType``.

    Returns
    -------
    GridSearchCV
        The search object after ``fit`` has been called on the training data.
    """
    baseEstimator = retrieveModelsBasedOnModelType(modelType)
    # Seeded, shuffled 7-fold stratified splitter shared by every candidate.
    folds = ms.StratifiedKFold(n_splits=7,random_state=51, shuffle=True)
    search = ms.GridSearchCV(
        baseEstimator,
        param_grid=searchParams,
        scoring='f1',
        cv=folds,
        n_jobs=-1,
    )
    search.fit(XTrain,yTrain)
    return search

In [None]:
def processPredictData(predictFileName):
    """Scale and encode an external CSV, writing the result next to the input.

    Reads ``../Data/External/{predictFileName}.csv``, passes the frame through
    ``scaleTestData`` and then ``transformDF``, and writes the outcome to
    ``../Data/External/{predictFileName}Final.csv`` without the index column.

    NOTE(review): ``scaleTestData`` and ``transformDF`` are neither defined nor
    imported in the visible part of this notebook - confirm they are provided
    elsewhere, otherwise calling this function raises ``NameError``.
    """
    rawFrame = pd.read_csv(f"../Data/External/{predictFileName}.csv")
    finalFrame = transformDF(scaleTestData(rawFrame))
    finalFrame.to_csv(f'../Data/External/{predictFileName}Final.csv',index=False)

In [None]:
def loadData(dataType,baseName):
    """Load a processed split and separate the features from the ``y`` column.

    Parameters
    ----------
    dataType : str
        ``"train"`` reads ``{baseName}Train.csv``; any other value reads
        ``{baseName}Test.csv``.
    baseName : str
        File-name prefix inside ``../Data/Processed/``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame without the ``y`` column and ``y``
        is a flat array of the ``y`` column's values.
    """
    csvPath = (
        f"../Data/Processed/{baseName}Train.csv"
        if dataType == "train"
        else f"../Data/Processed/{baseName}Test.csv"
    )
    frame = pd.read_csv(csvPath)
    # Select as a one-column frame, then flatten to a 1-D label array.
    labels = frame[["y"]].values.ravel()
    features = frame.drop(columns="y")
    return features,labels

In [None]:
def getTreeFeatureRange(baseName):
    """Derive two integer grid values from the undersampled training set's width.

    Loads the ``{baseName}Under`` training split and the pickled list of
    one-hot column names for it, subtracts the number of one-hot columns from
    the total feature-column count, and returns 2/8 and 5/8 of that difference
    (each truncated to ``int``).

    Parameters
    ----------
    baseName : str
        Dataset prefix; ``'Under'`` is appended to build the file names.

    Returns
    -------
    list of int
        ``[twoEights, fiveEights]``.
    """
    fullName = baseName+'Under'
    # Only the feature frame is needed here; the labels are discarded.
    XTrainOriginal,_ = loadData("train",fullName)
    # Context manager so the file handle is closed deterministically.
    # NOTE: pickle can execute arbitrary code on load; only read trusted project files.
    with open(f'../Data/Interim/{fullName}OheColumns.pkl','rb') as oheFile:
        oheColumns = pickle.load(oheFile)
    nTreeCols = XTrainOriginal.shape[1] - len(oheColumns)
    twoEights = int(2*nTreeCols/8)
    fiveEights = int(5*nTreeCols/8)
    return [twoEights,fiveEights]

In [None]:
def printScore(trueY,predictY,dataSetType):
    """Print a labelled classification report and return the F1 score.

    Parameters
    ----------
    trueY, predictY
        Ground-truth and predicted labels, forwarded to ``sklearn.metrics``.
    dataSetType : str
        Label printed in the ``"<dataSetType> report"`` header line.

    Returns
    -------
    The value of ``sm.f1_score(trueY, predictY)``.
    """
    f1 = sm.f1_score(trueY,predictY)
    print(f"{dataSetType} report")
    reportText = sm.classification_report(trueY,predictY)
    print(reportText)
    return f1

In [None]:
def saveModel(model,modelName):
    """Pickle ``model`` to ``../Models/{modelName}.pkl``.

    Parameters
    ----------
    model : object
        Any picklable object (here, a fitted grid search).
    modelName : str
        File name without extension.

    The file is opened in a ``with`` block so it is flushed and closed before
    the function returns, instead of relying on garbage collection.
    """
    with open(f"../Models/{modelName}.pkl", 'wb') as modelFile:
        pickle.dump(model, modelFile)

In [None]:
def main():
    """Grid-search seven classifiers on the under- and over-sampled data and save the good ones.

    For each balance type ("Under", "Over") this:

    1. loads the processed train/test splits and the pickled one-hot column list;
    2. builds two feature views: 'onehot' (categorical columns dropped) and
       'label' (one-hot columns dropped; presumably the label-encoded variant);
    3. fits a grid search per model on the matching view, prints the best
       parameters plus train/test classification reports;
    4. pickles every fitted search whose test F1 is strictly above 0.81.

    Finally the names of all saved models are written to
    ``../Models/goodModelsDictionary.json``.

    Each model is fitted, evaluated and (if good enough) saved before the next
    one is fitted, so only one fitted search is held in memory at a time.
    """
    baseName = "TermDeposit"
    # A fitted search is persisted only when its test F1 exceeds this value.
    # NOTE(review): selecting models on the test split makes the reported test
    # score optimistic - consider a separate validation split.
    minTestF1 = 0.81
    np.random.seed(51)

    treeFeatureRange = getTreeFeatureRange(baseName)

    logParams = {
        "class_weight": ["balanced"],
        "C": [0.1,0.01,0.001],
        "max_iter": [100,1000,10000]
    }

    bayesParams = {
        "var_smoothing": [1,1e-1,1e-2,1e-3]
    }

    # NOTE(review): ccp_alpha reuses treeFeatureRange (feature counts). Pruning
    # strengths that large will likely collapse the tree to its root - confirm
    # this grid is intended.
    treeParams = {
        "max_depth":treeFeatureRange,
        "max_features":treeFeatureRange,
        "min_samples_split": [2] + treeFeatureRange,
        "ccp_alpha":[0] + treeFeatureRange,
        "criterion": ["gini","entropy"],
        "class_weight": ["balanced"]
    }
    forestParams = {
        "n_estimators": [50,100,150],
        "max_depth": treeFeatureRange,
        "max_features":treeFeatureRange,
        "min_samples_split": [2] + treeFeatureRange,
        "ccp_alpha":[0] + treeFeatureRange,
        "criterion": ["gini","entropy"],
        "class_weight": ["balanced"]
    }
    xgbParams = {
        "learning_rate": [.05,.1,.5],
        "n_estimators": [50,100,150],
        "max_depth": treeFeatureRange
    }
    svmParams = {
        "kernel": ["linear","rbf","poly","sigmoid"],
        "gamma": ["auto","scale"],
        "max_iter": [100,500,1000],
        "class_weight": ["balanced"],
        "probability": [True]
    }
    # Does not depend on the balance type, so it is built once here.
    # 44 is overfit
    kRange = [int(x) for x in np.linspace(44,88,10)]
    knnParams = {
        "n_neighbors": kRange
    }

    # (name used in output/file names, model key, parameter grid, feature view)
    modelSpecs = [
        ("logModel",'log',logParams,'onehot'),
        ("naiveBayes",'naiveBayes',bayesParams,'onehot'),
        ("tree",'tree',treeParams,'label'),
        ("forest",'forest',forestParams,'label'),
        ("knn",'knn',knnParams,'onehot'),
        ("xgboost",'xgboost',xgbParams,'label'),
        ("svm",'svm',svmParams,'onehot'),
    ]

    goodModels = []

    # NOTE: pickle can execute arbitrary code on load; only read trusted project files.
    with open('../Data/Interim/TermDepositCategoricalCols.pkl','rb') as categoricalFile:
        categoricalColumns = pickle.load(categoricalFile)

    for balanceType in ["Under","Over"]:
        fullName = baseName + balanceType
        with open(f'../Data/Interim/{fullName}OheColumns.pkl','rb') as oheFile:
            oheColumns = pickle.load(oheFile)

        XTrainOriginal,yTrain = loadData("train",fullName)
        XTestOriginal,yTest = loadData("test",fullName)
        # Train/test pairs keyed by the view name used in modelSpecs.
        featureViews = {
            'onehot': (XTrainOriginal.drop(categoricalColumns,axis=1),
                       XTestOriginal.drop(categoricalColumns,axis=1)),
            'label': (XTrainOriginal.drop(oheColumns,axis=1),
                      XTestOriginal.drop(oheColumns,axis=1)),
        }

        for displayName,modelType,searchParams,viewName in modelSpecs:
            XTrain,XTest = featureViews[viewName]
            mod = fitModelWithGridSearch(searchParams,XTrain,yTrain,modelType)
            modName = f'{displayName} {balanceType}sample'

            predictTrainY = mod.predict(XTrain)
            predictTestY = mod.predict(XTest)
            print(modName)
            print(mod.best_estimator_.get_params())
            # Training report is printed for comparison; its score is not used.
            printScore(yTrain,predictTrainY,"Training")
            if printScore(yTest,predictTestY,"Testing") > minTestF1:
                saveModel(mod,modName)
                goodModels.append(modName)

    goodModelsDictionary = {
        "goodModels": goodModels
    }

    with open('../Models/goodModelsDictionary.json', 'w') as fp:
        json.dump(goodModelsDictionary, fp)

In [None]:
# Run the full pipeline defined in main(): fit, evaluate and save the models.
main()