In [None]:
#import numpy as np
#import pandas as pd
#import matplotlib.pyplot as plt
#from os.path import exists
#import json
#import scipy.stats as stats
#import math
#from sklearn.model_selection import train_test_split
#import seaborn as sb
#from sklearn.ensemble import RandomForestClassifier as rf
#from sklearn.pipeline import make_pipeline
#from sklearn.preprocessing import StandardScaler
#import sklearn.linear_model as lm
#from sklearn.tree import DecisionTreeClassifier as tree
#from sklearn.neighbors import KNeighborsClassifier as knn
#from xgboost import XGBClassifier as xgb
#from sklearn.svm import SVC
#from sklearn.naive_bayes import GaussianNB as gnb
#from sklearn.ensemble import VotingClassifier
#import sklearn.model_selection as ms
#import sklearn.metrics as sm
#import joblib
#from sklearn import preprocessing
%autosave 5

In [None]:
def retrieveModelsBasedOnModelType(modelType):
    """Create two separate, unfitted estimators of the requested model family.

    Both instances are built with identical constructor arguments
    (``random_state=51`` for every family except 'knn', which takes none).
    ``fitModelWithGridSearch`` uses the first one for the grid search and
    copies the winning parameters onto the second one.

    Parameters
    ----------
    modelType : str
        One of 'tree', 'forest', 'knn', 'xgboost' or 'svm'.

    Returns
    -------
    tuple
        ``(gridmodel, finalmodel)`` - two distinct estimator objects.

    Raises
    ------
    ValueError
        If ``modelType`` is not one of the supported names. ``ValueError`` is a
        subclass of ``Exception``, so callers catching ``Exception`` still work.
    """
    # Each factory is wrapped in a lambda so that only the estimator class that
    # is actually requested gets looked up (the others may not be imported).
    builders = {
        'tree': lambda: tree(random_state=51),
        'forest': lambda: rf(random_state=51),
        'knn': lambda: knn(),
        'xgboost': lambda: xgb(random_state=51),
        'svm': lambda: SVC(random_state=51),
    }
    try:
        build = builders[modelType]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which are just as unsupported.
        raise ValueError("modelType Value not considered. Please choose from ['tree','forest','knn','xgboost','svm']") from None
    return build(), build()

In [None]:
def fitModelWithGridSearch(searchParams,XTrain,yTrain,modelType):
    """Pick hyper-parameters for ``modelType`` with an accuracy-scored grid search.

    The search (``cv=6``) is run on one estimator instance. The best parameter
    combination it finds is then applied to a second instance, and that second
    instance is returned. This function does not call ``fit`` on it.

    Parameters
    ----------
    searchParams : dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    XTrain, yTrain
        Training features and labels used by the search.
    modelType : str
        Model family name understood by ``retrieveModelsBasedOnModelType``.

    Returns
    -------
    The second estimator instance with the best parameters set.
    """
    searchEstimator, tunedEstimator = retrieveModelsBasedOnModelType(modelType)
    search = ms.GridSearchCV(
        searchEstimator,
        param_grid=searchParams,
        scoring='accuracy',
        cv=6,
    )
    search.fit(XTrain, yTrain)
    bestParams = search.best_params_
    tunedEstimator.set_params(**bestParams)
    return tunedEstimator

In [None]:
def transformDF(df):
    """Placeholder for the DataFrame transformation step (not implemented yet).

    ``df`` is ignored and ``None`` is returned. ``processTestData`` and
    ``processPredictData`` call ``.to_csv`` on this function's result, so both
    fail until real logic is added here.
    """
    return None

In [None]:
def scaleTestData(df):
    """Placeholder for the scaling step (not implemented yet).

    ``df`` is ignored and ``None`` is returned. ``processTestData`` and
    ``processPredictData`` pass this result straight on to ``transformDF``.
    """
    return None

In [None]:
def encodeDF(df):
    """Placeholder for the encoding step (not implemented yet).

    ``df`` is ignored and ``None`` is returned.
    """
    return None

In [None]:
def processTestData(testFileName):
    """Run an interim test CSV through scaling and transformation and save it.

    Reads ``../Data/Interim/{testFileName}.csv``, passes the frame through
    ``scaleTestData`` and then ``transformDF``, and writes the result to
    ``../Data/Processed/{testFileName}.csv`` without the index column.

    Parameters
    ----------
    testFileName : str
        File name without the ``.csv`` extension.
    """
    sourcePath = f"../Data/Interim/{testFileName}.csv"
    rawDF = pd.read_csv(sourcePath)
    processedDF = transformDF(scaleTestData(rawDF))
    targetPath = f'../Data/Processed/{testFileName}.csv'
    processedDF.to_csv(targetPath, index=False)

In [None]:
def processPredictData(predictFileName):
    """Run an external prediction CSV through scaling and transformation and save it.

    Reads ``../Data/External/{predictFileName}.csv``, passes the frame through
    ``scaleTestData`` and then ``transformDF``, and writes the result next to
    the input as ``../Data/External/{predictFileName}Final.csv`` without the
    index column.

    Parameters
    ----------
    predictFileName : str
        File name without the ``.csv`` extension.
    """
    sourcePath = f"../Data/External/{predictFileName}.csv"
    rawDF = pd.read_csv(sourcePath)
    processedDF = transformDF(scaleTestData(rawDF))
    targetPath = f'../Data/External/{predictFileName}Final.csv'
    processedDF.to_csv(targetPath, index=False)

In [None]:
def loadData(dataType,baseName):
    """Load a processed split and separate the ``y`` column from the features.

    Parameters
    ----------
    dataType : str
        ``"train"`` reads ``../Data/Processed/{baseName}Train.csv``; any other
        value reads ``../Data/Processed/{baseName}Test.csv``.
    baseName : str
        Prefix of the CSV file name.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the loaded DataFrame without its ``y`` column
        and ``y`` is the flattened array of that column's values.
    """
    csvPath = (
        f"../Data/Processed/{baseName}Train.csv"
        if dataType == "train"
        else f"../Data/Processed/{baseName}Test.csv"
    )
    termDepositFrame = pd.read_csv(csvPath)
    features = termDepositFrame.drop("y", axis=1)
    target = termDepositFrame[["y"]].values.ravel()
    return features, target

In [None]:
def printScore(model,X,y,dataSetType):
    """Print ``model.score(X, y)`` for one data split.

    The value comes from the estimator's own ``score`` method, which for
    scikit-learn classifiers is mean accuracy rather than F1, so the printed
    line is labelled as accuracy.

    Parameters
    ----------
    model
        Fitted estimator exposing ``score(X, y)``.
    X, y
        Features and true labels of the split being scored.
    dataSetType : str
        Label printed in front of the value, e.g. "Training" or "Testing".
    """
    accuracy = model.score(X, y)
    print(f"{dataSetType} accuracy: {accuracy}")

In [None]:
def saveModel(model,modelName):
    """Serialize ``model`` with joblib to ``../Models/{modelName}.pkl``.

    Parameters
    ----------
    model
        Object to persist.
    modelName : str
        File name without the ``.pkl`` extension.
    """
    targetPath = f"../Models/{modelName}.pkl"
    joblib.dump(value=model, filename=targetPath)

In [None]:
def _suggestKRange(nRows):
    """Return up to ten candidate ``n_neighbors`` values for ``nRows`` training rows."""
    sqrtNRows = int(math.sqrt(nRows))
    log2NRows = int(math.log2(nRows))
    # sqrt(n) and log2(n) are treated as two consecutive terms of a geometric
    # progression; the next term on either side is a candidate upper bound for k.
    possibleThirdGeometricTerm1 = int((sqrtNRows ** 2)/log2NRows)
    possibleThirdGeometricTerm2 = int((log2NRows ** 2)/sqrtNRows)
    upperK = max(possibleThirdGeometricTerm1, possibleThirdGeometricTerm2)
    # Truncating to int can repeat values on small datasets; drop the repeats so
    # the grid search does not refit identical candidates.
    return sorted({int(x) for x in np.linspace(5, upperK, 10)})


def main():
    """Train, score and persist the candidate classifiers for both balance types.

    For each of the "Under" and "Over" training sets this function:

    1. loads the training split via ``loadData``;
    2. builds a logistic regression, a Gaussian naive Bayes model and five
       grid-search-tuned models (tree, forest, knn, xgboost, svm);
    3. fits every model on the training split and prints its training and
       testing scores;
    4. saves every model whose test score exceeds 0.81 with ``saveModel`` and
       records its name.

    The recorded names are written to ``../Models/goodModelsDictionary.json``
    under the key ``"goodModels"``; the file is rewritten after each balance
    type.
    """
    baseName = "TermDeposit"
    # A model is only kept when model.score on the test split is above this value.
    minTestScore = 0.81
    np.random.seed(51)
    treeParams = {
        "max_depth":[2,3],
        "max_features":[2,3],
        "criterion": ["gini","entropy"]
    }
    forestParams = {
        "n_estimators": [100,150,200,250,300],
        "max_depth":[2,3],
        "max_features":[2,3],
        "criterion": ["gini","entropy"]
    }
    xgbParams = {
        "learning_rate": list(np.linspace(.1,1,10))
    }
    svmParams = {
        "kernel": ["linear","rbf","poly","sigmoid"],
        "gamma": ["auto","scale"]
    }

    # NOTE(review): the original call omitted the base name. This assumes one
    # un-resampled test split at ../Data/Processed/TermDepositTest.csv shared by
    # both balance types - confirm; if the test split is stored per balance
    # type, load it inside the loop with fullName instead.
    XTest,yTest = loadData("test",baseName)

    goodModels = []
    for balanceType in ["Under","Over"]:
        fullName = baseName + balanceType
        XTrain,yTrain = loadData("train",fullName)

        knnParams = {
            "n_neighbors": _suggestKRange(XTrain.shape[0])
        }

        # max_iter must be an integer; a float such as 1e9 is rejected by
        # scikit-learn's parameter validation.
        logModel = lm.LogisticRegression(max_iter=int(1e9))
        # `gnb` is the GaussianNB alias listed in the notebook's import cell.
        gnbModel = gnb()

        # Third tuple element is an encoding tag; nothing below reads it yet.
        estimators = [
            ("logModel",logModel,'onehot'),
            ("naiveBayes",gnbModel,'onehot'),
            ("tree",fitModelWithGridSearch(treeParams,XTrain,yTrain,'tree'),'label'),
            ("forest",fitModelWithGridSearch(forestParams,XTrain,yTrain,'forest'),'label'),
            ("knn",fitModelWithGridSearch(knnParams,XTrain,yTrain,'knn'),'onehot'),
            ("xgboost",fitModelWithGridSearch(xgbParams,XTrain,yTrain,'xgboost'),'label'),
            ("svm",fitModelWithGridSearch(svmParams,XTrain,yTrain,'svm'),'onehot')
        ]

        for modName, mod, _encoding in estimators:
            mod.fit(XTrain,yTrain)
            print(modName)
            printScore(mod,XTrain,yTrain,"Training")
            printScore(mod,XTest,yTest,"Testing")
            if mod.score(XTest,yTest) > minTestScore:
                # NOTE(review): models are saved under the bare estimator name,
                # so a model that passes for both balance types is overwritten
                # by the later one.
                saveModel(mod,modName)
                # Record each name once even if it passes for both balance types.
                if modName not in goodModels:
                    goodModels.append(modName)

        goodModelsDictionary = {
            "goodModels": goodModels
        }

        with open('../Models/goodModelsDictionary.json', 'w') as fp:
            json.dump(goodModelsDictionary, fp)

In [None]:
# Run the whole train / score / save pipeline when this cell is executed.
main()