In [None]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
from os.path import exists
import pickle
#import json
#import scipy.stats as stats
#import math
from sklearn.model_selection import train_test_split
#import seaborn as sb
#from sklearn.ensemble import RandomForestClassifier as rf
#from sklearn.pipeline import make_pipeline
#from sklearn.preprocessing import StandardScaler
#import sklearn.linear_model as lm
#from sklearn.tree import DecisionTreeClassifier as tree
#from sklearn.neighbors import KNeighborsClassifier as knn
#from xgboost import XGBClassifier as xgb
#from sklearn.svm import SVC
#from sklearn.naive_bayes import GaussianNB as gnb
#from sklearn.ensemble import VotingClassifier
#import sklearn.model_selection as ms
#import sklearn.metrics as sm
#import joblib
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
%autosave 5

In [None]:
def removeOutliers(df,baseName,threshold=3):
    """Keep only rows whose recorded numerical columns all lie in [-threshold, threshold].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It must contain every column named in the pickled list.
    baseName : str
        Prefix of ``../Data/Interim/{baseName}NumericalCols.pkl``, a pickled
        list of the column names to check.
    threshold : float, default 3
        Symmetric bound applied to every listed column. Values are compared
        as they are; nothing is rescaled here.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the check for every listed column. A row
        holding NaN in a checked column is dropped, because both comparisons
        are false for NaN.
    """
    # Context manager so the file handle is closed as soon as the list is read.
    with open(f"../Data/Interim/{baseName}NumericalCols.pkl", 'rb') as colFile:
        numericalColList = pickle.load(colFile)
    if not numericalColList:
        # Nothing to check: hand the frame back untouched.
        return df
    checked = df[numericalColList]
    # One combined mask instead of re-filtering the frame once per column.
    withinBounds = ((checked >= -threshold) & (checked <= threshold)).all(axis=1)
    return df.loc[withinBounds]

In [None]:
def encodeTestDF(categoricalDF,baseName):
    """Encode categorical columns with the two encoders previously pickled for ``baseName``.

    Parameters
    ----------
    categoricalDF : pandas.DataFrame
        Columns to encode; passed unchanged to both encoders' ``transform``.
    baseName : str
        Prefix of ``../Data/Interim/{baseName}OneHotEncoder.pkl`` and
        ``../Data/Interim/{baseName}LabelEncoder.pkl``.

    Returns
    -------
    pandas.DataFrame
        The one-hot output (missing values replaced by 0) followed by the
        second encoder's output, concatenated column-wise.
    """
    # pickle.load can run arbitrary code: only load files this pipeline wrote itself.
    # Context managers close both handles instead of leaving them to the garbage collector.
    with open(f"../Data/Interim/{baseName}OneHotEncoder.pkl", 'rb') as oheFile:
        ohe = pickle.load(oheFile)
    with open(f"../Data/Interim/{baseName}LabelEncoder.pkl", 'rb') as leFile:
        le = pickle.load(leFile)
    oheDF = ohe.transform(categoricalDF).fillna(0)
    leDF = le.transform(categoricalDF)
    return pd.concat([oheDF,leDF],axis=1)

In [None]:
def encodeDF(categoricalDF,baseName):
    """Fit a one-hot and an ordinal encoder on ``categoricalDF`` and persist both.

    Three pickles are written under ``../Data/Interim/``:
    ``{baseName}OheColumns.pkl`` (list of one-hot output column names),
    ``{baseName}OneHotEncoder.pkl`` and ``{baseName}LabelEncoder.pkl``
    (the latter holds the fitted ``ce.OrdinalEncoder``).

    Parameters
    ----------
    categoricalDF : pandas.DataFrame
        Columns to encode; both encoders are fitted on the same frame.
    baseName : str
        Prefix used for the three pickle file names.

    Returns
    -------
    pandas.DataFrame
        One-hot output followed by ordinal output, concatenated column-wise.
    """
    ohe = ce.OneHotEncoder(handle_unknown='ignore',return_df=True,use_cat_names=True)
    le = ce.OrdinalEncoder(return_df=True)
    oheDF = ohe.fit_transform(categoricalDF)
    leDF = le.fit_transform(categoricalDF)
    # Each artifact is written inside a context manager so the file is flushed
    # and closed before the function returns.
    with open(f"../Data/Interim/{baseName}OheColumns.pkl", 'wb') as columnsFile:
        pickle.dump(list(oheDF.columns), columnsFile)
    with open(f"../Data/Interim/{baseName}OneHotEncoder.pkl", 'wb') as oheFile:
        pickle.dump(ohe, oheFile)
    with open(f"../Data/Interim/{baseName}LabelEncoder.pkl", 'wb') as leFile:
        pickle.dump(le, leFile)
    return pd.concat([oheDF,leDF],axis=1)

In [None]:
def scaleTestDF(df,baseName):
    """Scale every column of ``df`` with the scaler previously pickled for ``baseName``.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to scale; all of them are passed to the scaler's ``transform``.
    baseName : str
        Prefix of ``../Data/Interim/{baseName}Scaler.pkl``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column labels as ``df`` holding
        the transformed values. ``df`` itself is left unmodified.
    """
    # pickle.load can run arbitrary code: only load files this pipeline wrote itself.
    with open(f"../Data/Interim/{baseName}Scaler.pkl", 'rb') as scalerFile:
        scaler = pickle.load(scalerFile)
    # Build a fresh frame instead of assigning into `df`: callers pass a column
    # slice of a larger frame, and writing into it triggers SettingWithCopyWarning
    # and silently alters the caller's object.
    return pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)

In [None]:
def scaleDF(df,baseName):
    """Fit a ``StandardScaler`` on every column of ``df``, persist it, and return the scaled data.

    The fitted scaler is pickled to ``../Data/Interim/{baseName}Scaler.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to fit on and scale; all of them are used.
    baseName : str
        Prefix used for the scaler pickle file name.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column labels as ``df`` holding
        the scaled values. ``df`` itself is left unmodified.
    """
    scaler = StandardScaler()
    scaledValues = scaler.fit_transform(df)
    # Context manager so the pickle is flushed and closed before returning.
    with open(f"../Data/Interim/{baseName}Scaler.pkl", 'wb') as scalerFile:
        pickle.dump(scaler, scalerFile)
    # Return a fresh frame rather than assigning into `df`: the caller passes a
    # column slice, and writing into it triggers SettingWithCopyWarning and
    # alters the caller's object.
    return pd.DataFrame(scaledValues, index=df.index, columns=df.columns)

In [None]:
def separateDFBySubtype(df,baseName):
    """Split ``df`` into numerical and non-numerical columns and record both name lists.

    The two lists of column names are pickled to
    ``../Data/Interim/{baseName}NumericalCols.pkl`` and
    ``../Data/Interim/{baseName}CategoricalCols.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified by dtype.
    baseName : str
        Prefix used for the two pickle file names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(numericalDF, categoricalDF)``, each a column subset of ``df``.
        Boolean columns are placed in the categorical frame.
    """
    numericalCols = []
    categoricalCols = []
    for col in df.columns:
        colType = df[col].dtype
        # pandas' dtype helpers accept extension dtypes (category, nullable
        # ints, string), on which np.issubdtype raises TypeError. Booleans are
        # excluded explicitly so they stay non-numerical.
        isNumerical = pd.api.types.is_numeric_dtype(colType) and not pd.api.types.is_bool_dtype(colType)
        if isNumerical:
            numericalCols.append(str(col))
        else:
            categoricalCols.append(str(col))
    numericalDF = df[numericalCols]
    categoricalDF = df[categoricalCols]
    # Context managers so both files are flushed and closed before returning.
    with open(f"../Data/Interim/{baseName}NumericalCols.pkl", 'wb') as numericalFile:
        pickle.dump(numericalCols, numericalFile)
    with open(f"../Data/Interim/{baseName}CategoricalCols.pkl", 'wb') as categoricalFile:
        pickle.dump(categoricalCols, categoricalFile)
    return numericalDF,categoricalDF

In [None]:
def processTestData(baseName):
    """Apply the saved scalers and encoders to the interim test set and write the results.

    Reads ``../Data/Interim/{baseName}Test.csv`` plus the pickled numerical and
    categorical column lists, then for each balance type ("Under", "Over")
    scales and encodes the features with the artifacts stored under
    ``baseName + balanceType`` and writes
    ``../Data/Processed/{baseName}{balanceType}Test.csv``.

    Parameters
    ----------
    baseName : str
        Prefix shared by the interim inputs and the processed outputs.

    Returns
    -------
    None
    """
    df = pd.read_csv(f"../Data/Interim/{baseName}Test.csv")
    # Context managers close the handles instead of leaving them to the garbage collector.
    with open(f"../Data/Interim/{baseName}NumericalCols.pkl", 'rb') as numericalFile:
        numericalCols = pickle.load(numericalFile)
    with open(f"../Data/Interim/{baseName}CategoricalCols.pkl", 'rb') as categoricalFile:
        categoricalCols = pickle.load(categoricalFile)
    y = df[["y"]]
    df = df.drop("y",axis = 1)
    for balanceType in ["Under","Over"]:
        combinedName = baseName + balanceType
        # Take fresh slices on every pass so nothing done to them in one
        # iteration can carry over into the next.
        numericalDF,categoricalDF = df[numericalCols],df[categoricalCols]
        scaledDF = scaleTestDF(numericalDF,combinedName)
        encodedDF = encodeTestDF(categoricalDF,combinedName)
        finalDF = pd.concat([scaledDF,encodedDF,y],axis=1)
        finalDF.to_csv(f"../Data/Processed/{combinedName}Test.csv",index=False)
        

In [None]:
def processTrainData(baseName):
    """Build the under- and over-sampled training sets and write them to disk.

    Reads ``../Data/Interim/{baseName}Train.csv`` and splits its rows by
    ``y == 0`` / ``y == 1``. For "Under" the ``y == 0`` rows are sampled down
    to the number of ``y == 1`` rows; for "Over" the ``y == 1`` rows are
    sampled with replacement up to the number of ``y == 0`` rows. Each balanced
    set is then split by column type, scaled, encoded, filtered with
    ``removeOutliers`` and written to
    ``../Data/Processed/{baseName}{balanceType}Train.csv``.

    NOTE(review): the "Under" branch samples without replacement, so it
    assumes there are at least as many ``y == 0`` rows as ``y == 1`` rows.

    Parameters
    ----------
    baseName : str
        Prefix shared by the interim input, the saved artifacts and the outputs.

    Returns
    -------
    None
    """
    unbalancedDF = pd.read_csv(f"../Data/Interim/{baseName}Train.csv")
    unbalancedZeros = unbalancedDF[unbalancedDF["y"] == 0]
    unbalancedOnes = unbalancedDF[unbalancedDF["y"] == 1]

    for balanceType in ["Under","Over"]:
        combinedName = baseName + balanceType
        if balanceType == "Under":
            balancedZeros = unbalancedZeros.sample(unbalancedOnes.shape[0],random_state=51)
            balancedOnes = unbalancedOnes
        else:
            balancedZeros = unbalancedZeros
            balancedOnes = unbalancedOnes.sample(unbalancedZeros.shape[0],replace=True,random_state=51)
        # Sampling with replacement repeats index labels. Give every row a
        # unique label so the column-wise concat below cannot fail or mis-align
        # on duplicates; row order, and therefore the CSV, is unchanged.
        df = pd.concat([balancedZeros,balancedOnes],axis=0).reset_index(drop=True)
        y = df[["y"]]
        df = df.drop("y",axis = 1)
        numericalDF,categoricalDF = separateDFBySubtype(df,baseName)
        scaledDF = scaleDF(numericalDF,combinedName)
        encodedDF = encodeDF(categoricalDF,combinedName)
        preOutlierDF = pd.concat([scaledDF,encodedDF,y],axis=1)
        # Outliers are dropped after scaling, using the column list saved
        # under baseName by separateDFBySubtype.
        finalDF = removeOutliers(preOutlierDF,baseName)
        finalDF.to_csv(f"../Data/Processed/{combinedName}Train.csv",index=False)

In [None]:
def binarizeTargets(fileName):
    """Rewrite the CSV at ``fileName`` with its ``y`` column mapped 'yes' -> 1 and 'no' -> 0.

    Any other value in ``y`` is left as it is, so running the function again
    on an already converted file changes nothing.

    Parameters
    ----------
    fileName : str
        Path of the CSV file; it is read and then overwritten in place.

    Returns
    -------
    None
    """
    df = pd.read_csv(fileName)
    # Work on an object-dtype copy of the column: writing ints straight into a
    # column pandas loaded with its dedicated string dtype raises TypeError.
    target = df["y"].astype(object)
    for label, code in (("yes", 1), ("no", 0)):
        target[target == label] = code
    df["y"] = target
    df.to_csv(fileName,index=False)

In [None]:
def splitData(baseName):
    """Split the raw term-deposit CSV 80/20 and write the interim train and test files.

    Reads ``../Data/Raw/term-deposit-marketing-2020.csv``, separates the ``y``
    column from the remaining columns, splits with ``test_size=0.2`` and
    ``random_state=51``, then writes each part (features followed by ``y``) to
    ``../Data/Interim/{baseName}Train.csv`` and ``../Data/Interim/{baseName}Test.csv``.

    Parameters
    ----------
    baseName : str
        Prefix used for the two interim file names.

    Returns
    -------
    None
    """
    # NOTE(review): index_col=[0] makes the first CSV column the index and
    # index=False below leaves it out of the interim files - confirm that
    # column is a row id and not a feature.
    rawDF = pd.read_csv("../Data/Raw/term-deposit-marketing-2020.csv",index_col=[0])
    target = rawDF[["y"]]
    features = rawDF.drop(columns="y")
    trainX,testX,trainY,testY = train_test_split(features, target, test_size=0.2,random_state=51)
    outputs = (
        (f'../Data/Interim/{baseName}Train.csv', trainX, trainY),
        (f'../Data/Interim/{baseName}Test.csv', testX, testY),
    )
    for outPath, partX, partY in outputs:
        # assign() copies the features and appends the target as the last column.
        partX.assign(y=partY).to_csv(outPath,index=False)

In [None]:
def main():
    """Run the whole preprocessing pipeline for the "TermDeposit" data set.

    Seeds NumPy's global random generator, creates the interim train/test CSVs
    (split plus target binarization) when either of them is missing, then
    processes the training data followed by the test data.

    Returns
    -------
    None
    """
    np.random.seed(51)
    baseName = "TermDeposit"
    interimFiles = [f"../Data/Interim/{baseName}Train.csv", f"../Data/Interim/{baseName}Test.csv"]
    # Rebuild the pair when either file is missing: checking only the train
    # file would let a missing test file surface later as a read error. The
    # split uses a fixed random_state, so regenerating both keeps them consistent.
    if not all(exists(path) for path in interimFiles):
        splitData(baseName)
        for path in interimFiles:
            binarizeTargets(path)
    processTrainData(baseName)
    processTestData(baseName)

In [None]:
# Entry point of the notebook: executing this cell runs the full preprocessing pipeline.
main()