In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from os.path import exists
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from sklearn.feature_selection import chi2
from scipy.stats import spearmanr
from imblearn.over_sampling import RandomOverSampler,SMOTENC
from imblearn.under_sampling import RandomUnderSampler,NearMiss
%autosave 5

In [None]:
def removeUnimportantCategoricalColumns(categoricalDF,y,combinedName,datasetType = 'train'):
    """Keep only the categorical columns that pass a chi-squared significance test.

    For any ``datasetType`` other than ``'test'`` the columns are ordinal-encoded,
    scored against ``y`` with ``chi2``, and the names whose p-value is below 0.05
    are pickled to ``../Data/Interim/{combinedName}SignificantCategoricalCols.pkl``.
    For ``datasetType == 'test'`` that pickle is read back instead and ``y`` is
    not used.

    Parameters
    ----------
    categoricalDF : pandas.DataFrame
        Frame that contains (at least) the categorical columns.
    y : array-like
        Target values aligned with the rows of ``categoricalDF``.
    combinedName : str
        Prefix of the pickle file that stores the selected column names.
    datasetType : str, default 'train'
        ``'test'`` reuses the stored selection; anything else recomputes it.

    Returns
    -------
    pandas.DataFrame
        ``categoricalDF`` restricted to the selected columns.
    """
    columnsPath = f"../Data/Interim/{combinedName}SignificantCategoricalCols.pkl"
    if datasetType == 'test':
        # NOTE(review): pickle.load can execute arbitrary code; only read files
        # that this pipeline wrote itself.
        with open(columnsPath, 'rb') as columnsFile:
            significantColumns = pickle.load(columnsFile)
    else:
        le = ce.OrdinalEncoder(return_df=True)
        leDF = le.fit_transform(categoricalDF)
        pValues = chi2(leDF, y)[1]
        # A NaN p-value compares False and is therefore dropped.
        significantColumns = [
            col for col, pValue in zip(categoricalDF.columns, pValues) if pValue < 0.05
        ]
        # The context manager guarantees the file is flushed and closed before
        # any later step reads it back.
        with open(columnsPath, 'wb') as columnsFile:
            pickle.dump(significantColumns, columnsFile)
    return categoricalDF[significantColumns]
        

In [None]:
def removeUnimportantNumericalColumns(numericalDF,y,combinedName,datasetType = 'train'):
    """Keep only the numerical columns whose Spearman correlation with ``y`` is significant.

    For any ``datasetType`` other than ``'test'`` every column is tested with
    ``spearmanr`` against ``y``; the names with a p-value below 0.05 are pickled
    to ``../Data/Interim/{combinedName}SignificantNumericalCols.pkl``. For
    ``datasetType == 'test'`` that pickle is read back instead and ``y`` is not
    used.

    The input frame is never modified and the returned values are unscaled.
    Spearman's statistic depends only on ranks, so standardising beforehand
    cannot change the selection. Scaling here would also make a scaler fitted
    afterwards (``scaleDF``) learn from already-standardised values, leaving
    the stored scaler unable to standardise raw test data.

    Parameters
    ----------
    numericalDF : pandas.DataFrame
        Frame that contains (at least) the numerical columns.
    y : array-like
        Target values aligned with the rows of ``numericalDF``.
    combinedName : str
        Prefix of the pickle file that stores the selected column names.
    datasetType : str, default 'train'
        ``'test'`` reuses the stored selection; anything else recomputes it.

    Returns
    -------
    pandas.DataFrame
        ``numericalDF`` restricted to the selected columns.
    """
    columnsPath = f"../Data/Interim/{combinedName}SignificantNumericalCols.pkl"
    if datasetType == 'test':
        # NOTE(review): pickle.load can execute arbitrary code; only read files
        # that this pipeline wrote itself.
        with open(columnsPath, 'rb') as columnsFile:
            significantColumns = pickle.load(columnsFile)
    else:
        significantColumns = []
        for col in numericalDF.columns:
            x = numericalDF[[col]].values.ravel()
            p = spearmanr(x,y)[1]
            # A NaN p-value (e.g. constant column) compares False and is dropped.
            if p < 0.05:
                significantColumns.append(str(col))
        with open(columnsPath, 'wb') as columnsFile:
            pickle.dump(significantColumns, columnsFile)
    return numericalDF[significantColumns]

In [None]:
def removeOutliers(df,baseName,threshold = 3):
    """Drop every row that has an out-of-range value in a selected numerical column.

    The column names are read from
    ``../Data/Interim/{baseName}SignificantNumericalCols.pkl``. A row is kept
    only if, in each of those columns, its value lies in
    ``[-threshold, threshold]``; rows with NaN in one of them are dropped
    because NaN fails both comparisons.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the listed columns. The bound is presumably meant for
        standardised values (z-scores) - confirm against the caller.
    baseName : str
        Prefix of the pickle file holding the column names.
    threshold : float, default 3
        Inclusive absolute bound applied to every listed column.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels; ``df`` itself when
        the stored column list is empty.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only read files
    # that this pipeline wrote itself.
    with open(f"../Data/Interim/{baseName}SignificantNumericalCols.pkl", 'rb') as columnsFile:
        numericalColList = pickle.load(columnsFile)
    if not numericalColList:
        return df
    checkedDF = df[numericalColList]
    # One boolean mask for all columns instead of re-filtering once per column.
    withinBounds = (checkedDF.ge(-threshold) & checkedDF.le(threshold)).all(axis=1)
    return df.loc[withinBounds]

In [None]:
def encodeTestDF(categoricalDF,baseName):
    """Encode categorical columns with the encoders pickled by ``encodeDF``.

    Loads ``../Data/Interim/{baseName}OneHotEncoder.pkl`` and
    ``../Data/Interim/{baseName}LabelEncoder.pkl``, applies both to
    ``categoricalDF`` and concatenates the two results column-wise.

    Parameters
    ----------
    categoricalDF : pandas.DataFrame
        Categorical columns to encode.
    baseName : str
        Prefix of the encoder pickle files.

    Returns
    -------
    pandas.DataFrame
        One-hot columns followed by the ordinal-encoded columns.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only read files
    # that this pipeline wrote itself.
    with open(f"../Data/Interim/{baseName}OneHotEncoder.pkl", 'rb') as encoderFile:
        ohe = pickle.load(encoderFile)
    with open(f"../Data/Interim/{baseName}LabelEncoder.pkl", 'rb') as encoderFile:
        le = pickle.load(encoderFile)
    # Missing one-hot cells become 0 - presumably these come from categories
    # that were not seen when the encoder was fitted; TODO confirm.
    oheDF = ohe.transform(categoricalDF).fillna(0)
    leDF = le.transform(categoricalDF)
    return pd.concat([oheDF,leDF],axis=1)

In [None]:
def encodeDF(categoricalDF,baseName):
    """Fit a one-hot and an ordinal encoder on ``categoricalDF`` and persist them.

    Three pickles are written under ``../Data/Interim/``:
    ``{baseName}OheColumns.pkl`` (list of one-hot column names),
    ``{baseName}OneHotEncoder.pkl`` and ``{baseName}LabelEncoder.pkl`` (the
    fitted encoders).

    Parameters
    ----------
    categoricalDF : pandas.DataFrame
        Categorical columns to encode.
    baseName : str
        Prefix of the pickle files.

    Returns
    -------
    pandas.DataFrame
        One-hot columns followed by the ordinal-encoded columns.
    """
    ohe = ce.OneHotEncoder(handle_unknown='ignore',return_df=True,use_cat_names=True)
    le = ce.OrdinalEncoder(return_df=True)
    oheDF = ohe.fit_transform(categoricalDF)
    leDF = le.fit_transform(categoricalDF)

    artifacts = (
        (list(oheDF.columns), f"../Data/Interim/{baseName}OheColumns.pkl"),
        (ohe, f"../Data/Interim/{baseName}OneHotEncoder.pkl"),
        (le, f"../Data/Interim/{baseName}LabelEncoder.pkl"),
    )
    for artifact, artifactPath in artifacts:
        # Close each file explicitly so the bytes are on disk before any later
        # step loads them again.
        with open(artifactPath, 'wb') as artifactFile:
            pickle.dump(artifact, artifactFile)
    return pd.concat([oheDF,leDF],axis=1)

In [None]:
def scaleTestDF(df,baseName):
    """Scale every column of ``df`` with the scaler pickled by ``scaleDF``.

    The scaler is read from ``../Data/Interim/{baseName}Scaler.pkl``. The
    caller's frame is left untouched; a scaled copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Numerical columns, expected to match what the scaler was fitted on.
    baseName : str
        Prefix of the scaler pickle file.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with all columns replaced by their scaled values.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only read files
    # that this pipeline wrote itself.
    with open(f"../Data/Interim/{baseName}Scaler.pkl", 'rb') as scalerFile:
        scaler = pickle.load(scalerFile)
    numericalCols = list(df.columns)
    # Work on a copy: ``df`` is often a column slice of a larger frame, and
    # writing into it would mutate caller state.
    scaledDF = df.copy()
    scaledDF[numericalCols] = scaler.transform(scaledDF[numericalCols])
    return scaledDF

In [None]:
def scaleDF(df,baseName):
    """Fit a ``StandardScaler`` on every column of ``df`` and persist it.

    The fitted scaler is pickled to ``../Data/Interim/{baseName}Scaler.pkl``.
    The caller's frame is left untouched; a scaled copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Numerical columns to standardise.
    baseName : str
        Prefix of the scaler pickle file.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with all columns replaced by their scaled values.
    """
    scaler = StandardScaler()
    numericalCols = list(df.columns)
    # Work on a copy: ``df`` is often a column slice of a larger frame, and
    # writing into it would mutate caller state.
    scaledDF = df.copy()
    scaledDF[numericalCols] = scaler.fit_transform(scaledDF[numericalCols])
    # Close the file explicitly so the scaler is on disk before it is reloaded.
    with open(f"../Data/Interim/{baseName}Scaler.pkl", 'wb') as scalerFile:
        pickle.dump(scaler, scalerFile)
    return scaledDF

In [None]:
def getCategoricalColumns(df):
    """Return the positional indices of the non-numerical columns of ``df``.

    A column counts as numerical when pandas reports a numeric dtype that is
    not boolean; everything else (object, string, category, bool, ...) is
    reported as categorical. The pandas dtype helpers are used because they
    accept pandas extension dtypes as well as NumPy dtypes.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list of int
        Zero-based column positions, in column order.
    """
    isNumeric = pd.api.types.is_numeric_dtype
    isBool = pd.api.types.is_bool_dtype
    return [
        position
        for position, dtype in enumerate(df.dtypes)
        if isBool(dtype) or not isNumeric(dtype)
    ]

In [None]:
def separateDFBySubtype(df,baseName):
    """Split ``df`` into its numerical and its categorical columns.

    A column counts as numerical when pandas reports a numeric dtype that is
    not boolean; every other column is categorical. The two lists of column
    names (as ``str``) are pickled to
    ``../Data/Interim/{baseName}NumericalCols.pkl`` and
    ``../Data/Interim/{baseName}CategoricalCols.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
    baseName : str
        Prefix of the two pickle files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(numericalDF, categoricalDF)``.
    """
    numericalCols = []
    categoricalCols = []
    for col in df.columns:
        dtype = df[col].dtype
        # pandas' helpers also accept extension dtypes (category, string,
        # nullable ints), which NumPy's dtype checks cannot interpret.
        isNumerical = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
        if isNumerical:
            numericalCols.append(str(col))
        else:
            categoricalCols.append(str(col))
    numericalDF = df[numericalCols]
    categoricalDF = df[categoricalCols]
    with open(f"../Data/Interim/{baseName}NumericalCols.pkl", 'wb') as columnsFile:
        pickle.dump(numericalCols, columnsFile)
    with open(f"../Data/Interim/{baseName}CategoricalCols.pkl", 'wb') as columnsFile:
        pickle.dump(categoricalCols, columnsFile)
    return numericalDF,categoricalDF

In [None]:
def balanceDataset(X,y,balanceType):
    """Resample ``X`` and ``y`` with the strategy named by ``balanceType``.

    Strategies:

    * ``"Under"``    - ``RandomUnderSampler`` (with replacement, seed 51)
    * ``"Over"``     - ``RandomOverSampler`` (seed 51)
    * ``"NearMiss"`` - ``NearMiss`` fitted on a one-hot encoded copy of ``X``;
      the resampled rows are decoded back before being returned
    * anything else  - ``SMOTENC`` (seed 51), told which column positions are
      categorical

    A column counts as categorical when its dtype is boolean or not numeric
    according to pandas. The pandas dtype helpers are used because they accept
    pandas extension dtypes as well as NumPy dtypes.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : array-like
        Target values aligned with ``X``.
    balanceType : str
        Name of the resampling strategy (see above).

    Returns
    -------
    tuple
        ``(resampledX, resampledy)``.
    """
    def isCategorical(dtype):
        return pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype)

    if balanceType == "NearMiss":
        categoricalVariables = [col for col in X.columns if isCategorical(X[col].dtype)]
        encoder = ce.OneHotEncoder(cols=categoricalVariables)
        X_encoded = encoder.fit_transform(X)
        resampledX,resampledy = NearMiss().fit_resample(X_encoded, y)
        return encoder.inverse_transform(resampledX),resampledy

    if balanceType == "Under":
        balancer = RandomUnderSampler(random_state=51, replacement=True)
    elif balanceType == "Over":
        balancer = RandomOverSampler(random_state=51)
    else: #default is SMOTE
        categoricalVariables = [i for i,col in enumerate(X.columns) if isCategorical(X[col].dtype)]
        balancer = SMOTENC(categorical_features=categoricalVariables,random_state=51)
    return balancer.fit_resample(X, y)

In [None]:
def processTestData(baseName):
    """Build one processed test CSV per balancing strategy.

    Reads ``../Data/Interim/{baseName}Test.csv`` once. For each of ``"Under"``,
    ``"Over"``, ``"Smote"`` and ``"NearMiss"`` it applies the column selection,
    scaler and encoders stored under ``baseName + balanceType``, appends the
    target as column ``y`` and writes
    ``../Data/Processed/{baseName}{balanceType}Test.csv``.

    Parameters
    ----------
    baseName : str
        Dataset prefix shared by the interim and processed files.
    """
    originalTestDF = pd.read_csv(f"../Data/Interim/{baseName}Test.csv")
    # Target and features are the same for every strategy, so split them once.
    yArray = originalTestDF["y"].values.ravel()
    featuresDF = originalTestDF.drop("y",axis = 1)

    for balanceType in ["Under","Over","Smote","NearMiss"]:
        combinedName = baseName + balanceType
        # In "test" mode the selectors only pick the stored column names, so
        # the full feature frame can be handed to both of them.
        numericalDF = removeUnimportantNumericalColumns(featuresDF,yArray,combinedName,"test")
        categoricalDF = removeUnimportantCategoricalColumns(featuresDF,yArray,combinedName,"test")
        scaledDF = scaleTestDF(numericalDF,combinedName)
        encodedDF = encodeTestDF(categoricalDF,combinedName)
        finalDF = pd.concat([scaledDF,encodedDF],axis=1)
        finalDF['y'] = yArray
        finalDF.to_csv(f"../Data/Processed/{combinedName}Test.csv",index=False)
        

In [None]:
def processTrainData(baseName):
    """Build one processed training CSV per balancing strategy.

    Reads ``../Data/Interim/{baseName}Train.csv`` and, for each of ``"Under"``,
    ``"Over"``, ``"Smote"`` and ``"NearMiss"``: balances the data, splits it
    into numerical and categorical columns, keeps the significant columns,
    scales and encodes them, re-attaches the target as ``y``, removes outlier
    rows and writes ``../Data/Processed/{baseName}{balanceType}Train.csv``.

    Parameters
    ----------
    baseName : str
        Dataset prefix shared by the interim and processed files.
    """
    trainDF = pd.read_csv(f"../Data/Interim/{baseName}Train.csv")
    targetArray = trainDF[["y"]].values.ravel()
    featureDF = trainDF.drop("y",axis = 1)

    for strategy in ("Under","Over","Smote","NearMiss"):
        combinedName = baseName + strategy
        balancedX,balancedY = balanceDataset(featureDF,targetArray,strategy)

        # Column-type lists are stored under the plain base name, while every
        # other artifact is stored per strategy.
        numericPart,categoricPart = separateDFBySubtype(balancedX,baseName)
        numericPart = removeUnimportantNumericalColumns(numericPart,balancedY,combinedName)
        categoricPart = removeUnimportantCategoricalColumns(categoricPart,balancedY,combinedName)

        scaledPart = scaleDF(numericPart,combinedName)
        encodedPart = encodeDF(categoricPart,combinedName)

        assembledDF = pd.concat([scaledPart,encodedPart],axis=1)
        assembledDF['y'] = balancedY.reshape(-1,1)

        # Outliers are removed last, so the target rows are filtered with the features.
        removeOutliers(assembledDF,combinedName).to_csv(
            f"../Data/Processed/{combinedName}Train.csv",index=False
        )

In [None]:
def binarizeTargets(fileName):
    """Rewrite the CSV at ``fileName`` with the ``y`` column coded as 1/0.

    ``'yes'`` becomes 1 and ``'no'`` becomes 0; any other value is left as it
    is, so running the function again on an already converted file changes
    nothing.

    Parameters
    ----------
    fileName : str
        Path of the CSV file, which is read and then overwritten in place.
    """
    targetCodes = {'yes': 1, 'no': 0}
    df = pd.read_csv(fileName)
    # Build a fresh column in one pass instead of writing ints into the text
    # column through masks, which leaves a mixed-type column behind.
    df["y"] = df["y"].map(lambda label: targetCodes.get(label, label))
    df.to_csv(fileName,index=False)

In [None]:
def splitData(baseName):
    """Split the raw term-deposit CSV into interim train and test CSV files.

    Reads ``../Data/Raw/term-deposit-marketing-2020.csv`` (first column used as
    the index), performs an 80/20 ``train_test_split`` with ``random_state=51``
    and writes each part, with ``y`` as the last column, to
    ``../Data/Interim/{baseName}Train.csv`` and
    ``../Data/Interim/{baseName}Test.csv`` without the index.

    Parameters
    ----------
    baseName : str
        Prefix of the two output files.
    """
    rawDF = pd.read_csv("../Data/Raw/term-deposit-marketing-2020.csv",index_col=[0])
    targets = rawDF[["y"]]
    features = rawDF.drop("y",axis=1)
    trainX,testX,trainY,testY = train_test_split(features, targets, test_size=0.2,random_state=51)

    outputs = (
        (trainX, trainY, f'../Data/Interim/{baseName}Train.csv'),
        (testX, testY, f'../Data/Interim/{baseName}Test.csv'),
    )
    for featurePart,targetPart,outputPath in outputs:
        # Copy first so the target is not written into the split's own frame.
        combined = featurePart.copy()
        combined['y'] = targetPart
        combined.to_csv(outputPath,index=False)

In [None]:
def main():
    """Run the preprocessing pipeline for the ``TermDeposit`` dataset.

    Seeds NumPy's global generator with 51. If the interim training CSV does
    not exist yet, the raw data is split and both interim files get their
    targets binarized. The train data is then processed, followed by the test
    data.
    """
    np.random.seed(51)
    baseName = "TermDeposit"
    trainCsv = f"../Data/Interim/{baseName}Train.csv"
    testCsv = f"../Data/Interim/{baseName}Test.csv"

    # Only the training file is checked before (re)creating the interim split.
    if not exists(trainCsv):
        splitData(baseName)
        for interimCsv in (trainCsv, testCsv):
            binarizeTargets(interimCsv)

    # Train first: it writes the artifacts that the test step reads back.
    processTrainData(baseName)
    processTestData(baseName)

In [None]:
# Run the whole pipeline; every warning raised while main() executes is
# ignored, and the previous warning filters are restored when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    main()