# Regression/Classifier of Loan Amount

In [None]:
## Packages to import
from itertools import *
import numpy as np
import pandas as pd
import sys
## Install packages if need be
#!{sys.executable} -m pip install category_encoders
#!{sys.executable} -m pip install scikit-learn
#!{sys.executable} -m pip install fancyimpute
#!{sys.executable} -m pip install xgboost

#Encoders and Imputers
from category_encoders import TargetEncoder, OneHotEncoder, HashingEncoder, BinaryEncoder, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

#Scaling
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

#Plotting and visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as grid_spec
from matplotlib.ticker import FuncFormatter
import seaborn as sns

# sklearn and models
from sklearn.model_selection import train_test_split,cross_val_score,RepeatedKFold, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from xgboost import XGBRegressor

## Load Data

In [None]:
# ## Load Data
# df = pd.read_csv('.csv')
# dg = pd.read_csv('loan_amount.csv')

### See datatypes and any missing values

In [None]:
## General information about the data
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
# NOTE(review): `df` is only assigned in the commented-out load cell above.
df.info()
print(df.dtypes)
print(df.country.unique())
print(df.describe())

### Split into train/val/test sets

In [None]:
### Two-stage split: first hold out 20% of the rows as the test set, then hold
### out 20% of what remains as a validation set (used to compare models).
X_train, X_test, y_train, y_test = train_test_split(
    df,
    dg,
    test_size=0.20,
    random_state=42,
)
X_tt, X_val, y_tt, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=32,
)

## Check for correlation between variables

In [None]:
# Correlate numeric/boolean columns only. Since pandas 2.0 DataFrame.corr() no
# longer drops non-numeric columns silently and raises on string-valued ones
# (the commented-out encode() below suggests columns such as 'state' and
# 'gender' are categorical — confirm).
print(
    pd.concat([X_tt, y_tt], axis=1)
    .select_dtypes(include=['number', 'bool'])
    .corr()
)

### Encoders

In [None]:
## ENCODERS
# Too many collisions for n_components=8
#def hashing_encoding(df_feature):
#    '''For state data'''
#    df = df_feature.copy()
#    he=HashingEncoder(n_components=8)
#    return he.fit_transform(df)


def binary_encoding(df_feature):
    """Binary-encode a feature with a freshly fitted ``BinaryEncoder``.

    The encoder is built with ``handle_missing='return_nan'`` and
    ``return_df=True`` and is fitted on (and applied to) a copy of the input,
    so the caller's object is not handed to the encoder directly.

    Parameters
    ----------
    df_feature : pandas object
        The feature to encode (presumably a single categorical column —
        confirm against callers).

    Returns
    -------
    The result of ``BinaryEncoder.fit_transform`` on the copied input.
    """
    encoder = BinaryEncoder(handle_missing='return_nan', return_df=True)
    return encoder.fit_transform(df_feature.copy())

def ordinal_encoding(df_feature):
    """Ordinal-encode a named feature and label the result ``<name>_enc``.

    A new ``OrdinalEncoder`` (``handle_missing='return_nan'``,
    ``return_df=True``) is fitted on a copy of the input.

    Parameters
    ----------
    df_feature : pandas object with a ``.name`` attribute
        The feature to encode; ``.name`` is used to build the output column
        label, so a named Series is expected.

    Returns
    -------
    The encoder output with its columns replaced by the single label
    ``df_feature.name + '_enc'``.
    """
    encoder = OrdinalEncoder(handle_missing='return_nan', return_df=True)
    encoded = encoder.fit_transform(df_feature.copy())
    # Assigning a one-element list assumes the encoder produced exactly one column.
    encoded.columns = [df_feature.name + '_enc']
    return encoded

def one_hot_encoding(df_feature):
    """One-hot encode a feature with a freshly fitted ``OneHotEncoder``.

    The encoder is built with ``handle_unknown='return_nan'``,
    ``return_df=True`` and ``use_cat_names=True`` and is fitted on a copy of
    the input.

    NOTE(review): the sibling encoders in this file pass ``handle_missing``
    rather than ``handle_unknown`` — confirm which option is intended here.

    Parameters
    ----------
    df_feature : pandas object
        The feature to encode.

    Returns
    -------
    The result of ``OneHotEncoder.fit_transform`` on the copied input.
    """
    encoder = OneHotEncoder(
        handle_unknown='return_nan',
        return_df=True,
        use_cat_names=True,
    )
    return encoder.fit_transform(df_feature.copy())



# def encode(X):
#     df = X.copy()
#     enc = [df,binary_encoding(df['state']),ordinal_encoding(df['is_married']),one_hot_encoding(df['gender']),one_hot_encoding(df['promo_group_1']),one_hot_encoding(df['promo_group_2'])]
#     df1 = pd.concat(enc,axis=1)
#     X_enc = df1.drop(columns=['state','gender','is_married','promo_group_1','promo_group_2'])
#     return X_enc

# X_tt_enc = encode(X_tt)
# print(X_tt_enc.columns)
# X_tt_enc.dtypes

## Imputers

In [None]:
## Impute missing data
def iter_impute(X):
    """Fill missing values in ``X`` using ``IterativeImputer(BayesianRidge())``.

    The imputer is fitted on ``X`` and applied to the same data in one step.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to impute. Its columns are passed to the imputer unchanged
        (assumed to be all numeric at this point — confirm).

    Returns
    -------
    pandas.DataFrame
        The imputed values with the same column labels and the same row index
        as ``X``.
    """
    imputer = IterativeImputer(BayesianRidge())
    # fit_transform returns a bare ndarray. Restore the row index as well as the
    # column labels; otherwise the result gets a fresh 0..n-1 index and its rows
    # no longer line up with the target (or any other frame) by label.
    return pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)


# Impute the encoded training split, then print shape and column labels of the
# imputed and the encoded frames so they can be compared by eye.
# NOTE(review): `X_tt_enc` is only assigned in the commented-out encode cell
# above; that cell has to be restored before this one can run.
X_tt_imp = iter_impute(X_tt_enc)
print(X_tt_imp.shape)
print(X_tt_imp.columns)
print(X_tt_enc.shape)
print(X_tt_enc.columns)



### Discretize Variables

In [None]:
def discretize(df,col,bins = None,labels = None,*args):
    """Bin ``df[col]`` with ``pandas.cut`` and store the result in ``df[col + '_bin']``.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to discretise; receives the new column.
    col : str
        Name of the numeric column to bin.
    bins : int or sequence of scalars, optional
        Passed to ``pandas.cut``. When omitted, equal-width bins spanning the
        column's range are used: one per label if ``labels`` is given,
        otherwise 10.
    labels : sequence or False, optional
        Passed to ``pandas.cut``. When omitted, the bins are labelled
        ``0 .. n_bins - 1``.
    *args
        Accepted for backward compatibility and ignored.
    """
    if bins is None:
        # An integer bin count makes pandas build equal-width bins that cover
        # both the minimum and the maximum of the column.
        bins = 10 if labels is None or labels is False else len(labels)
    if labels is None:
        # pandas.cut needs exactly one label per interval: `bins` intervals for
        # an integer count, one fewer than the number of edges otherwise.
        if isinstance(bins, (int, np.integer)):
            n_intervals = int(bins)
        else:
            n_intervals = len(bins) - 1
        labels = list(range(n_intervals))
    df[col+'_bin'] = pd.cut(x = df[col],bins = bins,labels = labels)
    return

### Normalize Distribution

In [None]:
def log_normalize(df,col):
    """Replace ``df[col]`` in place with ``log(1 + x)`` of its values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    col : str
        Name of the numeric column to transform.

    Returns nothing; the transformed values overwrite ``df[col]``.
    """
    # log1p is vectorised over the whole column and, unlike log(x + 1), keeps
    # precision when x is very close to zero.
    df[col] = np.log1p(df[col])
    return

### Boxplot to get an idea of the outliers in a single column
# NOTE(review): `col` is not assigned at module level anywhere in the visible
# notebook (it is only a parameter name of the helpers above); set it to the
# column of interest before running this cell.
sns.set_theme(style="whitegrid")
ax = sns.boxplot(x=df[col]) # inspect how skewed the column still is afterwards

### Trim or Drop Outliers

In [None]:
## Trimming Outliers
def trim_outliers(X_tt_imp):
    """Return a copy of ``X_tt_imp`` with four columns clipped at quantile bounds.

    All bounds are quantiles of the input frame itself:

    * ``income`` and ``age``: clipped to the [5%, 95%] quantile range.
    * ``brand_awareness_index``: clipped from above at the 95% quantile.
    * ``customer_loyalty_index``: clipped to ``[-q95, +q95]`` where ``q95`` is
      its 95% quantile.

    NOTE(review): the lower bound for ``customer_loyalty_index`` is the negated
    95% quantile rather than the 5% quantile — confirm the symmetry is intended.

    The input frame is not modified.
    """
    trimmed = X_tt_imp.copy()

    # Two-sided clipping at the 5% / 95% quantiles.
    for name in ('income', 'age'):
        reference = X_tt_imp[name]
        trimmed[name] = trimmed[name].clip(
            lower=reference.quantile(.05),
            upper=reference.quantile(.95),
        )

    # One-sided clipping: only unusually large values are pulled in.
    awareness_cap = X_tt_imp['brand_awareness_index'].quantile(.95)
    trimmed['brand_awareness_index'] = trimmed['brand_awareness_index'].clip(upper=awareness_cap)

    # Clipping to +/- the 95% quantile.
    loyalty_cap = X_tt_imp['customer_loyalty_index'].quantile(.95)
    trimmed['customer_loyalty_index'] = trimmed['customer_loyalty_index'].clip(
        lower=-loyalty_cap,
        upper=loyalty_cap,
    )
    return trimmed

## Dropping outliers (rows outside quantile cut-offs of the input frame)
def drop_outliers(X_tt_imp,y_tt):
    """Remove outlier rows from features and target together.

    The features are re-indexed by ``customer_id`` and joined column-wise with
    ``y_tt`` (aligned on index). A row is dropped when any of these holds,
    with every cut-off taken from ``X_tt_imp``:

    * ``income`` above the 98% or below the 2% quantile,
    * ``customer_loyalty_index`` above the 98% or below the 2% quantile,
    * ``brand_awareness_index`` above the 95% quantile.

    Parameters
    ----------
    X_tt_imp : pandas.DataFrame
        Features; must contain ``customer_id`` and the three columns above.
    y_tt : pandas object
        Target; after the join it must appear as a column named ``tov_6mos``
        (presumably indexed by customer id — verify against the caller).

    Returns
    -------
    (Xd, yd)
        The surviving feature rows (without ``tov_6mos``) and the matching
        ``tov_6mos`` values.
    """
    features = X_tt_imp.copy().set_index('customer_id')
    target = y_tt.copy()
    joined = pd.concat([features, target], axis=1)

    income_low = X_tt_imp['income'].quantile(.02)
    income_high = X_tt_imp['income'].quantile(.98)
    loyalty_low = X_tt_imp['customer_loyalty_index'].quantile(.02)
    loyalty_high = X_tt_imp['customer_loyalty_index'].quantile(.98)
    awareness_high = X_tt_imp['brand_awareness_index'].quantile(.95)

    is_outlier = (
        (joined['income'] > income_high)
        | (joined['income'] < income_low)
        | (joined['customer_loyalty_index'] > loyalty_high)
        | (joined['customer_loyalty_index'] < loyalty_low)
        | (joined['brand_awareness_index'] > awareness_high)
    )
    # Drop by index label, exactly as many rows as carry a flagged label.
    kept = joined.drop(joined[is_outlier].index)
    return kept.drop(columns=['tov_6mos']), kept.tov_6mos

## Training data with outliers removed
# Xd / yd are read as module-level globals by model_hyperparam_tune() below.
Xd,yd = drop_outliers(X_tt_imp,y_tt)
# Print both shapes to check that features and target kept the same rows.
print(Xd.shape)
print(yd.shape)


# Scaling

In [None]:
## MinMaxScaling not necessary for tree based methods as it is monotonic transformation
def scaling(X_tt_imp):
    """Return a copy of ``X_tt_imp`` with four columns rescaled by ``MinMaxScaler``.

    ``income``, ``age``, ``customer_loyalty_index`` and
    ``brand_awareness_index`` are each fitted and transformed independently
    with the scaler's default settings; every other column is left as-is.
    The scaler is local to the call and is not returned.
    """
    scaler = MinMaxScaler()
    scaled = X_tt_imp.copy()
    for name in ('income', 'age', 'customer_loyalty_index', 'brand_awareness_index'):
        # The scaler expects a 2-D input, hence the single-column reshape;
        # it is refitted from scratch for every column.
        as_column = np.array(scaled[name]).reshape(-1, 1)
        scaled[name] = scaler.fit_transform(as_column)
    return scaled

# Prepare Validation Set

In [None]:
def encode_impute(X_train1,X_test1):
    """Encode and impute a held-out frame together with the training frame.

    Both inputs are copied, tagged with a ``train`` flag (1 for training rows,
    0 for held-out rows) and stacked. The stacked frame goes through
    ``encode`` and then ``iter_impute``; only the held-out rows are returned,
    with the flag column removed.

    NOTE(review): ``encode`` appears only as commented-out code in the visible
    part of this notebook; it must be defined before this function is called.

    Returns
    -------
    (X_test_enc, X_test_imp)
        The held-out rows after encoding, and after encoding plus imputation.
    """
    train_part = X_train1.copy()
    held_out_part = X_test1.copy()
    train_part['train'] = 1
    held_out_part['train'] = 0

    stacked = pd.concat([train_part, held_out_part])
    stacked_enc = encode(stacked)
    stacked_imp = iter_impute(stacked_enc)

    def _held_out_rows(frame):
        # Keep the rows flagged as held-out and discard the helper flag.
        return frame[frame['train'] == 0].drop(columns=['train'])

    return _held_out_rows(stacked_enc), _held_out_rows(stacked_imp)

# Hyperparameter Tuning

In [None]:
def model_hyperparam_tune(max_dep,alph,lamb):
    """Fit one XGBoost regressor for a hyper-parameter triple and report validation error.

    Trains on the module-level ``Xd`` / ``yd`` and evaluates on the
    module-level ``X_val_imp`` / ``y_val``; MAE and RMSE on the validation set
    are printed.

    NOTE(review): ``X_val_imp`` is not assigned anywhere in the visible part
    of this notebook — presumably it comes from ``encode_impute``; confirm.

    Parameters
    ----------
    max_dep : int
        ``max_depth`` of the trees.
    alph : float
        ``reg_alpha`` passed to the regressor.
    lamb : float
        ``reg_lambda`` passed to the regressor.

    Returns
    -------
    (y_pred, y_tt_pred)
        Predictions for the validation set and for the training set.
    """
    # eval_metric belongs on the estimator: passing it to fit() is deprecated
    # in xgboost 1.6+ and rejected by later releases.
    xgb_model = XGBRegressor(
        n_estimators=200,
        max_depth=max_dep,
        min_child_weight=5,
        gamma=0,
        eta=0.1,
        subsample=.75,
        colsample_bytree=0.8,
        reg_alpha=alph,
        reg_lambda=lamb,
        eval_metric='rmse',
    )
    xgb_model.fit(Xd, yd)
    y_pred = xgb_model.predict(X_val_imp)
    y_tt_pred = xgb_model.predict(Xd)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    print("MAE %f and RMSE %f" % (mae,rmse))
    return y_pred,y_tt_pred

# Candidate values for the grid: tree depth, reg_alpha and reg_lambda.
max_dep = (5,8)
alph = (.05,.1,.2)
lamb = (.05,.1,.2)
# Try every combination; each call prints its own MAE/RMSE, so the best triple
# is picked by reading the output. y_pred / y_tt_pred are overwritten on every
# iteration and end up holding the predictions of the last combination only.
for z in product(max_dep,alph,lamb):
    print(z)
    y_pred,y_tt_pred= model_hyperparam_tune(*z)


# Train Model on whole training set

In [None]:
# Final model using (max_depth, reg_alpha, reg_lambda) = (8, 0.2, 0.2)
# eval_metric is set on the estimator: passing it to fit() is deprecated in
# xgboost 1.6+ and rejected by later releases.
# NOTE(review): X_train_d, y_train_d and X_test_imp are not assigned in the
# visible part of this notebook — confirm they are prepared beforehand.
xgb_model = XGBRegressor(
    n_estimators=200,
    max_depth=8,
    min_child_weight=5,
    gamma=0,
    eta=0.1,
    subsample=.75,
    colsample_bytree=0.8,
    reg_alpha=.2,
    reg_lambda=.2,
    eval_metric='rmse',
)
xgb_model.fit(X_train_d, y_train_d)
y_pred = xgb_model.predict(X_test_imp)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test,y_pred)
print("MAE %f and RMSE %f" % (mae,rmse))


# Feature Importance

In [None]:
### Feature Importance
# rcParams only affect figures created afterwards, so the size has to be set
# before plot_importance() opens its figure.
plt.rcParams['figure.figsize'] = [9, 9]
xgb.plot_importance(xgb_model)
plt.show()
