## Import libraries

In [None]:
import numpy as np
import pandas as pd
import datetime
import random
import pickle

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor


In [None]:
from scipy.stats import skew, norm

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

pd.set_option('display.max_columns', None)

In [None]:
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000

## Read dataset into dataframe

In [None]:
df_train = pd.read_csv('./dataset/train.csv')
df_test  = pd.read_csv('./dataset/test_9K3DBWQ.csv')
df_train.shape, df_test.shape

In [None]:
df_train.columns

In [None]:
df_train.sample(10)

In [None]:
df_train.dtypes

In [None]:
df_test.sample(5)

In [None]:
df_dict = pd.read_excel('./dataset/Data_Dictionary.xlsx')
df_dict


In [None]:
# Plot the distribution of the raw (untransformed) target cc_cons
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(df_train['cc_cons'], color="b", ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Frequency", xlabel="cc_cons", title="cc_cons distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Report the asymmetry (skewness) and tail weight (kurtosis) of the target
target_skew = df_train['cc_cons'].skew()
target_kurt = df_train['cc_cons'].kurt()
print("Skewness: {:f}".format(target_skew))
print("Kurtosis: {:f}".format(target_kurt))

In [None]:
# id , account_type , gender,age , region_code , loan_enq
df_train.columns

In [None]:
# Collect the numeric columns, leaving out the identifier and the
# categorical-like fields listed below.
excluded = {'id', 'account_type', 'gender', 'age', 'region_code', 'loan_enq'}
numeric_dtypes = ['int64', 'float64']
numeric = [
    col for col in df_train.columns
    if df_train[col].dtype in numeric_dtypes and col not in excluded
]

# Scatter every numeric feature against the target to eyeball outliers.
# A plain figure is created here: plt.subplots(nrows=0, ...) is rejected by
# current Matplotlib, and the axes are created one by one in the loop anyway.
fig = plt.figure(figsize=(12, 120))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, feature in enumerate(numeric, 1):
    # One grid row per feature, three columns; cells are filled row-wise.
    plt.subplot(len(numeric), 3, i)
    sns.scatterplot(x=feature, y='cc_cons', hue='cc_cons', palette='Blues', data=df_train)

    plt.xlabel('{}'.format(feature), size=15, labelpad=12.5)
    plt.ylabel('cc_cons', size=15, labelpad=12.5)

    plt.tick_params(axis='x', labelsize=12)
    plt.tick_params(axis='y', labelsize=12)

    plt.legend(loc='best', prop={'size': 10})

plt.show()

In [None]:
# Correlation heatmap. Restrict to numeric/boolean columns explicitly:
# recent pandas no longer drops string columns silently in DataFrame.corr()
# and raises on them instead.
corr = df_train.select_dtypes(include=['number', 'bool']).corr()
plt.subplots(figsize=(15,12))
sns.heatmap(corr, vmax=0.9, cmap="Blues", square=True)

In [None]:
df_train.columns

In [None]:
# Scatter the target against April credit-card spend (y axis capped at 200000)
data = df_train[['cc_cons', 'cc_cons_apr']]
data.plot.scatter(x='cc_cons_apr', y='cc_cons', alpha=0.3, ylim=(0,200000));

In [None]:
# The id column is unique per row and carries no signal for the model:
# take it out of both frames (in place) while keeping a handle on it.
train_ID = df_train.pop('id')
test_ID = df_test.pop('id')
df_train.shape, df_test.shape

In [None]:
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

## Feature engineering

In [None]:
# Distribution of cc_cons before outlier removal and the log transform below
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(df_train['cc_cons'], color="b", ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Frequency", xlabel="cc_cons", title="cc_cons distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
df_train = df_train_copy.copy()

In [None]:
# Drop rows whose target lies outside [50, 250000], then renumber the index
target = df_train['cc_cons']
out_of_range = (target > 250000) | (target < 50)
df_train.drop(df_train[out_of_range].index, inplace=True)

df_train.reset_index(drop=True, inplace=True)

In [None]:
df_train["cc_cons"] = np.log1p(df_train["cc_cons"])

In [None]:
# Plot the log-transformed target together with a fitted normal curve
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(df_train['cc_cons'] , fit=norm, color="b");

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(df_train['cc_cons'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Legend text uses mathtext; a raw string keeps the backslashes in \mu and
# \sigma literal instead of relying on invalid escape sequences.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
ax.xaxis.grid(False)
ax.set(ylabel="Frequency")
ax.set(xlabel="cc_cons")
ax.set(title="cc_cons distribution")
sns.despine(trim=True, left=True)

plt.show()

In [None]:
# Separate the target from the predictors
train_labels = df_train['cc_cons'].reset_index(drop=True)
train_features = df_train.drop(columns=['cc_cons'])
test_features = df_test

# Stack train and test predictors so every transformation below is applied
# to both at once; the combined frame gets a fresh 0..n-1 index.
all_features = pd.concat([train_features, test_features], ignore_index=True)
all_features.shape

## Fill missing values

In [None]:
def percent_missing(df):
    """Return the percentage of missing values per column.

    Args:
        df: A DataFrame, or anything ``pd.DataFrame`` can be built from.

    Returns:
        dict mapping each column label to the share of null entries in that
        column, expressed in percent and rounded to two decimals.
    """
    frame = pd.DataFrame(df)
    return {
        column: round(frame[column].isnull().mean() * 100, 2)
        for column in frame.columns
    }

missing = percent_missing(all_features)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Percent of missing data')
df_miss[0:42]

In [None]:
# Bar chart of the percentage of missing values per training column
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
sns.set_color_codes(palette='deep')
missing = (df_train.isnull().mean() * 100).round(2)
# Keep only columns that actually have gaps, smallest share first
missing = missing[missing > 0].sort_values()
missing.plot.bar(color="b")
# Tweak the visual presentation
ax.xaxis.grid(False)
ax.set(ylabel="Percent of missing values",
       xlabel="Features",
       title="Percent missing data by feature")
sns.despine(trim=True, left=True)

In [None]:
all_features.columns

In [None]:
# Missing loan indicators are filled with 0.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# all_features[col] acts on a temporary object under pandas Copy-on-Write
# and would leave all_features unchanged.
loan_flag_cols = [
    col for col in ('personal_loan_active', 'vehicle_loan_active',
                    'personal_loan_closed', 'vehicle_loan_closed')
    if col in all_features.columns  # columns absent from the frame are skipped
]
all_features[loan_flag_cols] = all_features[loan_flag_cols].fillna(0)
            

In [None]:
# loan_enq: map 'Y' to 1 and fill missing values with 0.
# Assigned back explicitly because inplace replace/fillna on a column
# selection does not reach all_features under pandas Copy-on-Write.
all_features['loan_enq'] = all_features['loan_enq'].replace('Y', 1).fillna(0)

In [None]:
# Card limits below 10000 are blanked out and then, together with the
# originally missing ones, replaced by the median limit of the same age.
# .loc writes into all_features itself; the chained form
# all_features['card_lim'][mask] = ... may write to a temporary copy.
all_features.loc[all_features['card_lim'] < 10000, 'card_lim'] = np.nan
all_features['card_lim'] = all_features.groupby('age')['card_lim'].transform(lambda x: x.fillna(x.median()))


In [None]:
# Missing investment amounts are filled with 0 (assigned back so the fill
# reaches all_features itself rather than a temporary column object).
investment_cols = ['investment_1', 'investment_2', 'investment_3', 'investment_4']
all_features[investment_cols] = all_features[investment_cols].fillna(0)

# Aggregate feature: sum of the four investment columns
all_features['total_investment']=all_features['investment_1']+all_features['investment_2']+all_features['investment_3']+all_features['investment_4']
# all_features.drop(['investment_1','investment_2','investment_3','investment_4'], axis=1, inplace=True)

In [None]:
# April and May debit-card amount/count: missing values are filled with 0.
# Assigned back as a block; inplace fillna on a single selected column is a
# no-op on the parent frame under pandas Copy-on-Write.
dc_cols = ['dc_cons_apr', 'dc_count_apr', 'dc_cons_may', 'dc_count_may']
all_features[dc_cols] = all_features[dc_cols].fillna(0)

In [None]:
# Where both June debit-card fields are missing at once, set both to 0;
# rows with only one of the two missing are left for a later step.
both_missing = all_features['dc_cons_jun'].isnull() & all_features['dc_count_jun'].isnull()
all_features.loc[both_missing, ['dc_cons_jun', 'dc_count_jun']] = 0


In [None]:
# ('dc_cons_jun', 17.73),
all_features['dc_cons_jun'] = all_features.groupby('age')['dc_cons_jun'].transform(lambda x: x.fillna(x.median()))


In [None]:
# Percent missing noted for these columns:
#   debit_amount_may 7.04 / debit_count_may 6.33
#   debit_amount_jun 6.7  / debit_count_jun 4.66
#   debit_amount_apr 5.84 / debit_count_apr 5.43
# For each month, rows where amount and count are both missing get 0 in both.
for month in ('may', 'jun', 'apr'):
    amount_col = 'debit_amount_' + month
    count_col = 'debit_count_' + month
    both_missing = all_features[amount_col].isnull() & all_features[count_col].isnull()
    all_features.loc[both_missing, [amount_col, count_col]] = 0


In [None]:
# Remaining gaps noted for these columns (percent missing):
#   debit_amount_jun 2.04, debit_amount_may 0.71, debit_amount_apr 0.41
# Fill them with the median of the same column within each age group.
for amount_col in ('debit_amount_jun', 'debit_amount_may', 'debit_amount_apr'):
    by_age = all_features.groupby('age')[amount_col]
    all_features[amount_col] = by_age.transform(lambda x: x.fillna(x.median()))


In [None]:
# Percent missing noted for these columns:
#   credit_amount_jun / max_credit_amount_jun  4.7
#   credit_amount_may / max_credit_amount_may 10.45
#   credit_amount_apr / max_credit_amount_apr 10.17
# Missing credit amounts are filled with 0. The result is assigned back
# because inplace fillna on a selected column does not update the parent
# frame under pandas Copy-on-Write.
credit_cols = [
    'credit_amount_may', 'max_credit_amount_may',
    'credit_amount_apr', 'max_credit_amount_apr',
    'credit_amount_jun', 'max_credit_amount_jun',
]
all_features[credit_cols] = all_features[credit_cols].fillna(0)


In [None]:
    
# ('cc_count_may', 3.62),
# ('cc_count_apr', 7.33),
# ('cc_count_jun', 4.74),

for col in ['cc_count_may','cc_count_apr','cc_count_jun']:
    all_features[col] = all_features.groupby(['age','card_lim'])[col].transform(lambda x: x.fillna(x.median()))


In [None]:
# ('cc_count_apr', 0.38),
# ('cc_count_jun', 0.22),
# ('cc_count_may', 0.21),

for col in ['cc_count_may','cc_count_apr','cc_count_jun']:
    all_features[col] = all_features.groupby(['age'])[col].transform(lambda x: x.fillna(x.median()))


In [None]:
# ('credit_count_may', 6.33),
# ('credit_count_apr', 5.43),
# ('credit_count_jun', 4.66),

for col in ['credit_count_may','credit_count_apr', 'credit_count_jun']:
    all_features[col] = all_features.groupby(['age'])[col].transform(lambda x: x.fillna(x.median()))


In [None]:
missing = percent_missing(all_features)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Percent of missing data')
df_miss[0:42]

In [None]:
all_features.to_csv('./dataset/all_feature_210719.csv', index=False)

## Fix skewed features

In [None]:
all_features = pd.read_csv('./dataset/all_feature_210719.csv')

In [None]:
# Some of the non-numeric predictors are stored as numbers; convert them into strings 

for col in ('region_code', 'age', 'personal_loan_active', 'vehicle_loan_active', 'personal_loan_closed', 'vehicle_loan_closed', 'loan_enq'):
    all_features[col] = all_features[col].apply(str)

In [None]:
# Names of all columns stored as 64-bit integers or floats
numeric_dtypes = ['int64', 'float64']

numeric = [
    column for column in all_features.columns
    if all_features[column].dtype in numeric_dtypes
]

In [None]:
# Create box plots for all numeric features
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=all_features[numeric] , orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(ylabel="Feature names")
ax.set(xlabel="Numeric values")
ax.set(title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)

In [None]:
# Find skewed numerical features
skew_features = all_features[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features >0.5]
skew_index = high_skew.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_features.head(27)

In [None]:
for i in skew_index:

    all_features[i] = np.log1p(all_features[i])


In [None]:
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=all_features[numeric] , orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(ylabel="Feature names")
ax.set(xlabel="Numeric values")
ax.set(title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)

In [None]:
# Find skewed numerical features
skew_features = all_features[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features >0.5]
skew_index = high_skew.index
print(skew_index)

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_features.head()

In [None]:
for i in skew_index:

    all_features[i] = np.cbrt(all_features[i])

In [None]:
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=all_features[numeric] , orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(ylabel="Feature names")
ax.set(xlabel="Numeric values")
ax.set(title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)

In [None]:
# Find skewed numerical features
skew_features = all_features[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features >0.5]
skew_index = high_skew.index
print(skew_index)

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_features.head()

In [None]:
all_features.describe()

## Encode categorical features

In [None]:
all_features = pd.get_dummies(all_features).reset_index(drop=True)
all_features.shape

In [None]:
all_features.sample(3)

In [None]:
# Remove any duplicated column names
all_features = all_features.loc[:,~all_features.columns.duplicated()]

In [None]:
all_features.to_csv('./dataset/all_features_final_210719.csv', index=False)

## Recreate training and test sets

In [None]:
all_features=pd.read_csv('./dataset/all_features_final_210719.csv')

In [None]:
# The first len(train_labels) rows are the training rows, the rest the test
# rows. Explicit copies are taken so that later in-place edits of X / X_test
# do not act on views of all_features.
X = all_features.iloc[:len(train_labels), :].copy()
X_test = all_features.iloc[len(train_labels):, :].copy()
X.shape, X_test.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler_x = MinMaxScaler()
print(scaler_x.fit(all_features))
# Keep the scaled values in a labelled DataFrame rather than a bare ndarray,
# so column names and the row index survive the transformation.
# NOTE(review): X and X_test are sliced from all_features before this cell
# runs, so they do not receive this scaling -- confirm whether that is intended.
all_features = pd.DataFrame(scaler_x.transform(all_features),
                            columns=all_features.columns,
                            index=all_features.index)

In [None]:
np.any(np.isnan(X))

In [None]:
np.all(np.isfinite(X))

In [None]:
# Replace any remaining NaN in the training matrix with 0. fillna returns a
# new frame, which avoids masked assignment on a slice of all_features.
X = X.fillna(0)

In [None]:
np.any(np.isnan(X))

In [None]:
X.to_csv('./dataset/X_210719.csv', index=False)
X_test.to_csv('./dataset/X_test_210719.csv', index=False)

In [None]:
df_labels=pd.DataFrame(train_labels)
df_labels.to_csv('./dataset/df_labels_210719.csv', index=False)

# Train model

### Set up cross-validation and define error metrics


In [None]:
# Setup cross validation folds
kf = KFold(n_splits=3, random_state=42, shuffle=True)

In [None]:
# List the scoring names that can be passed as `scoring=...`.
# Newer scikit-learn exposes get_scorer_names(); the SCORERS dict is only
# available in older releases, so it is used as a fallback.
try:
    from sklearn.metrics import get_scorer_names
    scorer_names = get_scorer_names()
except ImportError:
    from sklearn.metrics import SCORERS
    scorer_names = sorted(SCORERS.keys())
scorer_names

In [None]:
# Define error metrics
def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error, scaled by 100.

    Args:
        y: True target values.
        y_pred: Predicted target values.

    Returns:
        ``100 * sqrt(mean_squared_log_error(y, y_pred))``. The square root is
        what makes this the *root* MSLE the function name promises.
    """
    return np.sqrt(mean_squared_log_error(y, y_pred)) * 100

def cv_rmse(model, X=X):
    """Return the per-fold cross-validated RMSE of ``model``.

    The model is scored against the module-level ``train_labels`` using the
    module-level ``kf`` splitter.

    Args:
        model: Estimator to evaluate.
        X: Feature matrix; defaults to the ``X`` that exists when this
            function is defined.

    Returns:
        Array with one RMSE value per fold.
    """
    neg_mse = cross_val_score(model, X, train_labels, scoring="neg_mean_squared_error", cv=kf)
    # The scorer returns negated MSE, so flip the sign before the root
    return np.sqrt(-neg_mse)

### Set up model

In [None]:
# Base learners and the stacked ensemble used for cross-validation and final
# fitting below. All estimators are only constructed here, not fitted.

# Light Gradient Boosting Regressor
lightgbm = LGBMRegressor(objective='regression', 
                       num_leaves=6,
                       learning_rate=0.01, 
                       n_estimators=7000,
                       max_bin=200, 
                       bagging_fraction=0.8,
                       bagging_freq=4, 
                       bagging_seed=8,
                       feature_fraction=0.2,
                       feature_fraction_seed=8,
                       min_sum_hessian_in_leaf = 11,
                       verbose=-1,
                       random_state=42)

# XGBoost Regressor
# NOTE(review): both seed=27 and random_state=42 are passed; confirm which
# one the installed xgboost version actually honours.
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)


# Gradient Boosting Regressor (also reused as the stack's meta-regressor)
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)  

# Ridge Regressor
# RidgeCV picks alpha from this grid using the same folds (kf) as cv_rmse
ridge_alphas = [1e-15, 1e-10, 1e-8, 9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2, 0.1, 0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=ridge_alphas, cv=kf))

# Support Vector Regressor (inputs pass through RobustScaler first)
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=1200,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

# Stack up all the models above, optimized using gbr
# The same gbr object appears both among the base regressors and as the
# meta-regressor.
# NOTE(review): presumably StackingCVRegressor clones its estimators before
# fitting, so the two roles do not share state -- verify against mlxtend.
stack_gen_6 = StackingCVRegressor(regressors=(xgboost, lightgbm, svr, ridge, gbr, rf),
                                meta_regressor=gbr,
                                use_features_in_secondary=True)


In [None]:
scores = {}

In [None]:
score = cv_rmse(lightgbm)
print("lightgbm: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['lgb'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(xgboost)
print("xgboost: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['xgb'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(gbr)
print("gbr: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['gbr'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(svr)
print("SVR: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['svr'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(ridge)
print("ridge: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['ridge'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(rf)
print("rf: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['rf'] = (score.mean(), score.std())

## Fit the models


In [None]:
print('Svr')
svr_model_full_data = svr.fit(X, train_labels)

# Save the fitted model to disk; the context manager closes the file handle
# even if pickling fails.
filename = './models/svr_210719.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svr_model_full_data, model_file)

In [None]:
print('Ridge')
ridge_model_full_data = ridge.fit(X, train_labels)

# Save the fitted model to disk; the context manager closes the file handle
# even if pickling fails.
filename = './models/ridge_210719.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(ridge_model_full_data, model_file)

In [None]:
print('RandomForest')
rf_model_full_data = rf.fit(X, train_labels)

# Save the fitted model to disk; the context manager closes the file handle
# even if pickling fails.
filename = './models/randomforest_210719.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model_full_data, model_file)

In [None]:
print('stack_gen')
# The stacked model is fitted on plain arrays rather than on the DataFrame
stack_gen_6model = stack_gen_6.fit(np.array(X), np.array(train_labels))

# Save the fitted model to disk; the context manager closes the file handle
# even if pickling fails.
filename = './models/stack_gen_6model_210719.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(stack_gen_6model, model_file)

In [None]:
print('lightgbm_210719')
lgb_model_full_data = lightgbm.fit(X, train_labels)

# Save the fitted model to disk; the context manager closes the file handle
# even if pickling fails.
filename = './models/lightgbm_210719.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(lgb_model_full_data, model_file)

In [None]:
print('xgboost_210719')
xgb_model_full_data = xgboost.fit(X, train_labels)

# Save the fitted model to disk; the context manager closes the file handle
# even if pickling fails.
filename = './models/xgboost_210719.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(xgb_model_full_data, model_file)

In [None]:
print('gradientboosting_210719')
gbr_model_full_data = gbr.fit(X, train_labels)

# Save the fitted model to disk; the context manager closes the file handle
# even if pickling fails.
filename = './models/gradientboosting_210719.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(gbr_model_full_data, model_file)

## Submit predictions

In [None]:
# Replace any remaining NaN in the test matrix with 0. fillna returns a new
# frame, which avoids masked assignment on a slice of all_features.
X_test = X_test.fillna(0)

In [None]:
def blended7_predictions(X):
    """Return the weighted blend of the seven fitted models' predictions.

    Weights: ridge 0.05, SVR 0.1, gradient boosting 0.15, XGBoost 0.1,
    LightGBM 0.15, random forest 0.1, stacked model 0.35. The stacked model
    receives ``X`` converted to a NumPy array; the others receive ``X`` as is.

    Args:
        X: Feature matrix to predict on.

    Returns:
        The blended predictions.
    """
    weighted_base_models = (
        (0.05, ridge_model_full_data),
        (0.1, svr_model_full_data),
        (0.15, gbr_model_full_data),
        (0.1, xgb_model_full_data),
        (0.15, lgb_model_full_data),
        (0.1, rf_model_full_data),
    )
    blend = None
    # Accumulate in the listed order so the floating-point sum is unchanged
    for weight, fitted_model in weighted_base_models:
        contribution = weight * fitted_model.predict(X)
        blend = contribution if blend is None else blend + contribution
    return blend + (0.35 * stack_gen_6model.predict(np.array(X)))

In [None]:
y_pred = blended7_predictions(X_test)

In [None]:
# Read in sample_submission dataframe
submission = pd.read_csv("./dataset/sample_submission_iwBpW0t.csv")
submission.shape

In [None]:
# Append predictions from blended models
submission.iloc[:,1] = np.floor(np.expm1(y_pred))

In [None]:
# Fix outlier predictions: values at or below the 0.035% quantile are scaled
# by 0.80, values at or above the 99% quantile by 1.08. Both thresholds are
# computed before either adjustment is applied.
q1 = submission['cc_cons'].quantile(0.00035)
q2 = submission['cc_cons'].quantile(0.99)
preds = submission['cc_cons']
preds = preds.where(preds > q1, preds * 0.80)
preds = preds.where(preds < q2, preds * 1.08)
submission['cc_cons'] = preds
submission.to_csv("./dataset/submission_blended71_210719.csv", index=False)


In [None]:
# Scale predictions
submission['cc_cons'] *= 0.9845
submission.to_csv("./dataset/submission_blended72_210719.csv", index=False)
