In [None]:
# Parameterization
# Where the visible cells read a switch they compare it against the integer 1
# (e.g. `do_drop_outliers == 1`), so 1 enables a step and any other value skips it.
# NOTE(review): several settings below are not referenced by any visible cell;
# they may be used elsewhere or be leftovers -- confirm before relying on them.

# Prototyping
# NOTE(review): not referenced in the visible cells; presumably a sampling fraction -- confirm.
sample_size = 1

# dropnan: 1 = drop every row of `data` that contains at least one NaN
do_dropallnan = 0

# Outliers: 1 = let dropoutliers() remove rows outside its hard-coded limits
do_drop_outliers = 1

# one-hot encoding? (not referenced in the visible cells)
do_one_hot = 0

# PCA (neither setting is referenced in the visible cells)
do_pca = "none"
pca_n_components = 5

# Scaling: 1 = standardize the numeric columns with StandardScaler
do_normalization = 0

# Imputation: the visible cells handle "drop", "median", "mean" and "fixed"
imputation = "drop"

# Feature Selection (none of these four settings is referenced in the visible cells)
do_select_percentile = 0
select_percentile_percentile = 25

do_selectkbest = 1
selectkbest_k = 20

# Machine Learning (not referenced in the visible cells)
do_xgb = 0

# evaluation on test set (not referenced in the visible cells)
do_evaluate = 0

# scoring (not referenced in the visible cells)
do_scoring = 0



In [None]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import pandas as pd
import numpy as np

from itertools import compress

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.feature_selection import SelectPercentile, SelectKBest, mutual_info_classif, chi2
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD

from time import time
from catboost import CatBoostClassifier
#import pandas_profiling

In [None]:
# import data from files
# All paths are relative to the directory the notebook is run from.

# Training features and their labels; the two are joined on `row_id` in the next step
X_in = pd.read_csv('../Data/X_binned.csv')
Y_in = pd.read_csv('../Data/train_labels.csv')


# score set
# Features to predict on; no label file is loaded for them here
X_score = pd.read_csv('../Data/test_values.csv')

In [None]:
# Attach the labels to the features by joining both tables on their shared row id
data = pd.DataFrame(X_in).merge(Y_in, on='row_id')

In [None]:
# Derived feature: loan amount divided by applicant income,
# added to the training frame and to the score set alike
data["LoanIncomeRatio"] = data["loan_amount"].div(data["applicant_income"])
X_score["LoanIncomeRatio"] = X_score["loan_amount"].div(X_score["applicant_income"])

In [None]:
# Only these columns are really numeric. The others are already label encoded as numbers
# Each column name is listed exactly once.
num_cols = [
    "loan_amount",
    "applicant_income",
    "population",
    "minority_population_pct",
    "ffiecmedian_family_income",
    "tract_to_msa_md_income_pct",
    "number_of_owner-occupied_units",
    "number_of_1_to_4_family_units",
    "LoanIncomeRatio"
]
print("Numeric columns: ")
print(num_cols)

# Everything that is neither numeric, the target nor the row id counts as categorical.
# Walking data.columns (instead of subtracting sets) keeps the frame's column order,
# so the resulting list is the same on every run.
non_categorical = set(num_cols) | {"accepted", "row_id"}
cat_cols = [col for col in data.columns if col not in non_categorical]
print("Categorical columns: ")
print(cat_cols)

In [None]:
# Manual feature selection
# Un-comment a line to drop that column. Every entry ends with a comma so that
# enabling one can never fuse two adjacent string literals into a single,
# non-existent column name (Python concatenates neighbouring string literals).
drop_features = [
    'row_id',
    #'loan_amount',
    #'applicant_income',
    #'population',
    #'minority_population_pct',
    #'ffiecmedian_family_income',
    #'tract_to_msa_md_income_pct',
    #'number_of_owner-occupied_units',
    #'number_of_1_to_4_family_units',
    #cat
    #'occupancy',
    #'preapproval',
    #'state_code',
    #'county_code',
    #'property_type',
    #'applicant_race',
    #'loan_purpose',
    #'lender',
    #'applicant_ethnicity',
    #'applicant_sex',
    #'loan_type',
    #'co_applicant',
    #'msa_md',
]
# Update list of numeric and categorical columns.
# dict.fromkeys removes duplicates while keeping the first-seen order, so the
# lists stay duplicate-free (as with the former set arithmetic) but are now
# reproducible from run to run.
dropped_names = set(drop_features)
num_cols = [col for col in dict.fromkeys(num_cols)
            if col not in dropped_names and col != "row_id"]
cat_cols = [col for col in dict.fromkeys(cat_cols) if col not in dropped_names]

# Drop features from data
# Both frames are modified in place; re-running this cell without reloading the
# data raises a KeyError because the columns are already gone.
data.drop(drop_features, axis=1, inplace=True)
X_score.drop(drop_features, axis=1, inplace=True)

In [None]:
# Show which columns are left in each group after the manual feature selection
print("Remaining numeric columns", num_cols, sep="\n")
print("Remaining categorical columns", cat_cols, sep="\n")


In [None]:
# Drop outliers
def dropoutliers(data, enabled=None):
    """Remove rows whose values fall outside hard-coded limits.

    Rows are dropped from ``data`` in place. A limit is only applied when its
    column is present in the frame. NaN values are never treated as outliers,
    because the comparisons evaluate to False for them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is modified in place.
    enabled : int or None, optional
        1 applies the limits, any other value skips them. ``None`` (the
        default) falls back to the notebook-level ``do_drop_outliers`` switch.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, filtered or untouched. The frame is returned
        in both cases so that ``data = dropoutliers(data)`` never rebinds
        ``data`` to None when the step is switched off.
    """
    if enabled is None:
        enabled = do_drop_outliers
    if enabled != 1:
        return data

    # Rows with a value strictly above the limit are dropped.
    upper_limits = {
        "loan_amount": 700,
        "applicant_income": 200,
        "population": 190000,
        "ffiecmedian_family_income": 120000,
        "number_of_1_to_4_family_units": 6000,
        "number_of_owner-occupied_units": 4500,
        "LoanIncomeRatio": 15,
    }
    # Rows with a value strictly below the limit are dropped.
    lower_limits = {
        "ffiecmedian_family_income": 20000,
        "tract_to_msa_md_income_pct": 40,
    }

    for column, limit in upper_limits.items():
        if column in data.columns:
            data.drop(data[data[column] > limit].index, inplace=True)
    for column, limit in lower_limits.items():
        if column in data.columns:
            data.drop(data[data[column] < limit].index, inplace=True)

    print("Dropped outliers")
    return data

# Apply the outlier filter to the merged training frame only; X_score is not passed through it
data = dropoutliers(data)



In [None]:
# Split X and Y
# The target column "accepted" becomes Y, every other column goes into X
feature_cols = list(data.columns)
X = pd.DataFrame(data.loc[:, ~data.columns.isin(["accepted"])])
Y = data["accepted"]

In [None]:
# Drop all nans
# Optional step: with do_dropallnan == 1 every row of `data` holding at least one NaN is removed.
# NOTE(review): this acts on `data`; the X and Y objects were already taken from
# `data` in the split step above -- confirm they are meant to stay unaffected by it.
if do_dropallnan == 1:
    data.dropna(inplace=True)

In [None]:
# Treat missing values
# -1 is treated as the missing-value marker and converted to NaN in both frames.
# NOTE(review): LoanIncomeRatio was computed before this replacement, so a ratio
# built from a -1 marker is not turned into NaN here -- confirm that is acceptable.
data.replace({-1: np.nan}, inplace=True)
X_score.replace({-1: np.nan}, inplace=True)

if imputation == "drop":
    data.dropna(inplace=True)
    # NOTE(review): this also removes score-set rows, so no prediction is
    # written for them later -- confirm that is intended.
    X_score.dropna(inplace=True)

# X and Y were split off `data` before the cleaning above, so they would still
# hold the -1 markers and the rows that were just dropped. Re-derive them from
# the cleaned frame so that the chosen strategy actually reaches the model input.
X = data.loc[:, data.columns != "accepted"].copy()
Y = data["accepted"]

if imputation in ("median", "mean", "fixed"):
    for col in X.columns:
        # The fill value is taken from the training features and reused for the
        # score set, so both frames are imputed consistently.
        if imputation == "median":
            fill_value = X[col].median()
        elif imputation == "mean":
            fill_value = X[col].mean()
        else:
            fill_value = -1
        X[col] = X[col].fillna(fill_value)
        X_score[col] = X_score[col].fillna(fill_value)


In [None]:
# Normalization
# The scaler learns mean and standard deviation from the training features only.
# The score set is then transformed with those same statistics; refitting on it
# would put training and score data on different scales.
if (do_normalization == 1) and num_cols:
    print("Standardizing data...")
    scaler = StandardScaler()
    X[num_cols] = scaler.fit_transform(X[num_cols])
    X_score[num_cols] = scaler.transform(X_score[num_cols])

In [None]:
# Train test split
# 80% train, 20% test
# random_state is fixed so the same rows land in each part on every run
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Hyper-parameters for the CatBoost classifier, collected in one place
catboost_params = dict(
    learning_rate=0.05,
    n_estimators=200,
    eval_metric='Accuracy',
    one_hot_max_size=5,
    max_depth=6,
    random_strength=10,
)
clf_cat = CatBoostClassifier(**catboost_params)

# Fit on the training part; the held-out part is passed as evaluation set
clf_cat.fit(
    X_train,
    Y_train,
    eval_set=(X_test.values, Y_test.values),
    logging_level='Silent',
    plot=True,
)

# Predict on both parts so the two accuracies can be compared
Y_pred = clf_cat.predict(X_test)
Y_pred_train = clf_cat.predict(X_train)
train_acc = accuracy_score(Y_train, Y_pred_train)
test_acc = accuracy_score(Y_test, Y_pred)
print("Accuracy on test set: " + str(test_acc))
print("Accuracy on train set: " + str(train_acc))


In [None]:
# Per-feature importances of the fitted model, ordered by rising importance
# (ties are broken by column name in descending order)
fea_imp = pd.DataFrame(
    {'imp': clf_cat.feature_importances_, 'col': clf_cat.feature_names_}
).sort_values(by=['imp', 'col'], ascending=[True, False])
fea_imp

In [None]:
import shap
# Set up SHAP's JavaScript support for the interactive plot below
shap.initjs()

# explain the model's predictions using SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models)
explainer = shap.TreeExplainer(clf_cat)
# SHAP values are computed for the training rows, not for the held-out part
shap_values = explainer.shap_values(X_train)

# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
# Row 0 of shap_values is paired with row 0 of X_train
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])

In [None]:
# Summary plot of the SHAP values computed for the training rows
shap.summary_plot(shap_values, X_train)

In [None]:
# Predict score set
Y_score = clf_cat.predict(X_score)
# Cast the predictions to int before writing them out
Y_score = Y_score.astype(int)
out = pd.DataFrame()
# NOTE(review): row_id was dropped from X_score during manual feature selection, so the
# DataFrame index stands in for it here -- confirm the index matches the original row_id.
out["row_id"] = X_score.index.values
out["accepted"] = Y_score

# Written to the current working directory, without the DataFrame index
out.to_csv("output.csv", index=False)