In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from scipy import stats
from treeinterpreter import treeinterpreter as ti
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
%matplotlib inline

In [23]:
df = pd.read_csv("loan-data/loan_payments_data.csv")

In [27]:
#approximating income
import statistics 
# white women, black women, asian women, hispanic women
percentage_of_mens_income = [82.2, 86.9, 77.1, 86.0] 
print("women's average percentage", statistics.mean(percentage_of_mens_income))



women's average percentage 83.05


In [6]:
df.head()

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender,education_level,yearly_income,years_experience,is_female
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male,1,28912.0,27,0
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female,2,31784.896,30,1
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female,2,31784.896,13,1
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male,3,69576.0,5,0
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female,3,57782.868,6,1


In [24]:
def years_experience_calculator(x):
    if x["education"] == "High School or Below":
        return x["age"] - 18
    if x["education"] == "Bechalor":
        return x["age"] - 20
    if x["education"] == "college":
        return x["age"] - 22
    if x["education"] == "Master or Above":
        return x["age"] - 25

# wage information comes from here: https://www.bls.gov/news.release/pdf/wkyeng.pdf
# because education is a proxy for age, we won't use age earnings that are found here:
# https://www.bls.gov/opub/reports/womens-earnings/2017/pdf/home.pdf
def approximate_salary_calculator(x):
    weekly_to_yearly_multiplier = 52
    if x["education"] == "High School or Below":
        salary = 556 * weekly_to_yearly_multiplier
    if x["education"] == "Bechalor":
        salary = 736 * weekly_to_yearly_multiplier
    if x["education"] == "college":
        salary = 1338 * weekly_to_yearly_multiplier
    if x["Gender"] == "female":
        salary *= 0.8305
    if x["education"] == "Master or Above":
        if x["Gender"] == 'female':
            salary = 2789
        else:
            salary = 3922
    return salary

df.drop("Loan_ID", inplace=True, axis=1)
df.drop("effective_date", inplace=True, axis=1)
df.drop("due_date", inplace=True, axis=1)
df.drop("paid_off_time", inplace=True, axis=1)
df.drop("past_due_days", inplace=True, axis=1)
df.drop("loan_status", inplace=True, axis=1)
df = pd.concat([df, pd.get_dummies(df["education"])], axis=1)
df["yearly_income"] = df.apply(approximate_salary_calculator, axis=1)
df["years_experience"] = df.apply(years_experience_calculator, axis=1)
df["is_female"] = pd.get_dummies(df["Gender"])["female"]
df.drop("Gender", inplace=True, axis=1)
df.drop("education", inplace=True, axis=1)

In [8]:

def ks_selection(sample, df, columns):
    test_results = []
    for column in columns:
        pval = stats.ks_2samp(sample[column], df[column]).pvalue
        test_results.append(pval < 0.05)
    if all(test_results):
        return sample
    else:
        return ''
    

def moment_differencing_selection(sample, df, moment_values, columns):
    tmp = []
    for column in columns:
        first_moment = abs(sample[column].mean() - df[column].mean())
        tmp.append(first_moment)
    for column in columns:
        second_moment = abs(sample[column].std() - df[column].std())
        tmp.append(second_moment)
    moment_values.append(tmp)
    return moment_values
    
    
def generate_representative_sample(df, columns, sample_size=10000, num_iterations=1000):
    """
    This function generates a representative random sample based 
    on specific variables in the data set.
    We attempt two methods:
    - Kolmogorov-Smirnov test as a means of selection
    - moment differencing as criteria for representativeness
    If KS happens to every returns a valid sample,
    that means all the distributions are equal for all variables of consideration.
    If the moment differencing method is used, we search for the sample which minimizes
    difference between the first two moments.
    Notice that we only select on moment differences if ks fails for all generated samples.
    """
    
    possible_samples = []
    moment_values = []
    for _ in range(num_iterations):
        sample = df.sample(sample_size, replace=True)   
        result = ks_selection(sample, df,columns)
        if result != '':
            return result
        possible_samples.append(sample)
        moment_values = moment_differencing_selection(
             sample, 
             df, 
             moment_values,
             columns)
            
    min_sum = sum(moment_values[0])
    best_sample = possible_samples[0]
    for index, value in enumerate(moment_values):
        if sum(value) < min_sum:
            min_sum = sum(value)
            best_sample = possible_samples[index]
    return best_sample

In [18]:
df.head()

Unnamed: 0,Principal,terms,age,education,Bechalor,High School or Below,Master or Above,college,yearly_income,years_experience,is_female
0,1000,30,45,High School or Below,0,1,0,0,28912.0,27,0
1,1000,30,50,Bechalor,1,0,0,0,31784.896,30,1
2,1000,30,33,Bechalor,1,0,0,0,31784.896,13,1
3,1000,15,27,college,0,0,0,1,69576.0,5,0
4,1000,30,28,college,0,0,0,1,57782.868,6,1


In [25]:
# simulate data
columns = ["Principal", "terms", "age", "is_female"]
synthesized_df = generate_representative_sample(df, columns, sample_size=2500, num_iterations=1000)

In [29]:
from sklearn.ensemble import RandomForestRegressor

y = synthesized_df["Principal"]
X = synthesized_df.drop("Principal", axis=1)
rf_reg = RandomForestRegressor()
prediction = cross_val_predict(rf_reg, X, y)
metrics.mean_squared_error(y, prediction)



4096.56614476692

In [30]:
import statistics as st

def contrib_feature_importance_overall(X_test, contrib):
    features = X_test.columns.tolist()
    feature_contribs = {}
    for feature in features:
        feature_contribs[feature] = []
    for contribution in contrib:
        for ind, feature in enumerate(features):
            feature_contribs[feature].append(contribution[ind])
    return feature_contribs

def feature_importance(X_test, contrib):
    overall_importance = contrib_feature_importance_overall(X_test, contrib)
    avg_feature_importance = []
    for feature in overall_importance:
        avg_feature_importance.append((feature, st.mean(overall_importance[feature])))

    avg_feature_importance = sorted(avg_feature_importance, key=lambda t:t[1], reverse=True)
    return avg_feature_importance

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf_reg.fit(X_train, y_train)
pred, bias, contrib = ti.predict(rf_reg, X_test)
feature_importance(X_test, contrib)



[('yearly_income', 0.6312068805208947),
 ('Master or Above', 0.3129780994085355),
 ('High School or Below', 0.199241285280263),
 ('college', 0.06768034116079136),
 ('Bechalor', -0.07524361715519406),
 ('age', -0.31578966425677063),
 ('is_female', -0.6056948690251931),
 ('terms', -1.1546497184439015),
 ('years_experience', -1.21184388499497)]

In [31]:
from sklearn.ensemble import GradientBoostingRegressor

y = synthesized_df["Principal"]
X = synthesized_df.drop("Principal", axis=1)
gb_reg = GradientBoostingRegressor()
prediction = cross_val_predict(gb_reg, X, y)
metrics.mean_squared_error(y, prediction)



4963.453764724211

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf_reg.fit(X_train, y_train)
pred, bias, contrib = ti.predict(rf_reg, X_test)
feature_importance(X_test, contrib)

[('Master or Above', 0.5372374870671442),
 ('Bechalor', 0.23172145674858427),
 ('yearly_income', 0.1128393378077322),
 ('High School or Below', 0.052122792314277334),
 ('college', 0.0414879765215041),
 ('is_female', -0.253609586909219),
 ('terms', -0.6310896782435291),
 ('years_experience', -0.9815547519116219),
 ('age', -1.6334797891805983)]

In [None]:
def baseline_model():
# create model
    model = Sequential()
    model.add(Dense(18, kernel_initializer='normal', activation='relu'))
    model.add(Dense(9, kernel_initializer="normal", activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

seed = 7
np.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, X.as_matrix(), y.values, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

