In [124]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
import statsmodels.formula.api as smf
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import metrics
%matplotlib inline

In [58]:
df = pd.read_csv("loan-data/loan_payments_data.csv")

In [27]:
#approximating income
import statistics 
# white women, black women, asian women, hispanic women
percentage_of_mens_income = [82.2, 86.9, 77.1, 86.0] 
print("women's average percentage", statistics.mean(percentage_of_mens_income))



women's average percentage 83.05


In [68]:
def years_experience_calculator(x):
    if x["education"] == "High School or Below":
        return x["age"] - 18
    if x["education"] == "Bechalor":
        return x["age"] - 20
    if x["education"] == "college":
        return x["age"] - 22
    if x["education"] == "Master or Above":
        return x["age"] - 25

# wage information comes from here: https://www.bls.gov/news.release/pdf/wkyeng.pdf
# because education is a proxy for age, we won't use age earnings that are found here:
# https://www.bls.gov/opub/reports/womens-earnings/2017/pdf/home.pdf
def approximate_salary_calculator(x):
    weekly_to_yearly_multiplier = 52
    if x["education"] == "High School or Below":
        salary = 556 * weekly_to_yearly_multiplier
    if x["education"] == "Bechalor":
        salary = 736 * weekly_to_yearly_multiplier
    if x["education"] == "college":
        salary = 1338 * weekly_to_yearly_multiplier
    if x["Gender"] == "female":
        salary *= 0.8305
    if x["education"] == "Master or Above":
        if x["Gender"] == 'female':
            salary = 2789
        else:
            salary = 3922
    return salary

education_level_map = {
    "High School or Below":1,
    "Bechalor": 2,
    "college": 3,
    "Master or Above": 4
}
    
df["education_level"] = df["education"].map(education_level_map)
df["yearly_income"] = df.apply(approximate_salary_calculator, axis=1)
df["years_experience"] = df.apply(years_experience_calculator, axis=1)
df["is_female"] = pd.get_dummies(df["Gender"])["female"]

In [65]:

def ks_selection(sample, df, columns):
    test_results = []
    for column in columns:
        pval = stats.ks_2samp(sample[column], df[column]).pvalue
        test_results.append(pval < 0.05)
    if all(test_results):
        return sample
    else:
        return ''
    

def moment_differencing_selection(sample, df, moment_values, columns):
    tmp = []
    for column in columns:
        first_moment = abs(sample[column].mean() - df[column].mean())
        tmp.append(first_moment)
    for column in columns:
        second_moment = abs(sample[column].std() - df[column].std())
        tmp.append(second_moment)
    moment_values.append(tmp)
    return moment_values
    
    
def generate_representative_sample(df, columns, sample_size=10000, num_iterations=1000):
    """
    This function generates a representative random sample based 
    on specific variables in the data set.
    We attempt two methods:
    - Kolmogorov-Smirnov test as a means of selection
    - moment differencing as criteria for representativeness
    If KS happens to every returns a valid sample,
    that means all the distributions are equal for all variables of consideration.
    If the moment differencing method is used, we search for the sample which minimizes
    difference between the first two moments.
    Notice that we only select on moment differences if ks fails for all generated samples.
    """
    
    possible_samples = []
    moment_values = []
    for _ in range(num_iterations):
        sample = df.sample(sample_size, replace=True)   
        result = ks_selection(sample, df,columns)
        if result != '':
            return result
        possible_samples.append(sample)
        moment_values = moment_differencing_selection(
             sample, 
             df, 
             moment_values,
             columns)
            
    min_sum = sum(moment_values[0])
    best_sample = possible_samples[0]
    for index, value in enumerate(moment_values):
        if sum(value) < min_sum:
            min_sum = sum(value)
            best_sample = possible_samples[index]
    return best_sample

In [12]:
df.head()

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female


In [24]:
df["education"].unique()

array(['High School or Below', 'Bechalor', 'college', 'Master or Above'],
      dtype=object)

In [13]:
df.describe()

Unnamed: 0,Principal,terms,past_due_days,age
count,500.0,500.0,200.0,500.0
mean,943.2,22.824,36.01,31.116
std,115.240274,8.000064,29.38088,6.084784
min,300.0,7.0,1.0,18.0
25%,1000.0,15.0,3.0,27.0
50%,1000.0,30.0,37.0,30.0
75%,1000.0,30.0,60.0,35.0
max,1000.0,30.0,76.0,51.0


In [14]:
df["loan_status"].nunique()

3

In [15]:
df["loan_status"].unique()

array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object)

In [16]:
df["Principal"].nunique()

6

In [17]:
df["Principal"].unique()

array([1000,  300,  800,  900,  700,  500])

In [34]:
df.head()

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender,yearly_income,years_experience,is_female
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male,28912.0,27,0
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female,31784.896,30,1
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female,31784.896,13,1
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male,69576.0,5,0
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female,57782.868,6,1


In [39]:
df["Principal"].value_counts()

1000    377
800     111
300       6
500       3
900       2
700       1
Name: Principal, dtype: int64

In [134]:
# simulate data
columns = ["Principal", "terms", "age", "is_female", "education_level"]
synthesized_df = generate_representative_sample(df, columns, sample_size=2500, num_iterations=1000)

In [135]:
# Linear - Linear
cols = synthesized_df.columns.tolist()
cols.remove("Principal")
cols.remove("Loan_ID")
cols.remove("loan_status")
cols.remove("effective_date")
cols.remove("due_date")
cols.remove("paid_off_time")
cols.remove("past_due_days")
cols.remove("education")
cols.remove("Gender")
cols.remove("terms")
cols.remove("education_level")
cols.remove("is_female")
cols.remove("years_experience")
cols.remove("age")

X = synthesized_df[cols]
y = synthesized_df["Principal"]
X = sm.add_constant(X, prepend=False)
model = sm.OLS(y, X)
result = model.fit()
print(result.summary())

print("MSE", result.mse_resid)

                            OLS Regression Results                            
Dep. Variable:              Principal   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     3.813
Date:                Thu, 15 Nov 2018   Prob (F-statistic):             0.0510
Time:                        14:49:29   Log-Likelihood:                -15413.
No. Observations:                2500   AIC:                         3.083e+04
Df Residuals:                    2498   BIC:                         3.084e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
yearly_income     0.0002      0.000      1.953

In [136]:
# Log - Linear
cols = synthesized_df.columns.tolist()
cols.remove("Principal")
cols.remove("Loan_ID")
cols.remove("loan_status")
cols.remove("effective_date")
cols.remove("due_date")
cols.remove("paid_off_time")
cols.remove("past_due_days")
cols.remove("education")
cols.remove("Gender")
cols.remove("terms")
cols.remove("education_level")
cols.remove("is_female")
cols.remove("years_experience")
cols.remove("age")

X = synthesized_df[cols]
y = np.log(synthesized_df["Principal"])
X = sm.add_constant(X, prepend=False)
model = sm.OLS(y, X)
result = model.fit()
print(result.summary())
print("MSE", result.mse_resid)

                            OLS Regression Results                            
Dep. Variable:              Principal   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     7.462
Date:                Thu, 15 Nov 2018   Prob (F-statistic):            0.00635
Time:                        14:49:38   Log-Likelihood:                 977.02
No. Observations:                2500   AIC:                            -1950.
Df Residuals:                    2498   BIC:                            -1938.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
yearly_income  4.593e-07   1.68e-07      2.732

In [128]:
# Linear - Log
cols = synthesized_df.columns.tolist()
cols.remove("Principal")
cols.remove("Loan_ID")
cols.remove("loan_status")
cols.remove("effective_date")
cols.remove("due_date")
cols.remove("paid_off_time")
cols.remove("past_due_days")
cols.remove("education")
cols.remove("Gender")
cols.remove("terms")
cols.remove("education_level")
cols.remove("is_female")
cols.remove("years_experience")
cols.remove("age")

X = np.log(synthesized_df[cols])
y = synthesized_df["Principal"]
X = sm.add_constant(X, prepend=False)
model = sm.OLS(y, X)
result = model.fit()
print(result.summary())
print("MSE", result.mse_resid)

                            OLS Regression Results                            
Dep. Variable:              Principal   R-squared:                      -0.992
Model:                            OLS   Adj. R-squared:                 -0.993
Method:                 Least Squares   F-statistic:                    -621.7
Date:                Thu, 15 Nov 2018   Prob (F-statistic):               1.00
Time:                        14:47:11   Log-Likelihood:                -16274.
No. Observations:                2500   AIC:                         3.255e+04
Df Residuals:                    2497   BIC:                         3.257e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
yearly_income         -3.194e+

In [137]:
# Log - Log
cols = synthesized_df.columns.tolist()
cols.remove("Principal")
cols.remove("Loan_ID")
cols.remove("loan_status")
cols.remove("effective_date")
cols.remove("due_date")
cols.remove("paid_off_time")
cols.remove("past_due_days")
cols.remove("education")
cols.remove("Gender")
cols.remove("terms")
cols.remove("education_level")
cols.remove("is_female")
cols.remove("years_experience")
cols.remove("age")

X = np.log(synthesized_df[cols])
y = np.log(synthesized_df["Principal"])
X = sm.add_constant(X, prepend=False)
model = sm.OLS(y, X)
result = model.fit()
print(result.summary())
print("MSE", result.mse_resid)

                            OLS Regression Results                            
Dep. Variable:              Principal   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     53.45
Date:                Thu, 15 Nov 2018   Prob (F-statistic):           3.55e-13
Time:                        14:50:24   Log-Likelihood:                 999.76
No. Observations:                2500   AIC:                            -1996.
Df Residuals:                    2498   BIC:                            -1984.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
yearly_income     0.0510      0.007      7.311

In [139]:
# linear - exponential
cols = synthesized_df.columns.tolist()
cols.remove("Principal")
cols.remove("Loan_ID")
cols.remove("loan_status")
cols.remove("effective_date")
cols.remove("due_date")
cols.remove("paid_off_time")
cols.remove("past_due_days")
cols.remove("education")
cols.remove("Gender")
cols.remove("terms")
cols.remove("education_level")
cols.remove("is_female")
cols.remove("years_experience")
cols.remove("age")
if "yearly_income_squared" not in cols:
    cols.append("yearly_income_squared")

synthesized_df["yearly_income_squared"] = synthesized_df["yearly_income"] * synthesized_df["yearly_income"] 
X = synthesized_df[cols]
y = synthesized_df["Principal"]
X = sm.add_constant(X, prepend=False)
model = sm.OLS(y, X)
result = model.fit()
print(result.summary())
print("MSE", result.mse_resid)

                            OLS Regression Results                            
Dep. Variable:              Principal   R-squared:                       0.038
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     49.32
Date:                Thu, 15 Nov 2018   Prob (F-statistic):           9.85e-22
Time:                        14:50:53   Log-Likelihood:                -15367.
No. Observations:                2500   AIC:                         3.074e+04
Df Residuals:                    2497   BIC:                         3.076e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
yearly_income             0.01

In [131]:
cols = synthesized_df.columns.tolist()
cols.remove("Principal")
cols.remove("Loan_ID")
cols.remove("loan_status")
cols.remove("effective_date")
cols.remove("due_date")
cols.remove("paid_off_time")
cols.remove("past_due_days")
cols.remove("education")
cols.remove("Gender")
cols.remove("terms")
cols.remove("education_level")
cols.remove("is_female")

X = synthesized_df[cols]
y = synthesized_df["Principal"]
X = sm.add_constant(X, prepend=False)
model = sm.OLS(y, X)
result = model.fit()
print(result.summary())
print("MSE", result.mse_resid)

                            OLS Regression Results                            
Dep. Variable:              Principal   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     20.27
Date:                Thu, 15 Nov 2018   Prob (F-statistic):           1.89e-16
Time:                        14:47:14   Log-Likelihood:                -15372.
No. Observations:                2500   AIC:                         3.075e+04
Df Residuals:                    2495   BIC:                         3.078e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
age                     -17.66

In [132]:
# do linear regression with just women and then with just men and show how this relates to the mixed effects model

md = smf.mixedlm("Principal ~ age + yearly_income + years_experience", synthesized_df, groups=synthesized_df["is_female"])
mdf = md.fit()
print(mdf.summary())
X_train, X_test, y_train, y_test = train_test_split(synthesized_df[["age", "yearly_income", "years_experience", "is_female"]], 
                 synthesized_df["Principal"], random_state=42)
X_train["Principal"] = y_train
md = smf.mixedlm("Principal ~ age + yearly_income + years_experience", X_train, groups=X_train["is_female"])
mdf = md.fit()
result = mdf.predict(X_test)
print("MSE", metrics.mean_squared_error(y_test, result))
print("R^2", metrics.r2_score(y_test, result))



              Mixed Linear Model Regression Results
Model:                MixedLM   Dependent Variable:   Principal  
No. Observations:     2500      Method:               REML       
No. Groups:           2         Scale:                12864.7800 
Min. group size:      380       Likelihood:           -15378.6631
Max. group size:      2120      Converged:            Yes        
Mean group size:      1250.0                                     
-----------------------------------------------------------------
                  Coef.   Std.Err.   z    P>|z|  [0.025   0.975] 
-----------------------------------------------------------------
Intercept        1303.065   42.826 30.427 0.000 1219.128 1387.003
age               -21.440    2.530 -8.473 0.000  -26.399  -16.481
yearly_income       0.002    0.000  7.195 0.000    0.001    0.002
years_experience   20.526    2.554  8.036 0.000   15.520   25.532
Group Var         100.201    1.522                               

MSE 12310.941361704465


