In [ ]:
# Part 2

In [1]:
import pandas as pd
import os
from sys import platform
import matplotlib.pyplot as plt
import datetime
import numpy as np
import pickle
import seaborn


In [2]:
# Helper functions
def is_integer(x):
   
    try:
        return (int(x) == float(x))
    except:
        return False

def ingest_files(directory: str):
    
    
    # If the directory has no trailing slash, add one
    if directory[-1] != "/":
        directory = directory + "/"
    
    all_files = os.listdir(directory)
    output = {}
    
    print("Directory " + directory + " has " + str(len(all_files)) + " files:")
    for i in all_files:
        print(" Reading file " + i)
        output[i] = pd.read_csv(directory + i, dtype = str, skiprows = 1)
        
        # To remove those lines, find any lines with non-integer IDs
        # and remove them
        invalid_rows = (output[i].id.apply( lambda x : is_integer(x) == False ))
        if invalid_rows.sum() > 0:
            print(" Found " + str(invalid_rows.sum()) + " invalid rows which were removed")
            output[i] = output[i][invalid_rows == False]
    
    return output

In [ ]:
## Question 1

In [3]:
# Define the directories that contain the full dataset files downloaded in 2019
dir_full = "/".join(["Data"] + ["full_dataset"] )

# Ingest the set of files
files_full = ingest_files(dir_full)

final_data = pd.concat(files_full.values()).reset_index(drop = True)

Directory Data/full_dataset/ has 15 files:
 Reading file LoanStats_securev1_2018Q4.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2018Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2018Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2018Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2019Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2019Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2019Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q4.csv
 Found 2 invalid rows which were removed
 Reading file LoanStat

In [5]:
print("Starting with " + str(len(final_data)) + " rows")

Starting with 1763077 rows


## QUESTION 2

In [7]:
# Tell where the directories that contain the files downloaded in 2017 and 2019
dir_2017 = "/".join(["Data"] + ["1712_download"] )
dir_2019 = "/".join(["Data"] + ["1912_download"] )

# Ingest the set of files downloaded in 2017 and then the files downloaded in 2019
files_2017 = ingest_files(dir_2017)
files_2019 = ingest_files(dir_2019)

Directory Data/1712_download/ has 8 files:
 Reading file LoanStats_securev1_2017Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q4.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q4.csv
 Found 2 invalid rows which were removed
Directory Data/1912_download/ has 8 files:
 Reading file LoanStats_securev1_2017Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q3.csv
 Found 2 invalid row

In [8]:
# Check the two sets of files have the same number of files
# with the same names
assert len(files_2017) == len(files_2019)
assert sorted(files_2017) == sorted(files_2019)

In [9]:
# Ensure each set of files has the same loan IDs in both instances
for i in files_2017:
    assert sorted(files_2017[i].id) == sorted(files_2019[i].id)

In [10]:
# Combined

data_2017 = pd.concat(files_2017.values()).reset_index(drop = True)
data_2019 = pd.concat(files_2019.values()).reset_index(drop = True)

In [11]:
# Looks like there are same amount of rows but 2017 has additional column
print(data_2017.shape)
print(data_2019.shape)

set(data_2017.columns) - set(data_2019.columns)

(877986, 151)
(877986, 150)


{'disbursement_method'}

In [13]:
# There is somethings that not quite right
# 2017 dataset has an additional column which is disbursement_method
data_2017.drop(['disbursement_method'], axis=1, inplace=True)

In [14]:
# Ensure the loan IDs are a unique key
assert len(set(data_2019.id)) == len(data_2019)
assert len(set(data_2017.id)) == len(data_2017)

In [18]:
columns = list(data_2017.columns)
assert sorted(columns) == sorted(data_2019.columns)

In [43]:
# Join the files. Because we have previously confirmed that each
# set of files contained the same sets of IDs *and* that these
# IDs were unique, the two files can be joined safely

# Just in case, check the datasets have the same number of rows
n_rows = len(data_2017)
assert n_rows == len(data_2019)

# Merge them
combined = pd.merge(data_2017, data_2019, how = 'inner', on="id", suffixes=('_x', '_y'))

# Ensure the merged dataset has the same number of rows
assert n_rows == len(combined)

In [47]:
# Go through each column, and find what percentage of the values in
# that column are identical in the two datasets
static_perc = {}

# Remove the ID column first
columns = [i for i in columns if i != "id"]

for i in columns:
    combined[i+"_comp"] = (combined[i+"_x"] == combined[i+"_y"]) | (combined[i+"_x"].isnull() & combined[i+"_y"].isnull())
    static_perc[i] = combined[i+"_comp"].sum()*100.0/len(combined)

In [50]:
static_perc_data = pd.DataFrame([ [i, static_perc[i]] for i in static_perc], columns=["column", "perc_equal"]).\
                                                    sort_values("perc_equal", ascending = False)
static_perc_data

Unnamed: 0,column,perc_equal
0,member_id,100.000000
64,open_il_12m,100.000000
94,num_actv_rev_tl,100.000000
93,num_actv_bc_tl,100.000000
92,num_accts_ever_120_pd,100.000000
...,...,...
40,total_pymnt_inv,23.959152
39,total_pymnt,23.959038
51,last_fico_range_low,13.783135
50,last_fico_range_high,13.783135


In [58]:
cols_to_pick = ["loan_amnt",
"funded_amnt",
"revol_util",
"revol_bal",
"fico_range_low",
"fico_range_high",
"pub_rec",
"open_acc",
"earliest_cr_line",
"delinq_2yrs",
"dti",
"purpose",
"issue_d",
"annual_inc",
"home_ownership",
"emp_length",
"grade",
"term",
"int_rate",
"installment",
"verification_status",
"recoveries",
"loan_status",
"last_pymnt_d",
"total_pymnt"]

# Ensure that the columns we want to pick for our model are in
# the above, and check how consistent they've been
assert set(cols_to_pick) - set(static_perc_data.column) - set(["id"]) == set()

static_perc_data[static_perc_data.column.isin(cols_to_pick)]

Unnamed: 0,column,perc_equal
1,loan_amnt,100.0
32,pub_rec,100.0
31,open_acc,100.0
27,fico_range_high,100.0
26,fico_range_low,100.0
25,earliest_cr_line,100.0
24,delinq_2yrs,100.0
23,dti,100.0
19,purpose,100.0
34,revol_util,100.0


In [59]:
#Consider the columns that were not consistent in both datasets

# First, make sure the columns in which int_rate and installment
# are different are the same columns. Notice they are not all the 
assert (combined["int_rate_comp"] != combined["installment_comp"]).sum() == 0

AssertionError: 

In [61]:
# There are 35 rows where the int_rate_comp and installment_comp values are not 
(combined["int_rate_comp"] != combined["installment_comp"]).sum()

35

In [67]:
# what do you notice looking at these loans?
combined.loc[(combined["int_rate_comp"] != combined["installment_comp"]),:].loc[:,['id','int_rate_x','int_rate_y','term_x','term_y','installment_x','installment_y']]

Unnamed: 0,id,int_rate_x,int_rate_y,term_x,term_y,installment_x,installment_y
39423,99535008,6.00%,6.00%,60 months,60 months,691.94,683.53
104473,111428146,6.00%,6.00%,36 months,36 months,304.18,298.27
128866,109651663,13.59%,13.59%,36 months,36 months,203.88,199.55
135919,110022313,20.00%,20.00%,36 months,36 months,628.07,595.85
152502,108147170,6.00%,6.00%,36 months,36 months,122.78,122.7
159709,107600128,12.74%,12.74%,60 months,60 months,361.93,348.79
174039,106508810,18.99%,18.99%,36 months,36 months,73.31,67.06
203968,120018914,7.35%,7.35%,36 months,36 months,1241.5,1235.0
244907,117273468,12.62%,12.62%,36 months,36 months,33.52,32.04
278811,114853903,6.00%,6.00%,36 months,36 months,591.71,573.14


## Question 3