In [77]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import warnings
import os
import numpy as np
import pickle
from sys import platform
import datetime
import seaborn

warnings.filterwarnings('ignore')


In [78]:
# Question 1

In [79]:
def is_integer(x):
    """Return True when ``x`` can be read as a whole number.

    ``x`` is converted with both ``int`` and ``float`` and the two results
    must agree. Anything that cannot be converted (``None``, NaN, infinity,
    free text, or a decimal string such as ``"1.0"`` that ``int`` refuses
    to parse) yields False.

    Parameters
    ----------
    x : object
        Value to test, typically a string read from a CSV file.

    Returns
    -------
    bool
    """
    try:
        return int(x) == float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not an integer"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being hidden.
        return False

In [80]:
def ingest_files(directory: str):
    """Read every regular file in ``directory`` into a DataFrame.

    Each file is read with ``pd.read_csv`` with its first line skipped and
    every column loaded as a string. Rows whose ``id`` is not an integer
    (as judged by ``is_integer``) are dropped; presumably these are
    non-data lines embedded in the export -- TODO confirm against the raw
    files.

    Parameters
    ----------
    directory : str
        Path of the folder to read; a trailing slash is added if missing.

    Returns
    -------
    dict
        Maps each file name to its cleaned DataFrame, in sorted file-name
        order.

    Raises
    ------
    ValueError
        If ``directory`` is an empty string.
    """
    if not directory:
        raise ValueError("directory must be a non-empty path")

    # If the directory has no trailing slash, add one
    if not directory.endswith("/"):
        directory = directory + "/"

    # os.listdir returns entries in arbitrary order and also lists
    # sub-directories. Sorting makes the result (and anything concatenated
    # from it) reproducible; skipping non-files avoids read_csv failing on
    # folders.
    all_files = sorted(
        name for name in os.listdir(directory)
        if os.path.isfile(directory + name)
    )
    output = {}

    print("Directory " + directory + " has " + str(len(all_files)) + " files:")
    for name in all_files:
        print(" Reading file " + name)
        frame = pd.read_csv(directory + name, dtype=str, skiprows=1)

        # Keep only the rows whose ID parses as an integer
        valid_rows = frame.id.apply(is_integer).astype(bool)
        n_invalid = int((~valid_rows).sum())
        if n_invalid > 0:
            print(" Found " + str(n_invalid) + " invalid rows which were removed")
            frame = frame[valid_rows]

        output[name] = frame

    return output

In [81]:
# Folder holding the full dataset files downloaded in 2019
dir_full = "/".join(["Data", "full_dataset"])

# Read every file in that folder (one DataFrame per file name)
files_full = ingest_files(dir_full)

# Stack the per-file frames into one frame with a fresh 0..n-1 index
final_data = pd.concat(list(files_full.values()), ignore_index=True)

Directory Data/full_dataset/ has 15 files:
 Reading file LoanStats_securev1_2016Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q4.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q4.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2018Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2018Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2018Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStat

In [32]:
# Write the combined frame to final_data.csv in the working directory,
# leaving out the index column
final_data.to_csv("final_data.csv",index=False)

In [33]:
# (rows, columns) of the combined frame
final_data.shape

(1763077, 150)

In [39]:
# Count rows that exactly repeat an earlier row across all columns
final_data.duplicated().sum()

0

In [41]:
# Number of missing values in each column
final_data.isnull().sum()

id                             0
member_id                1763077
loan_amnt                      0
funded_amnt                    0
funded_amnt_inv                0
                          ...   
settlement_status        1736832
settlement_date          1736832
settlement_amount        1736832
settlement_percentage    1736832
settlement_term          1736832
Length: 150, dtype: int64

In [None]:
# Question 2

In [34]:
# Folders holding the files downloaded in 2017 and in 2019
dir_2017 = "/".join(["Data", "1712_download"])
dir_2019 = "/".join(["Data", "1912_download"])

# Read the 2017 download first, then the 2019 download
files_2017 = ingest_files(dir_2017)
files_2019 = ingest_files(dir_2019)

Directory Data/1712_download/ has 8 files:
 Reading file LoanStats_securev1_2016Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q4.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q4.csv
 Found 2 invalid rows which were removed
Directory Data/1912_download/ has 8 files:
 Reading file LoanStats_securev1_2016Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q3.csv
 Found 2 invalid row

In [82]:
# Both downloads must hold the same number of files, and those files
# must carry the same names
assert len(files_2019) == len(files_2017)
assert sorted(files_2019.keys()) == sorted(files_2017.keys())

In [84]:
# Every file must contain exactly the same loan IDs in both downloads
for file_name, frame_2017 in files_2017.items():
    assert sorted(frame_2017.id) == sorted(files_2019[file_name].id)

In [85]:
# Stack the per-file frames of each download into a single frame with a
# fresh 0..n-1 index

data_2017 = pd.concat(list(files_2017.values()), ignore_index=True)
data_2019 = pd.concat(list(files_2019.values()), ignore_index=True)


In [86]:
# Preview the first five rows of the combined 2017 download
data_2017.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,75758152,,34000,34000,34000,60 months,15.31%,814.41,C,C5,...,,,Cash,N,,,,,,
1,75277223,,12200,12200,12200,60 months,11.47%,268.13,B,B5,...,,,Cash,N,,,,,,
2,75909198,,10000,10000,10000,36 months,11.47%,329.62,B,B5,...,,,Cash,N,,,,,,
3,67969208,,21000,21000,21000,36 months,11.47%,692.2,B,B5,...,,,Cash,N,,,,,,
4,76151755,,35000,35000,35000,60 months,21.18%,950.42,E,E3,...,,,Cash,N,,,,,,


In [87]:
# Preview the first five rows of the combined 2019 download
data_2019.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,74121690,,6000,6000,6000,36 months,12.99%,202.14,C,C2,...,,,,N,,,,,,
1,74724861,,21000,21000,21000,60 months,19.53%,550.9,D,D5,...,,,,N,,,,,,
2,74826201,,7200,7200,7200,36 months,5.32%,216.83,A,A1,...,,,,N,,,,,,
3,75061311,,12000,12000,12000,60 months,11.99%,266.88,C,C1,...,,,,N,,,,,,
4,75091735,,11425,11425,11425,36 months,19.53%,421.87,D,D5,...,,,,N,,,,,,


In [67]:
# The 2017 dataset has an additional column. Drop it by rebinding rather
# than in place, and tolerate the column already being gone, so that
# re-running this cell does not raise a KeyError.
data_2017 = data_2017.drop(columns=['disbursement_method'], errors='ignore')

In [68]:
# Show (rows, columns) of both downloads, one per line
print(data_2017.shape, data_2019.shape, sep="\n")

(877986, 150)
(877986, 150)


In [69]:
# Column names of each download
A = set(data_2017.columns)
B = set(data_2019.columns)

# Symmetric difference: columns present in only one of the two downloads.
# A one-sided A - B would miss columns that exist only in the 2019 data.
A ^ B

set()

In [70]:
# The loan ID must identify each row of the 2019 data uniquely
assert data_2019["id"].is_unique

In [71]:
columns = data_2017.columns.tolist()

# Double check that both downloads expose exactly the same column names
assert sorted(data_2019.columns) == sorted(columns)

In [72]:
# Join the two downloads on the loan ID. Earlier cells checked that each
# pair of files holds the same IDs and that the IDs are unique, so the
# join is expected to pair every row exactly once.

# Just in case, both frames must start with the same number of rows
n_rows = len(data_2017)
assert len(data_2019) == n_rows

# Columns from the 2017 data get the suffix _x, those from 2019 get _y
combined = data_2017.merge(data_2019, on="id", how="inner", suffixes=("_x", "_y"))

# The join must neither lose nor duplicate rows
assert len(combined) == n_rows

In [73]:
# For every column, work out the percentage of rows whose value is the
# same in the two downloads (two missing values count as equal)
static_perc = {}

# The ID is the join key, so it is not compared
columns = [name for name in columns if name != "id"]

for name in columns:
    left = combined[name + "_x"]
    right = combined[name + "_y"]
    both_missing = left.isnull() & right.isnull()

    # Per-row flag, kept on the frame as <column>_comp
    combined[name + "_comp"] = (left == right) | both_missing
    static_perc[name] = combined[name + "_comp"].sum() * 100.0 / len(combined)

In [74]:
# Turn the {column: percentage} mapping into a two-column table ordered
# from most to least consistent. This rebinds static_perc from a dict to
# a DataFrame.
static_perc = pd.DataFrame(
    list(static_perc.items()), columns=["column", "perc_equal"]
).sort_values("perc_equal", ascending=False)

In [75]:
# Every column chosen for the model (other than the ID) must appear in
# the consistency table; then show how consistent those columns are.
# NOTE(review): cols_to_pick is not defined in the visible part of this
# notebook and the recorded run of this cell ended in a NameError --
# define it before this cell.
assert (set(cols_to_pick) - {"id"}) <= set(static_perc.column)

static_perc[static_perc.column.isin(cols_to_pick)]

NameError: name 'cols_to_pick' is not defined

In [None]:
# Look at the columns that were not consistent across the two datasets

# First, make sure int_rate and installment differ on exactly the same
# rows, i.e. their per-row comparison flags agree everywhere
assert (combined["int_rate_comp"] == combined["installment_comp"]).all()

In [None]:
# int_rate and installment change on the same rows, so inspect the rows
# whose int_rate differs between the two downloads
combined.loc[
    ~combined["int_rate_comp"],
    ["id", "issue_d_x", "issue_d_y", "term_x", "term_y", "int_rate_x", "int_rate_y"],
]

In [None]:
# Question 3

In [None]:
# Prepare the final dataset

In [30]:
# Report how many rows the combined frame has before the cleaning below
print("Starting with " + str(len(final_data)) + " rows")

Starting with 1763077 rows


In [31]:
# Cast every column listed in float_cols to float.
# NOTE(review): float_cols is not defined in the visible part of this
# notebook and the recorded run of this cell ended in a NameError.
for col in float_cols:
    final_data[col] = final_data[col].astype(float)
    
def clean_perc(x):
    """Convert a percentage string such as ``"15.31%"`` to a float.

    Parameters
    ----------
    x : str, number or missing value
        Missing values give NaN. Values that are not strings (for example
        floats produced by an earlier run of the cleaning cell) are passed
        through ``float`` unchanged, so the cleaning can be re-run.

    Returns
    -------
    float
        The numeric value without the percent sign, or NaN when missing.
    """
    if pd.isnull(x):
        return np.nan
    if not isinstance(x, str):
        return float(x)
    # Strip the sign explicitly rather than cutting the last character,
    # which would silently drop a digit from a value with no '%'.
    return float(x.strip().rstrip("%"))
# Convert every column listed in perc_cols from percentage text to float
for col in perc_cols:
    final_data[col] = final_data[col].apply(clean_perc)
    
def clean_date(x):
    """Convert a ``"Mon-YYYY"`` string such as ``"Dec-2015"`` to a date.

    Parameters
    ----------
    x : str, datetime.date, datetime.datetime or missing value
        Missing values give None. Values that are already dates (for
        example after an earlier run of the cleaning cell) are returned as
        plain ``datetime.date`` objects, so the cleaning can be re-run.

    Returns
    -------
    datetime.date or None
        The first day of the parsed month, or None when missing.

    Raises
    ------
    ValueError
        If a string does not match the ``%b-%Y`` format. ``%b`` is the
        locale's abbreviated month name.
    """
    if pd.isnull(x):
        return None
    # datetime.datetime subclasses datetime.date, so test it first
    if isinstance(x, datetime.datetime):
        return x.date()
    if isinstance(x, datetime.date):
        return x
    return datetime.datetime.strptime(x.strip(), "%b-%Y").date()
# Parse every column listed in date_cols into date objects
for col in date_cols:
    final_data[col] = final_data[col].apply(clean_date)
    
# In every column listed in cat_cols, overwrite missing entries with None
for col in cat_cols:
    missing = final_data[col].isnull()
    final_data.loc[missing, col] = None

NameError: name 'float_cols' is not defined