In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import warnings
import os
import numpy as np
import pickle
from sys import platform
import datetime
import seaborn

warnings.filterwarnings('ignore')


In [3]:
                                            #QUESTION1

In [4]:
def is_integer(x):
   
    try:
        return (int(x) == float(x))
    except:
        return False

In [5]:
def ingest_files(directory: str):
    
    
    # If the directory has no trailing slash, add one
    if directory[-1] != "/":
        directory = directory + "/"
    
    all_files = os.listdir(directory)
    output = {}
    
    print("Directory " + directory + " has " + str(len(all_files)) + " files:")
    for i in all_files:
        print(" Reading file " + i)
        output[i] = pd.read_csv(directory + i, dtype = str, skiprows = 1)
        
        # To remove those lines, find any lines with non-integer IDs
        # and remove them
        invalid_rows = (output[i].id.apply( lambda x : is_integer(x) == False ))
        if invalid_rows.sum() > 0:
            print(" Found " + str(invalid_rows.sum()) + " invalid rows which were removed")
            output[i] = output[i][invalid_rows == False]
    
    return output

In [6]:
# Define the directories that contain the full dataset files downloaded in 2019
dir_full = "/".join(["Data"] + ["full_dataset"] )

# Ingest the set of files
files_full = ingest_files(dir_full)

final_data = pd.concat(files_full.values()).reset_index(drop = True)

Directory Data/full_dataset/ has 15 files:
 Reading file LoanStats_securev1_2018Q4.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2018Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2018Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2018Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2019Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2019Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2019Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q4.csv
 Found 2 invalid rows which were removed
 Reading file LoanStat

In [50]:
final_data.to_csv("final_data.csv",index=False)

In [51]:
final_data.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,74121690,,6000,6000,6000,36 months,12.99%,202.14,C,C2,...,,,,N,,,,,,
1,74724861,,21000,21000,21000,60 months,19.53%,550.9,D,D5,...,,,,N,,,,,,
2,74826201,,7200,7200,7200,36 months,5.32%,216.83,A,A1,...,,,,N,,,,,,
3,75061311,,12000,12000,12000,60 months,11.99%,266.88,C,C1,...,,,,N,,,,,,
4,75091735,,11425,11425,11425,36 months,19.53%,421.87,D,D5,...,,,,N,,,,,,


In [52]:
final_data.shape

(1763077, 150)

In [53]:
final_data.duplicated().sum()

0

In [54]:
final_data.isnull().sum()

id                             0
member_id                1763077
loan_amnt                      0
funded_amnt                    0
funded_amnt_inv                0
                          ...   
settlement_status        1736832
settlement_date          1736832
settlement_amount        1736832
settlement_percentage    1736832
settlement_term          1736832
Length: 150, dtype: int64

In [55]:
                                        #QUESTION 2

In [7]:
# Tell where the directories that contain the files downloaded in 2017 and 2019
dir_2017 = "/".join(["Data"] + ["1712_download"] )
dir_2019 = "/".join(["Data"] + ["1912_download"] )

# Ingest the set of files downloaded in 2017 and then the files downloaded in 2019
files_2017 = ingest_files(dir_2017)
files_2019 = ingest_files(dir_2019)

Directory Data/1712_download/ has 8 files:
 Reading file LoanStats_securev1_2017Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q4.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q3.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2016Q4.csv
 Found 2 invalid rows which were removed
Directory Data/1912_download/ has 8 files:
 Reading file LoanStats_securev1_2017Q1.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q2.csv
 Found 2 invalid rows which were removed
 Reading file LoanStats_securev1_2017Q3.csv
 Found 2 invalid row

In [8]:
# Check the two sets of files have the same number of files
# with the same names
assert len(files_2017) == len(files_2019)
assert sorted(files_2017) == sorted(files_2019)

In [9]:
# Ensure each set of files has the same loan IDs in both instances
for i in files_2017:
    assert sorted(files_2017[i].id) == sorted(files_2019[i].id)

In [10]:
# Combined

data_2017 = pd.concat(files_2017.values()).reset_index(drop = True)
data_2019 = pd.concat(files_2019.values()).reset_index(drop = True)


In [13]:
data_2017.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,104046702,,14000,14000,14000,60 months,12.74%,316.69,C,C1,...,,,Cash,N,,,,,,
1,103470623,,15400,15400,15400,60 months,11.39%,337.84,B,B3,...,,,Cash,N,,,,,,
2,104028593,,4600,4600,4600,36 months,11.39%,151.45,B,B3,...,,,Cash,N,,,,,,
3,104280113,,15000,15000,15000,36 months,5.32%,451.73,A,A1,...,,,Cash,N,,,,,,
4,104046719,,14000,14000,14000,36 months,15.99%,492.13,C,C5,...,,,Cash,N,,,,,,


In [12]:
data_2019.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,103470623,,15400,15400,15400,60 months,11.39%,337.84,B,B3,...,,,,N,,,,,,
1,104090304,,2000,2000,2000,36 months,16.99%,71.3,D,D1,...,,,,N,,,,,,
2,104048783,,13000,13000,13000,36 months,11.44%,428.32,B,B4,...,,,,N,,,,,,
3,104280288,,3025,3025,3025,36 months,6.99%,93.39,A,A2,...,,,,N,,,,,,
4,104047067,,10000,10000,10000,36 months,7.99%,313.32,A,A5,...,,,,N,,,,,,


In [14]:
print(data_2017.shape)
print(data_2019.shape)

(877986, 151)
(877986, 150)


In [16]:
# There is somethings that not quite right
# 2017 dataset has an additional column which is disbursement_method
data_2017.drop(['disbursement_method'], axis=1, inplace=True)

In [17]:
data_2017.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,104046702,,14000,14000,14000,60 months,12.74%,316.69,C,C1,...,,,,N,,,,,,
1,103470623,,15400,15400,15400,60 months,11.39%,337.84,B,B3,...,,,,N,,,,,,
2,104028593,,4600,4600,4600,36 months,11.39%,151.45,B,B3,...,,,,N,,,,,,
3,104280113,,15000,15000,15000,36 months,5.32%,451.73,A,A1,...,,,,N,,,,,,
4,104046719,,14000,14000,14000,36 months,15.99%,492.13,C,C5,...,,,,N,,,,,,


In [18]:
print(data_2017.shape)
print(data_2019.shape)

(877986, 150)
(877986, 150)


In [19]:
#sets, unlike lists or tuples, cannot have multiple occurrences 
#of the same element and store unordered values

A=set(data_2017.columns)
B=set(data_2019.columns)

A-B

set()

In [23]:
# Ensure the loan IDs are a unique key
assert len(set(data_2019.id)) == len(data_2019)

assert len(set(data_2017.id)) == len(data_2017)


In [24]:
columns = list(data_2017.columns)

# We verified every file had the same columns, but double check just in case
assert sorted(columns) == sorted(data_2019.columns)

In [25]:
# Join the files. Because we have previously confirmed that each
# set of files contained the same sets of IDs *and* that these
# IDs were unique, the two files can be joined safely

# Just in case, check the datasets have the same number of rows
n_rows = len(data_2017)
assert n_rows == len(data_2019)

# Merge them
combined = pd.merge(data_2017, data_2019, how = 'inner', on="id", suffixes=('_x', '_y'))

# Ensure the merged dataset has the same number of rows
assert n_rows == len(combined)

In [26]:
combined.head(10)

Unnamed: 0,id,member_id_x,loan_amnt_x,funded_amnt_x,funded_amnt_inv_x,term_x,int_rate_x,installment_x,grade_x,sub_grade_x,...,orig_projected_additional_accrued_interest_y,hardship_payoff_balance_amount_y,hardship_last_payment_amount_y,debt_settlement_flag_y,debt_settlement_flag_date_y,settlement_status_y,settlement_date_y,settlement_amount_y,settlement_percentage_y,settlement_term_y
0,104046702,,14000,14000,14000,60 months,12.74%,316.69,C,C1,...,,,,N,,,,,,
1,103470623,,15400,15400,15400,60 months,11.39%,337.84,B,B3,...,,,,N,,,,,,
2,104028593,,4600,4600,4600,36 months,11.39%,151.45,B,B3,...,,,,N,,,,,,
3,104280113,,15000,15000,15000,36 months,5.32%,451.73,A,A1,...,,,,N,,,,,,
4,104046719,,14000,14000,14000,36 months,15.99%,492.13,C,C5,...,,,,N,,,,,,
5,104048967,,5000,5000,5000,36 months,25.49%,200.1,E,E4,...,,,,N,,,,,,
6,104170234,,1500,1500,1500,36 months,5.32%,45.18,A,A1,...,,,,N,,,,,,
7,104049706,,6000,6000,6000,36 months,8.24%,188.69,B,B1,...,,,,N,,,,,,
8,104045574,,10000,10000,10000,36 months,7.99%,313.32,A,A5,...,,,,N,,,,,,
9,104047042,,4700,4700,4700,36 months,15.99%,165.22,C,C5,...,,,,N,,,,,,


In [27]:
# Go through each column, and find what percentage of the values in
# that column are identical in the two datasets
static_perc = {}

# Remove the ID column first
columns = [i for i in columns if i != "id"]

for i in columns:
    combined[i+"_comp"] = (combined[i+"_x"] == combined[i+"_y"]) | (combined[i+"_x"].isnull() & combined[i+"_y"].isnull())
    static_perc[i] = combined[i+"_comp"].sum()*100.0/len(combined)

In [28]:
static_perc[i]

97.89711908845926

In [31]:
static_perc = pd.DataFrame([ [i, static_perc[i]] for i in static_perc], columns=["column", "perc_equal"]).\
                                                    sort_values("perc_equal", ascending = False)

In [32]:
static_perc

Unnamed: 0,column,perc_equal
0,member_id,100.000000
64,open_il_12m,100.000000
94,num_actv_rev_tl,100.000000
93,num_actv_bc_tl,100.000000
92,num_accts_ever_120_pd,100.000000
...,...,...
40,total_pymnt_inv,23.959152
39,total_pymnt,23.959038
51,last_fico_range_low,13.783135
50,last_fico_range_high,13.783135


In [33]:
cols_to_pick = ["id","member_id_x","loan_amnt_x","funded_amnt_x","funded_amnt_inv_x",
                "term_x","int_rate_x","installment_x","grade_x","sub_grade_x"]

In [39]:
# Ensure that the columns we want to pick for our model are in
# the above, and check how consistent they've been
set(cols_to_pick) - set(static_perc.column) - set(["id"])
#assert set(cols_to_pick) - set(static_perc.column) - set(["id"]) == set()


{'funded_amnt_inv_x',
 'funded_amnt_x',
 'grade_x',
 'installment_x',
 'int_rate_x',
 'loan_amnt_x',
 'member_id_x',
 'sub_grade_x',
 'term_x'}

In [36]:
static_perc[static_perc.column.isin(cols_to_pick)]

Unnamed: 0,column,perc_equal


In [77]:
#Consider the columns that were not consistent in both datasets

# First, make sure the columns in which int_rate and installment
# are different are the same columns
assert (combined["int_rate_comp"] != combined["installment_comp"]).sum() == 0

AssertionError: 

In [78]:
(combined["int_rate_comp"] != combined["installment_comp"]).sum()

35

In [79]:
## what do you notice looking at these loans?
combined.loc[(combined["int_rate_comp"] != combined["installment_comp"]),:].loc[:,['id','int_rate_x','int_rate_y','term_x','term_y','installment_x','installment_y']]

Unnamed: 0,id,int_rate_x,int_rate_y,term_x,term_y,installment_x,installment_y
39618,74722704,10.75%,10.75%,60 months,60 months,432.36,405.03
88840,71003261,6.00%,11.16%,36 months,36 months,452.35,452.35
99418,70391991,6.00%,6.00%,36 months,36 months,655.91,653.38
102292,69980465,6.00%,6.00%,60 months,60 months,704.42,699.75
184837,78600054,13.67%,13.67%,60 months,60 months,461.96,441.9
198862,77538744,6.00%,6.00%,36 months,36 months,1032.57,1032.72
231097,76012596,6.00%,6.00%,60 months,60 months,467.41,461.68
295368,86136443,14.49%,14.49%,60 months,60 months,258.76,134.1
320463,83709004,19.99%,19.99%,60 months,60 months,303.04,279.86
384458,93413738,12.74%,12.74%,36 months,36 months,211.49,195.32


In [110]:
# So the differences in int_rate and installment are the same. Let's look at them
combined[combined.int_rate_comp == False][["id", "issue_d_x", "issue_d_y", "term_x", "term_y", "int_rate_x", "int_rate_y"]]

Unnamed: 0,id,issue_d_x,issue_d_y,term_x,term_y,int_rate_x,int_rate_y
11702,75132541,Mar-2016,Mar-2016,60 months,60 months,7.89%,6.00%
12709,74131643,Mar-2016,Mar-2016,36 months,36 months,6.97%,6.00%
18972,75101560,Mar-2016,Mar-2016,60 months,60 months,19.53%,6.00%
20946,75314580,Mar-2016,Mar-2016,60 months,60 months,10.75%,6.00%
26817,74522695,Mar-2016,Mar-2016,60 months,60 months,18.99%,6.00%
...,...,...,...,...,...,...,...
862147,118169441,Oct-2017,Oct-2017,60 months,60 months,18.06%,6.00%
862390,120953984,Oct-2017,Oct-2017,36 months,36 months,14.08%,6.00%
867463,120631915,Oct-2017,Oct-2017,60 months,60 months,9.93%,6.00%
871222,120554473,Oct-2017,Oct-2017,36 months,36 months,12.62%,6.00%


In [111]:
#Prepare Final Dataset

In [207]:
# Keep only these columns of interest

cols_to_pick = ["id", "loan_amnt", "funded_amnt", "term", "int_rate", "grade", "emp_length", "home_ownership", "annual_inc", 
"verification_status", "issue_d","loan_status", "purpose", "dti", "delinq_2yrs", "earliest_cr_line", "fico_range_low",
"fico_range_high","open_acc", "pub_rec", "revol_bal", "revol_util", "total_pymnt", "recoveries", "perc"]


In [208]:
filepath = os.path.join(os.getcwd(), 'data', 'final_data.csv')

# # parsing the issue_d and earliest_cr_line columns as a dates
data_all = pd.read_csv(filepath, parse_dates=["issue_d","earliest_cr_line"])

In [209]:
data_all.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,74121690,,6000,6000,6000.0,36 months,12.99%,202.14,C,C2,...,,,,N,,,,,,
1,74724861,,21000,21000,21000.0,60 months,19.53%,550.9,D,D5,...,,,,N,,,,,,
2,74826201,,7200,7200,7200.0,36 months,5.32%,216.83,A,A1,...,,,,N,,,,,,
3,75061311,,12000,12000,12000.0,60 months,11.99%,266.88,C,C1,...,,,,N,,,,,,
4,75091735,,11425,11425,11425.0,36 months,19.53%,421.87,D,D5,...,,,,N,,,,,,


In [210]:
# Some of the columns selected will not be used directly in the model,
# but will be used to generate other features.
#
# Create variables specifying the features that will be used
#perc = (round(perc,2).sort_values(ascending=False))

data_all['perc'] = data_all.isnull().sum()/data_all.isnull().count() *100 

In [212]:
# Keep only the columns of interest
final_data = data_all[cols_to_pick].copy()

In [213]:
final_data.head()

Unnamed: 0,id,loan_amnt,funded_amnt,term,int_rate,grade,emp_length,home_ownership,annual_inc,verification_status,...,earliest_cr_line,fico_range_low,fico_range_high,open_acc,pub_rec,revol_bal,revol_util,total_pymnt,recoveries,perc
0,74121690,6000,6000,36 months,12.99%,C,9 years,MORTGAGE,43000.0,Not Verified,...,2006-05-01,675,679,16,0,15055,63%,7268.153165,0.0,
1,74724861,21000,21000,60 months,19.53%,D,10+ years,MORTGAGE,65000.0,Verified,...,2000-03-01,710,714,13,0,124946,65.5%,23643.13,0.0,
2,74826201,7200,7200,36 months,5.32%,A,10+ years,MORTGAGE,49000.0,Source Verified,...,2001-12-01,750,754,11,0,9309,18.4%,7223.41,0.0,
3,75061311,12000,12000,60 months,11.99%,C,10+ years,MORTGAGE,49000.0,Not Verified,...,1999-12-01,670,674,10,0,12152,50.6%,8448.9,443.49,
4,75091735,11425,11425,36 months,19.53%,D,5 years,RENT,26000.0,Source Verified,...,2008-09-01,730,734,11,0,2096,7.2%,14331.034913,0.0,


In [214]:
data_type = pd.DataFrame(final_data.dtypes,columns=['dtypes'])

In [215]:
data_type

Unnamed: 0,dtypes
id,int64
loan_amnt,int64
funded_amnt,int64
term,object
int_rate,object
grade,object
emp_length,object
home_ownership,object
annual_inc,float64
verification_status,object


In [None]:
# Typecast Columns

In [244]:
for i in float_cols:
    final_data[i] = final_data[i].astype(float)
    
def clean_perc(x):
    if pd.isnull(x):
        return np.nan
    else:
        return float(x.strip()[:-1])
for i in perc_cols:
    final_data[i] = final_data[i].apply( clean_perc )
    
def clean_date(issue_d ):
    if pd.isnull(issue_d):
        return None
    else:
        return datetime.datetime.strptime( str(issue_d), '%Y-%m-%d').date()
for i in date_cols:
    final_data[i] = final_data[i].apply( clean_date )
    
for i in cat_cols:
    final_data.loc[final_data[i].isnull(), i] = None

In [245]:
# # Identify the columns we'll be keeping from the dataset

# ## you will need to add the lists to each column type defined below


# # Identify the type of each of these column
float_cols = ["annual_inc", "dti", "total_pymnt", "recoveries"]

cat_cols = ["term", "int_rate", "grade", "emp_length", "home_ownership","verification_status",
            "loan_status", "purpose","revol_util"]

perc_cols = ["perc"]

date_cols = ["issue_d"]

int_cols = ["id", "loan_amnt", "funded_amnt", "delinq_2yrs", "fico_range_low","fico_range_high",
            "open_acc", "pub_rec", "revol_bal"]

# Ensure that we have types for every column
assert set(cols_to_pick) - set(float_cols) - set(cat_cols) - set(perc_cols) - set(date_cols) - set(int_cols) == set(["id"])

AssertionError: 

In [246]:
# # Some of the columns selected will not be used directly in the model,
# # but will be used to generate other features.
# #
# # Create variables specifying the features that will be used

# All categorical columns other than "loan_status" will be used as
# discrete features
discrete_features = list(set(cat_cols) - set(["loan_status"]))

# All numeric columns will be used as continuous features
continuous_features = list(float_cols + perc_cols)

In [247]:
print("Starting with " + str(len(final_data)) + " rows")

Starting with 0 rows


In [248]:
#Drop null values

In [249]:
# Deal with null values. We allow cateogrical variables to be null
# OTHER than grade, which is a particularly important categorical.
# All non-categorical variables must be non-null, and we drop
# rows that do not meet this requirement
required_cols = set(cols_to_pick) - set(cat_cols) - set(["id"])
required_cols.add("grade")

n_rows = len(final_data)
final_data.dropna(subset = required_cols ,inplace=True)
print("Removed " + str(n_rows - len(final_data)) + " rows")

Removed 0 rows


In [None]:
#@uestion 3 Keep Terminated Loans

In [250]:
# select only terminated loans
final_data = final_data[final_data.loan_status.isin(['Fully Paid','Charged Off','Default'])]
                        
print("Ending with " + str(len(final_data)) + " rows")

Ending with 0 rows


In [None]:
#Question 4  Visualize the variables

In [None]:
def visualize_columns():
    '''
    This function visualizes all columns
      - Box-and-whisker plots for continuous variables
      - Lists of distinct values for categorical columns
      - A timeline density for dates
    '''
    
    # FLoat columns
    for i in float_cols + perc_cols + ret_cols:
        seaborn.boxplot(final_data[i])

        # Print the three highest values
        highest_vals = sorted(final_data[i], reverse=True)[:3]
        smallest_val = min(final_data[i])
        plt.text(smallest_val, -0.3, highest_vals[0])
        plt.text(smallest_val, -0.2, highest_vals[1])
        plt.text(smallest_val, -0.1, highest_vals[2])

        plt.show()
        
    # Categorical columns 
    for i in cat_cols:
        print(i)
        print(str(len(set(final_data[i]))) + " distinct values")
        print(final_data[i].value_counts())
        print("")
        print("")
    
    # Date columns
    for i in date_cols:
        final_data[final_data[i].isnull() == False][i].apply(lambda x : str(x.year) +
                                                "-" + str(x.month)).value_counts(ascending = True).plot()
        plt.title(i + " (" + str(final_data[i].isnull().sum()) + " null values)")
        plt.show()

In [None]:
# Handle outliers

In [None]:
# There are quite a few outliers, but the two most obvious
# ones to remove are in annual_inc, revol_util Remove these.

## ?? below indicate you need to provide the parameters (a number) that identifies the outliers
## use the same syntax to remove outliers from other columns as you see fit

n_rows = len(final_data)
final_data = final_data[final_data.annual_inc < ??]
final_data = final_data[final_data.revol_util < ??]
print("Removed " + str(n_rows - len(final_data)) + " rows")

In [None]:
# Only include loans isssued since 2011
n_rows = len(final_data)
final_data = final_data[final_data.issue_d >= datetime.date(2011, 1, 1)]
print("Removed " + str(n_rows - len(final_data)) + " rows")

In [None]:
# Visualize the data again
visualize_columns()

In [None]:
 #Question 5

In [None]:
# Define the output path for the pickle
pickle_file = "/".join(["Data"] + ["clean_data.pickle"] )

pickle.dump( [final_data, discrete_features, continuous_features, ret_cols], open(pickle_file, "wb") )

In [None]:
# Part 3 – Data Exploration 
# Question 1

In [None]:
# Calculate Returns

In [None]:
# Define the names of the four returns we'll be calculating
ret_cols = ["ret_PESS", "ret_OPT", "ret_INTa", "ret_INTb", "ret_INTc"]

In [None]:
# Remove all rows for loans that were paid back on the days
# they were issued
final_data['loan_length'] = (final_data.last_pymnt_d - final_data.issue_d) / np.timedelta64(1, 'M')

n_rows = len(final_data)

final_data = final_data[final_data.loan_length != 0]

print("Removed " + str(n_rows - len(final_data)) + " rows")

In [None]:
# Question 2

In [None]:
# Return Method 2 (pessimistic)

In [None]:
# Calculate the return using a simple annualized profit margin
# Pessimistic fefinition (method 2)

final_data['term_num'] = final_data.term.str.extract('(\d+)',expand=False).astype(int)
final_data['ret_PESS'] = ( (final_data.total_pymnt - final_data.funded_amnt) 
                                            / final_data.funded_amnt ) * (12 / final_data['term_num'])

In [None]:
#Return Method 1 (optimistic)

In [None]:
# Assuming that if a loan gives a positive return, we can
# immediately find a similar loan to invest in; if the loan
# takes a loss, we use method 2 to compute the return

final_data['ret_OPT'] = ( (final_data.total_pymnt - final_data.funded_amnt)
                                            / final_data.funded_amnt ) * (12 / final_data['loan_length'])
final_data.loc[final_data.ret_OPT < 0,'ret_OPT'] = final_data.ret_PESS[final_data.ret_OPT < 0]

In [None]:
# Method 3 (re-investment)

In [None]:
def ret_method_3(T, i):
    '''
    Given an investment time horizon (in months) and re-investment
    interest rate, calculate the return of each loan
    '''
    
    # Assuming that the total amount paid back was paid at equal
    # intervals during the duration of the loan, calculate the
    # size of each of these installment
    actual_installment = (final_data.total_pymnt - final_data.recoveries) / final_data['loan_length']

    # Assuming the amount is immediately re-invested at the prime
    # rate, find the total amount of money we'll have by the end
    # of the loan
    cash_by_end_of_loan = actual_installment * (1 - pow(1 + i, final_data.loan_length)) / ( 1 - (1 + i) )
    
    cash_by_end_of_loan = cash_by_end_of_loan + final_data.recoveries
    
    # Assuming that cash is then re-invested at the prime rate,
    # with monthly re-investment, until T months from the start
    # of the loan
    remaining_months = T - final_data['loan_length']
    final_return = cash_by_end_of_loan * pow(1 + i, remaining_months)

    # Find the percentage return
    return( (12/T) * ( ( final_return - final_data['funded_amnt'] ) / final_data['funded_amnt'] ) )


In [None]:
## you will need to provide the appropriate parameters to the function call
final_data['ret_INTa'] = ret_method_3(?, ?)
final_data['ret_INTb'] = ret_method_3(?, ?)
final_data['ret_INTc'] = ret_method_3(?, ?)

In [None]:
# Question 3

In [None]:
# a. How many loans are in each grade? 
# b. What is the default rate in each grade?  
# c. What is the average interest rate in each grade?  
# d. What is the average percentage (annual) return for each grade using each of the return calculations? 