# Part 3

### ALWAYS run this code below first 

In [2]:
import pandas as pd
import os
from sys import platform
import matplotlib.pyplot as plt
import datetime
import numpy as np
import pickle
import seaborn

# Helper functions
def is_integer(x):
   
    try:
        return (int(x) == float(x))
    except:
        return False

def ingest_files(directory: str):
    
    
    # If the directory has no trailing slash, add one
    if directory[-1] != "/":
        directory = directory + "/"
    
    all_files = os.listdir(directory)
    output = {}
    
    print("Directory " + directory + " has " + str(len(all_files)) + " files:")
    for i in all_files:
        print(" Reading file " + i)
        output[i] = pd.read_csv(directory + i, dtype = str, skiprows = 1)
        
        # To remove those lines, find any lines with non-integer IDs
        # and remove them
        invalid_rows = (output[i].id.apply( lambda x : is_integer(x) == False ))
        if invalid_rows.sum() > 0:
            print(" Found " + str(invalid_rows.sum()) + " invalid rows which were removed")
            output[i] = output[i][invalid_rows == False]
    
    return output

def clean_perc(x):
    if pd.isnull(x):
        return np.nan
    else:
        return float(x.strip()[:-1])

def clean_date(x):
    if pd.isnull(x):
        return None
    else:
        return datetime.datetime.strptime( x, "%b-%Y").date()

def visualize_columns(data, float_cols, cat_cols, date_cols):
    '''
    This function visualizes all columns
      - Box-and-whisker plots for continuous variables
      - Lists of distinct values for categorical columns
      - A timeline density for dates
    '''
    
    # FLoat columns
    for i in float_cols:
        seaborn.boxplot(data[i])

        # Print the three highest values
        highest_vals = sorted(data[i], reverse=True)[:3]
        smallest_val = min(data[i])
        plt.text(smallest_val, -0.3, highest_vals[0])
        plt.text(smallest_val, -0.2, highest_vals[1])
        plt.text(smallest_val, -0.1, highest_vals[2])

        plt.show()
        
    # Categorical columns 
    for i in cat_cols:
        print(i)
        print(str(len(set(data[i]))) + " distinct values")
        print(data[i].value_counts())
        print("")
        print("")
    
    # Date columns
    for i in date_cols:
        data[data[i].isnull() == False][i].apply(lambda x : str(x.year) +
                                                "-" + str(x.month)).value_counts(ascending = True).plot()
        plt.title(i + " (" + str(data[i].isnull().sum()) + " null values)")
        plt.show()

# Identify the columns we'll be keeping from the dataset
cols_to_pick = ["id",
                "loan_amnt",
                "funded_amnt",
                "revol_util",
                "revol_bal",
                "fico_range_low",
                "fico_range_high",
                "pub_rec",
                "open_acc",
                "earliest_cr_line",
                "delinq_2yrs",
                "dti",
                "purpose",
                "issue_d",
                "annual_inc",
                "home_ownership",
                "emp_length",
                "grade",
                "term",
                "int_rate",
                "installment",
                "verification_status",
                "recoveries",
                "loan_status",
                "last_pymnt_d",
                "total_pymnt"]

# Identify the type of each of these column

float_cols = ["loan_amnt",
                "funded_amnt",
                "revol_bal",
                "fico_range_low",
                "fico_range_high",
                "pub_rec",
                "open_acc",
                "delinq_2yrs",
                "dti",
                "annual_inc",
                "installment",
                "recoveries",
                "total_pymnt"]

cat_cols = ["purpose",
            "home_ownership",
            "emp_length",
            "grade",
            "term",
            "verification_status",
            "loan_status"]

perc_cols = ["revol_util",
             "int_rate"]

date_cols = ["earliest_cr_line",
             "issue_d",
             "last_pymnt_d"]


# Ensure that we have types for every column
assert set(cols_to_pick) - set(float_cols) - set(cat_cols) - set(perc_cols) - set(date_cols) == set(["id"])

# Define the names of the four returns we'll be calculating
ret_cols = ["ret_PESS", "ret_OPT", "ret_INTa", "ret_INTb", "ret_INTc"]

# Some of the columns selected will not be used directly in the model, but will be used to generate other features.
# Create variables specifying the features that will be used

# All categorical columns other than "loan_status" will be used as
# discrete features
discrete_features = list(set(cat_cols) - set(["loan_status"]))

# All numeric columns will be used as continuous features
continuous_features = list(float_cols + perc_cols)

# Question 1

In [3]:
print("Reading in pickle file...")

final_data, discrete_features, continuous_features, ret_cols = pickle.load( open( "Data/clean_data.pickle", "rb" ) )

final_data['loan_length'] = (final_data.last_pymnt_d - final_data.issue_d) / np.timedelta64(1, 'M')
n_rows = len(final_data)
final_data = final_data[final_data.loan_length != 0]
print("Removed " + str(n_rows - len(final_data)) + " rows")


Reading in pickle file...
Removed 0 rows


In [13]:
final_data['total'] = ((final_data.installment * round(final_data.loan_length)))
final_data.loc[(final_data["recoveries"] != 0),:].loc[:,['loan_status', 'funded_amnt', 'term', 'total', 'total_pymnt', 'recoveries', 'int_rate','loan_length']]


Unnamed: 0,loan_status,funded_amnt,term,total,total_pymnt,recoveries,int_rate,loan_length
2,Charged Off,12000.0,60 months,8540.16,8448.90,443.49,11.99,32.033512
9,Charged Off,9000.0,36 months,3816.67,4365.68,853.35,10.75,13.010534
12,Charged Off,20000.0,36 months,7649.88,8797.98,1805.95,9.16,11.992033
15,Charged Off,9600.0,36 months,12409.66,11062.23,250.00,13.67,37.980246
18,Charged Off,21000.0,36 months,13573.03,16700.62,3873.86,13.67,19.022978
...,...,...,...,...,...,...,...,...
765855,Charged Off,4000.0,36 months,150.48,724.20,563.36,20.89,1.018501
765863,Charged Off,10000.0,60 months,264.50,1668.07,1414.64,19.92,1.018501
765931,Charged Off,23875.0,36 months,2271.33,2954.11,25.00,8.81,2.956940
766250,Charged Off,36825.0,60 months,801.04,9991.23,9212.74,11.02,0.985647


# Question 2

### Return Method 1 (pessimistic)

In [18]:
# Calculate the return using a simple annualized profit margin
# Pessimistic fefinition (method 2)

final_data['term_num'] = final_data.term.str.extract('(\d+)',expand=False).astype(int)
final_data['ret_PESS'] = ( (final_data.total_pymnt - final_data.funded_amnt) 
                                            / final_data.funded_amnt ) * (12 / final_data['term_num'])

### Return Method 2 (optimistic)

In [19]:
# Assuming that if a loan gives a positive return, we can
# immediately find a similar loan to invest in; if the loan
# takes a loss, we use method 2 to compute the return

final_data['ret_OPT'] = ( (final_data.total_pymnt - final_data.funded_amnt)
                                            / final_data.funded_amnt ) * (12 / final_data['loan_length'])
final_data.loc[final_data.ret_OPT < 0,'ret_OPT'] = final_data.ret_PESS[final_data.ret_OPT < 0]

### Method 3 (re-investment)

In [21]:
def ret_method_3(T, i):
    '''
    Given an investment time horizon (in months) and re-investment
    interest rate, calculate the return of each loan
    '''
    
    # Assuming that the total amount paid back was paid at equal
    # intervals during the duration of the loan, calculate the
    # size of each of these installment
    actual_installment = (final_data.total_pymnt - final_data.recoveries) / final_data['loan_length']

    # Assuming the amount is immediately re-invested at the prime
    # rate, find the total amount of money we'll have by the end
    # of the loan
    cash_by_end_of_loan = actual_installment * (1 - pow(1 + i, final_data.loan_length)) / ( 1 - (1 + i) )
    
    cash_by_end_of_loan = cash_by_end_of_loan + final_data.recoveries
    
    # Assuming that cash is then re-invested at the prime rate,
    # with monthly re-investment, until T months from the start
    # of the loan
    remaining_months = T - final_data['loan_length']
    final_return = cash_by_end_of_loan * pow(1 + i, remaining_months)

    # Find the percentage return
    return( (12/T) * ( ( final_return - final_data['funded_amnt'] ) / final_data['funded_amnt'] ) )


# T should be term here in months. We can get this value from term_num which was calculated earlier
final_data['term_num']

final_data['ret_INTa'] = ret_method_3(final_data['term_num'], 1.2/100) # 1.2%
final_data['ret_INTb'] = ret_method_3(final_data['term_num'], 3/100) # 3%
## final_data['ret_INTc'] = ret_method_3(?, ?) # We don't need this I don't think

### Mean and Median Results

In [47]:
def print_median_mean(data, value):
    mean = data[value].mean()*100;
    median = data[value].median()*100
    print("\tMean: " + str(mean))
    print("\tMedian: " + str(median))


print("Method 1 Pesimistic results:")
print_median_mean(final_data, 'ret_PESS')
    
print("Method 2 Optimisic results:")
print_median_mean(final_data, 'ret_OPT')

print("Method 3 (1.2%):")
print_median_mean(final_data, 'ret_INTa')

print("Method 3 (3%):")
print_median_mean(final_data, 'ret_INTb')

Method 1 Pesimistic results:
	Mean: 0.0415733204919579
	Median: 2.7863924675738607
Method 2 Optimisic results:
	Mean: 4.9245445703291955
	Median: 7.476474863868679
Method 3 (1.2%)
	Mean: 13.209380349514971
	Median: 17.644040016278385
Method 3 (3%)
	Mean: 46.793771973857005
	Median: 49.54462031819411


# Question 3

In [41]:
perc_by_grade = (final_data.grade.value_counts()*100/len(final_data)).sort_index()
default_by_grade = final_data.groupby("grade").apply(lambda x : (x.loan_status != "Fully Paid").sum()*100/len(x) )
ret_by_grade_OPT = final_data.groupby("grade").apply(lambda x : np.mean(x.ret_OPT)*100 )
ret_by_grade_PESS = final_data.groupby("grade").apply(lambda x : np.mean(x.ret_PESS)*100 )
ret_by_grade_INTa = final_data.groupby("grade").apply(lambda x : np.mean(x.ret_INTa)*100 )
ret_by_grade_INTb = final_data.groupby("grade").apply(lambda x : np.mean(x.ret_INTb)*100 )
# ret_by_grade_INTc = final_data.groupby("grade").apply(lambda x : np.mean(x.ret_INTc)*100 )
int_rate_by_grade = final_data.groupby("grade").apply(lambda x : np.mean(x.int_rate))

combined = pd.DataFrame(perc_by_grade)
combined['default'] = default_by_grade
combined['int_rate'] = int_rate_by_grade
combined['return_OPT'] = ret_by_grade_OPT
combined['return_PESS'] = ret_by_grade_PESS
combined['return_INTa'] = ret_by_grade_INTa
combined['return_INTb'] = ret_by_grade_INTb
# combined['return_INTc'] = ret_by_grade_INTc

combined['count'] = combined['grade']/100 * len(final_data)
#len(final_data)
combined

Unnamed: 0,grade,default,int_rate,return_OPT,return_PESS,return_INTa,return_INTb,count
A,18.878608,6.688156,6.981119,3.850003,1.036243,13.964298,44.180124,146737.0
B,29.852586,14.920227,10.477369,4.647929,0.764194,13.727066,45.465953,232034.0
C,29.545098,24.60417,14.113935,5.069478,-0.179073,13.113111,47.799174,229644.0
D,14.000484,33.379587,18.758675,5.754727,-1.230543,12.113887,48.07786,108821.0
E,5.363158,41.582306,23.610692,6.634144,-1.741258,11.986214,51.4569,41686.0
F,1.727337,50.357515,27.4345,6.456103,-2.456857,11.37131,53.871506,13426.0
G,0.632731,54.005693,29.965842,6.227061,-3.346234,10.38143,53.216044,4918.0


In [48]:
from scipy.stats import kurtosis, skew

def print_kurtosis_skewness(data, value):
    k = kurtosis(data[value]);
    s = skew(data[value])
    print("\tKurtosis: " + str(k))
    print("\tSkewness: " + str(s))

print('Interest Rate results')
print_kurtosis_skewness(final_data, 'int_rate')

print("Method 1 Pesimistic results:")
print_kurtosis_skewness(final_data, 'ret_PESS')

print("Method 2 Optimisic results:")
print_kurtosis_skewness(final_data, 'ret_OPT')

print('Method 3 (1.2%)')
print_kurtosis_skewness(final_data, 'ret_INTa')

print('Method 3 results (3%)')
print_kurtosis_skewness(final_data, 'ret_INTb')

Interest Rate results
	Kurtosis: 0.7923193787702627
	Skewness: 0.9274066932504201
Method 1 Pesimistic results:
	Kurtosis: 2.1895256579179883
	Skewness: -1.7151026655056627
Method 2 Optimisic results:
	Kurtosis: 1.1258894162746378
	Skewness: -1.0197136575455488
Method 3 (1.2%)
	Kurtosis: 1.836033439040972
	Skewness: -1.6591283502486942
Method 3 results (3%)
	Kurtosis: 0.24431715900694284
	Skewness: -0.25928553401635884
