In [1]:
import pandas as pd
from io import BytesIO
import boto3
import multiprocessing as mp
import pickle

def load_data_from_s3(filename, format='csv'):
    """Download one object from the ``loan-analysis-data`` S3 bucket as a DataFrame.

    Parameters
    ----------
    filename : str
        Key of the object inside the bucket.
    format : {'csv', 'pkl.bz2'}, default 'csv'
        How the downloaded bytes are decoded: ``'csv'`` goes through
        ``pd.read_csv`` and ``'pkl.bz2'`` through ``pd.read_pickle`` with
        bz2 compression.

    Returns
    -------
    pandas.DataFrame
        The decoded object.

    Raises
    ------
    ValueError
        If ``format`` is not one of the supported values. This is checked
        before anything is downloaded, so a typo does not cost a transfer.
    """
    supported_formats = ('csv', 'pkl.bz2')
    if format not in supported_formats:
        raise ValueError(
            "Unsupported format {!r}; expected one of {}".format(format, supported_formats)
        )

    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket='loan-analysis-data', Key=filename)
    # The whole object body is read into memory before it is parsed.
    buffer = BytesIO(obj['Body'].read())

    if format == 'csv':
        return pd.read_csv(buffer, low_memory=False)
    # SECURITY: unpickling can execute arbitrary code; only load objects
    # from a bucket whose contents are trusted.
    return pd.read_pickle(buffer, compression='bz2')

In [3]:
# Inputs: the cleaned training loans are read from a local bz2 pickle, while the
# payment history is downloaded from S3 through load_data_from_s3.
# NOTE(review): `payments` is rebound to a filtered subset in a later cell, so
# this name only holds the full table until then.
cleaned_training_data = pd.read_pickle('data/cleaned_training_loans.pkl.bz2')
payments = load_data_from_s3('cleaned_payments_data.pkl.bz2', format='pkl.bz2')
# Out-of-sample predictions are not needed in this notebook; load kept disabled.
#df_predictions_out_of_sample = load_data_from_s3('predictions_out_of_sample.pkl.bz2', format='pkl.bz2')

In [4]:
# One-column frame of loan amounts. get_roi_for_loan_id reads it through the
# module-level name `loans` and looks rows up with `.loc[loan_id]`.
# NOTE(review): at this point cleaned_training_data has not been re-indexed by
# loan id yet; `loans` is rebuilt in a later cell after set_index('id'), and
# that later definition is the one the ROI computation actually needs.
loans = cleaned_training_data[['loan_amnt']]

def get_one_loan_payment_data(payments, loan_id):
    """Return the payment rows that belong to a single loan.

    Parameters
    ----------
    payments : pandas.DataFrame
        Payment records; read columns are ``LOAN_ID``,
        ``RECEIVED_AMT_INVESTORS`` and ``mths_since_issue``.
    loan_id : int
        Identifier compared against the ``LOAN_ID`` column.

    Returns
    -------
    pandas.DataFrame
        The ``RECEIVED_AMT_INVESTORS`` and ``mths_since_issue`` columns of the
        matching rows (no rows if the loan has no payments). If ``payments``
        lacks one of the required columns an empty, column-less DataFrame is
        returned instead, which callers detect through ``.empty``.
    """
    wanted_columns = ['RECEIVED_AMT_INVESTORS', 'mths_since_issue']
    try:
        return payments.loc[payments['LOAN_ID'] == loan_id, wanted_columns]
    except KeyError:
        # Only a missing column is treated as "no data". A bare ``except``
        # here would also swallow KeyboardInterrupt/SystemExit and hide
        # genuine programming errors.
        return pd.DataFrame()
    
def calculative_npv_payments(loans_payments, r_guess):
    """Discount a loan's received payments back to the issue date and add them up.

    Each payment is divided by ``(1 + r_guess) ** (months / 12)``, i.e.
    ``r_guess`` is treated as an annual rate and ``mths_since_issue`` as the
    payment's age in months.

    Parameters
    ----------
    loans_payments : pandas.DataFrame
        Must expose ``RECEIVED_AMT_INVESTORS`` and ``mths_since_issue``.
    r_guess : float
        Annual discount rate as a fraction (0.10 means 10%).

    Returns
    -------
    float
        Sum of the discounted payments. NaN if any discounted payment is NaN.
    """
    cash_flows = loans_payments.RECEIVED_AMT_INVESTORS
    years_since_issue = loans_payments.mths_since_issue / 12
    discounted = cash_flows / (1 + r_guess) ** years_since_issue
    # Vectorised reduction instead of the builtin sum(), which walks the Series
    # element by element in Python. skipna=False keeps the NaN propagation the
    # builtin had rather than silently dropping missing payments.
    return discounted.sum(skipna=False)

def adjust_estimated_roi(roi_guess, roi_min, roi_max, npv):
    """Perform one bisection step on the rate-of-return bracket.

    Parameters
    ----------
    roi_guess : float
        Current rate estimate.
    roi_min, roi_max : float
        Current lower and upper bounds of the search bracket.
    npv : float
        Signed error of the current guess. A positive value moves the guess
        towards ``roi_min``; a negative value moves it towards ``roi_max``.

    Returns
    -------
    tuple of (float, float, float)
        ``(new_guess, new_min, new_max)``. When ``npv`` is neither positive
        nor negative (exactly zero, or NaN) the inputs are returned unchanged,
        still as a 3-tuple, so callers can always unpack three values.
    """
    if npv > 0:
        # Guess too high: keep the lower half of the bracket.
        return ((roi_guess + roi_min) / 2, roi_min, roi_guess)
    if npv < 0:
        # Guess too low: keep the upper half of the bracket.
        return ((roi_guess + roi_max) / 2, roi_guess, roi_max)
    # Nothing to adjust. Returning the bare guess here would break tuple
    # unpacking in the caller.
    return (roi_guess, roi_min, roi_max)

def get_roi_for_loan_id(loan_id):
    """Estimate the annualised return of one loan, in percent.

    Runs a fixed number of bisection steps to find the annual rate at which
    the discounted payments equal the loan amount. The loan amount is read
    from the module-level ``loans`` frame and the payment rows from the
    module-level ``payments`` frame.

    Parameters
    ----------
    loan_id : int or int-like
        Loan identifier; coerced with ``int()`` and used as a label in
        ``loans`` and as a ``LOAN_ID`` value in ``payments``.

    Returns
    -------
    float or int
        The rate estimate multiplied by 100. ``-100`` when the loan has no
        payment rows, NaN when the discounted payment total is NaN.

    Raises
    ------
    KeyError
        If ``loan_id`` is not a label in ``loans``.
    """
    loan_id = int(loan_id)
    loan_size = loans['loan_amnt'].loc[loan_id]
    loan_payments = get_one_loan_payment_data(payments, loan_id)

    if loan_payments.empty:
        # No payments received at all: report a total loss.
        return -100

    # Starting guess and search bracket, as annual rates in fractional form.
    r_guess = .10
    r_min = -.999
    r_max = .50

    for _ in range(15):
        npv_payments = calculative_npv_payments(loan_payments, r_guess)
        npv = loan_size - npv_payments
        if pd.isna(npv):
            # A NaN total cannot steer the bisection; surface it instead of
            # reporting the untouched starting guess.
            return float('nan')
        if npv == 0:
            # Exact root found; further halving would not change the answer.
            break
        r_guess, r_min, r_max = adjust_estimated_roi(r_guess, r_min, r_max, npv)
    return r_guess*100

def get_rois_for_loans(loan_ids):
    """Map every loan id in ``loan_ids`` to the value of get_roi_for_loan_id.

    Parameters
    ----------
    loan_ids : iterable
        Loan identifiers; each one is used as a dictionary key as given.

    Returns
    -------
    dict
        ``{loan_id: roi}`` with one entry per distinct id.
    """
    rois_by_loan = {}
    for current_id in loan_ids:
        rois_by_loan[current_id] = get_roi_for_loan_id(current_id)
    return rois_by_loan

In [5]:
# Summarise the training frame: row count, column count, dtypes, memory use.
cleaned_training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 529507 entries, 0 to 618014
Columns: 217 entries, id to purpose_wedding
dtypes: datetime64[ns](1), float32(69), int64(78), object(1), uint16(1), uint8(67)
memory usage: 501.4+ MB


In [6]:
# Summarise the full payments table before it is filtered to the training loans.
payments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36168827 entries, 0 to 37323066
Data columns (total 5 columns):
LOAN_ID                      int64
RECEIVED_D                   datetime64[ns]
RECEIVED_AMT_INVESTORS       float32
PBAL_END_PERIOD_INVESTORS    float32
mths_since_issue             uint8
dtypes: datetime64[ns](1), float32(2), int64(1), uint8(1)
memory usage: 1.1 GB


In [8]:
# Distinct loan ids in the training set; used below to filter the payments table.
# This reads the `id` column, so it has to run before the cell that moves `id`
# into the index of cleaned_training_data.
loan_ids = cleaned_training_data.id.unique()
len(loan_ids)

529507

In [9]:
def get_relevant_payments(all_payments, loan_ids_from_training_set):
    """Keep only the payment rows whose loan is in the training set.

    Parameters
    ----------
    all_payments : pandas.DataFrame
        Payment records with at least the columns ``LOAN_ID``,
        ``RECEIVED_AMT_INVESTORS`` and ``mths_since_issue``.
    loan_ids_from_training_set : list-like
        Loan identifiers to keep.

    Returns
    -------
    pandas.DataFrame
        The three columns above, restricted to rows whose ``LOAN_ID`` is in
        ``loan_ids_from_training_set``.
    """
    kept_columns = ['LOAN_ID', 'RECEIVED_AMT_INVESTORS', 'mths_since_issue']
    # Filter the arguments that were passed in, not the notebook globals of
    # the same meaning, so the function works on any frame / id list.
    in_training_set = all_payments['LOAN_ID'].isin(loan_ids_from_training_set)
    return all_payments.loc[in_training_set, kept_columns]

In [10]:
# Rebinds `payments` to the filtered three-column subset. The full table loaded
# earlier is no longer reachable under this name afterwards.
payments = get_relevant_payments(payments, loan_ids)
payments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12882535 entries, 0 to 20318479
Data columns (total 3 columns):
LOAN_ID                   int64
RECEIVED_AMT_INVESTORS    float32
mths_since_issue          uint8
dtypes: float32(1), int64(1), uint8(1)
memory usage: 258.0 MB


In [11]:
# Peek at the first rows of the filtered payments table.
payments.head(5)

Unnamed: 0,LOAN_ID,RECEIVED_AMT_INVESTORS,mths_since_issue
0,54734,632.770996,1
1,54734,632.770996,2
2,54734,632.770996,3
3,54734,632.770996,4
4,54734,632.770996,5


In [12]:
# Peek at the training loans before `id` is moved into the index.
cleaned_training_data.head()

Unnamed: 0,id,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
0,1077501,5000,10.65,162.869995,10.0,24000.0,2011-12-01,27.65,0,735.0,...,0,0,0,0,0,0,0,0,0,0
1,1077175,2400,15.96,84.330002,10.0,12252.0,2011-12-01,8.72,0,735.0,...,0,0,0,0,0,0,0,1,0,0
2,1076863,10000,13.49,339.309998,10.0,49200.0,2011-12-01,20.0,0,690.0,...,0,0,0,0,0,1,0,0,0,0
3,1075269,5000,7.9,156.460007,3.0,36000.0,2011-12-01,11.2,0,730.0,...,0,0,0,0,0,0,0,0,0,1
4,1072053,3000,18.639999,109.43,9.0,48000.0,2011-12-01,5.35,0,660.0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Downcast the loan amount and index the training frame by loan id so that
# later lookups can use `.loc[loan_id]`.
# NOTE(review): uint16 tops out at 65535 — assumes no loan amount exceeds that.
cleaned_training_data['loan_amnt'] = cleaned_training_data['loan_amnt'].astype('uint16')
# Once `id` has been moved into the index the column no longer exists, so an
# unguarded second run of this cell would fail with KeyError: 'id'. The guard
# makes the cell safe to re-run.
if 'id' in cleaned_training_data.columns:
    cleaned_training_data['id'] = cleaned_training_data['id'].astype('int64')
    cleaned_training_data.set_index('id', inplace=True)
cleaned_training_data.head()

Unnamed: 0_level_0,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,fico_range_high,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1077501,5000,10.65,162.869995,10.0,24000.0,2011-12-01,27.65,0,735.0,739.0,...,0,0,0,0,0,0,0,0,0,0
1077175,2400,15.96,84.330002,10.0,12252.0,2011-12-01,8.72,0,735.0,739.0,...,0,0,0,0,0,0,0,1,0,0
1076863,10000,13.49,339.309998,10.0,49200.0,2011-12-01,20.0,0,690.0,694.0,...,0,0,0,0,0,1,0,0,0,0
1075269,5000,7.9,156.460007,3.0,36000.0,2011-12-01,11.2,0,730.0,734.0,...,0,0,0,0,0,0,0,0,0,1
1072053,3000,18.639999,109.43,9.0,48000.0,2011-12-01,5.35,0,660.0,664.0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Rebuild `loans` now that cleaned_training_data is indexed by loan id, so that
# get_roi_for_loan_id can look a loan amount up with `.loc[loan_id]`.
loans = cleaned_training_data[['loan_amnt']]

In [15]:
# Confirm that `loans` is keyed by loan id and holds only `loan_amnt`.
loans.head()

Unnamed: 0_level_0,loan_amnt
id,Unnamed: 1_level_1
1077501,5000
1077175,2400
1076863,10000
1075269,5000
1072053,3000


In [20]:
# Worker pool for the per-loan ROI computation.
# NOTE(review): the worker count is hard-coded to 8, and no cell shown here
# closes or joins the pool.
# NOTE(review): get_roi_for_loan_id reads the module-level `loans` and
# `payments`; presumably the workers inherit them when the pool is created
# (fork start method), so this cell must run after both are final — confirm.
pool = mp.Pool(processes=8)

In [21]:
# Compute one ROI per loan id in parallel; pool.map returns the results in the
# same order as loans.index, which the next cell relies on when zipping them.
results = pool.map(get_roi_for_loan_id, loans.index)
len(results)

529507

In [22]:
# Pair every loan id with its computed ROI (both sequences share the order of
# loans.index) and persist the mapping as a pickle for later notebooks.
loan_rois = {loan_id: roi for loan_id, roi in zip(loans.index, results)}

with open('data/loan_rois.pickle', 'wb') as handle:
    pickle.dump(
        loan_rois,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )