In [219]:
import pandas as pd
from io import BytesIO
import boto3
import multiprocessing as mp

def load_data_from_s3(filename, format='csv'):
    """Download one object from the 'loan-analysis-data' S3 bucket as a DataFrame.

    Parameters
    ----------
    filename : str
        Key of the object inside the bucket.
    format : str, default 'csv'
        How to parse the downloaded bytes: 'csv' uses ``pd.read_csv`` and
        'pkl.bz2' uses ``pd.read_pickle`` with bz2 compression.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the object.

    Raises
    ------
    ValueError
        If ``format`` is not one of the supported values.
    """
    supported_formats = ('csv', 'pkl.bz2')
    # Validate before touching the network: previously an unknown format
    # downloaded the whole object and then failed with UnboundLocalError.
    if format not in supported_formats:
        raise ValueError(
            "Unsupported format %r; expected one of %r" % (format, supported_formats)
        )

    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket='loan-analysis-data', Key=filename)
    f = BytesIO(obj['Body'].read())

    if format == 'csv':
        return pd.read_csv(f, low_memory=False)

    # NOTE(review): unpickling can execute arbitrary code; only use this
    # branch for objects written by a trusted producer.
    return pd.read_pickle(f, compression='bz2')

In [220]:
# Load the in-sample predictions and the cleaned payment history from S3.
# Both objects are bz2-compressed pickles; the commented-out lines are
# alternative loads that are currently disabled.
#df_payments = load_data_from_s3('cleaned_payments_data.pkl.bz2', format='pkl.bz2')
df_predictions_in_sample = load_data_from_s3('predictions_in_sample.pkl.bz2', format='pkl.bz2')
payments = load_data_from_s3('cleaned_payments_data.pkl.bz2', format='pkl.bz2')
#df_predictions_out_of_sample = load_data_from_s3('predictions_out_of_sample.pkl.bz2', format='pkl.bz2')

In [221]:
df_predictions_in_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 391542 entries, 0 to 420479
Data columns (total 4 columns):
id               391542 non-null object
issue_d          391542 non-null datetime64[ns]
loan_amnt        391542 non-null float32
predicted_roi    391542 non-null float32
dtypes: datetime64[ns](1), float32(2), object(1)
memory usage: 11.9+ MB


In [222]:
payments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36168827 entries, 0 to 37323066
Data columns (total 5 columns):
LOAN_ID                      int64
RECEIVED_D                   datetime64[ns]
RECEIVED_AMT_INVESTORS       float32
PBAL_END_PERIOD_INVESTORS    float32
mths_since_issue             uint8
dtypes: datetime64[ns](1), float32(2), int64(1), uint8(1)
memory usage: 1.1 GB


In [223]:
# Distinct loan ids present in the in-sample predictions; used below to
# filter the payment history down to the same loans.
loan_ids = df_predictions_in_sample.id.unique()
len(loan_ids)

391542

In [224]:
def get_relevant_payments(all_payments, loan_ids_from_training_set):
    """Restrict a payments frame to the given loans and the columns used for ROI.

    Parameters
    ----------
    all_payments : pandas.DataFrame
        Payment records; must contain the columns 'LOAN_ID',
        'RECEIVED_AMT_INVESTORS' and 'mths_since_issue'.
    loan_ids_from_training_set : iterable
        Loan ids to keep (anything accepted by ``Series.isin``).

    Returns
    -------
    pandas.DataFrame
        Rows of ``all_payments`` whose 'LOAN_ID' is in
        ``loan_ids_from_training_set``, limited to the three columns above.
    """
    wanted_columns = ['LOAN_ID', 'RECEIVED_AMT_INVESTORS', 'mths_since_issue']
    # Use the arguments rather than the notebook globals `payments` and
    # `loan_ids`, so the function works on whatever the caller passes in.
    in_training_set = all_payments['LOAN_ID'].isin(loan_ids_from_training_set)
    return all_payments.loc[in_training_set, wanted_columns]

In [225]:
# Keep only the payment rows for loans in `loan_ids`, and only the columns
# needed for the ROI calculation.
# NOTE(review): this rebinds `payments`, so the full payment history loaded
# earlier is no longer reachable under this name after the cell runs.
payments = get_relevant_payments(payments, loan_ids)
payments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8282827 entries, 0 to 20318479
Data columns (total 3 columns):
LOAN_ID                   int64
RECEIVED_AMT_INVESTORS    float32
mths_since_issue          uint8
dtypes: float32(1), int64(1), uint8(1)
memory usage: 165.9 MB


In [226]:
payments.head(5)

Unnamed: 0,LOAN_ID,RECEIVED_AMT_INVESTORS,mths_since_issue
0,54734,632.770996,1
1,54734,632.770996,2
2,54734,632.770996,3
3,54734,632.770996,4
4,54734,632.770996,5


In [227]:
# Downcast loan_amnt from float32 to uint16 to save memory.
# NOTE(review): uint16 tops out at 65535 and drops any fractional part —
# assumes every loan amount is a whole number within that range; TODO confirm.
df_predictions_in_sample['loan_amnt'] =  df_predictions_in_sample['loan_amnt'].astype('uint16')
df_predictions_in_sample.head(5)

Unnamed: 0,id,issue_d,loan_amnt,predicted_roi
0,1077501,2011-12-01,5000,-2.406683
1,1077175,2011-12-01,2400,-6.132463
2,1076863,2011-12-01,10000,-0.148904
3,1075269,2011-12-01,5000,-1.621698
4,1072053,2011-12-01,3000,-1.602118


In [228]:
# `id` is loaded as object dtype; convert it to int64 so it has the same
# dtype as payments['LOAN_ID'].
df_predictions_in_sample['id'] = df_predictions_in_sample['id'].astype('int64')

In [229]:
# Index the predictions by loan id so rows can be looked up with .loc[loan_id].
# NOTE(review): because this mutates in place, re-running the cell fails with
# a KeyError (the 'id' column no longer exists once it has become the index).
df_predictions_in_sample.set_index('id', inplace=True)

In [230]:
df_predictions_in_sample.head()

Unnamed: 0_level_0,issue_d,loan_amnt,predicted_roi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1077501,2011-12-01,5000,-2.406683
1077175,2011-12-01,2400,-6.132463
1076863,2011-12-01,10000,-0.148904
1075269,2011-12-01,5000,-1.621698
1072053,2011-12-01,3000,-1.602118


In [231]:
# Single-column frame of loan amounts, indexed by loan id.
loans = df_predictions_in_sample[['loan_amnt']]

In [232]:
loans.head()

Unnamed: 0_level_0,loan_amnt
id,Unnamed: 1_level_1
1077501,5000
1077175,2400
1076863,10000
1075269,5000
1072053,3000


In [234]:
one_loan = get_one_loan_payment_data(payments, 10149342)
one_loan.head()

Unnamed: 0,RECEIVED_AMT_INVESTORS,mths_since_issue
6574381,885.460022,2
6574382,885.460022,3
6574383,885.460022,4
6574384,885.460022,5
6574385,885.460022,6


In [236]:
get_roi_for_loan_id(10149342)

10.975341796875

In [240]:
# Worker pool with 8 processes for computing per-loan ROI in parallel.
# NOTE(review): no pool.close()/pool.join() appears in the visible cells —
# confirm the pool is shut down somewhere.
pool = mp.Pool(processes=8)

In [217]:
'10149342' in loans[29040:29050].index

True

In [181]:
loans[:1].index

Index([1077501], dtype='object', name='id')

In [171]:
problem_index = 29043

In [203]:
get_roi_for_loan_id('90395')

-84.5694061279297

In [210]:
payments[payments['LOAN_ID'] == 10149342]

Unnamed: 0,LOAN_ID,RECEIVED_AMT_INVESTORS,mths_since_issue
6574381,10149342,885.460022,2
6574382,10149342,885.460022,3
6574383,10149342,885.460022,4
6574384,10149342,885.460022,5
6574385,10149342,885.460022,6
6574386,10149342,885.460022,7
6574387,10149342,885.460022,8
6574388,10149342,885.460022,9
6574389,10149342,885.460022,10
6574390,10149342,885.460022,11


In [243]:
# Compute the ROI of the first 35,000 loans using the worker pool; results
# come back in the same order as the ids passed in.
# get_roi_for_loan_id reads the module-level `loans` and `payments`, so the
# workers need access to those globals.
results = pool.map(get_roi_for_loan_id, loans[:35000].index)
len(results)

35000

In [244]:
get_rois_for_loans(loans[:10].index)

{1077501: 10.289306640625002,
 1077175: 16.248779296875004,
 1076863: 13.707275390625002,
 1075269: 7.783084106445312,
 1072053: 19.114990234375,
 1069908: 12.606201171874998,
 1064687: -97.88431701660156,
 1069866: 9.896029663085937,
 1069057: -30.444461059570312,
 1069759: 16.585693359375004}

In [245]:
dict(zip(loans[:10].index, results))

{1077501: 10.289306640625002,
 1077175: 16.248779296875004,
 1076863: 13.707275390625002,
 1075269: 7.783084106445312,
 1072053: 19.114990234375,
 1069908: 12.606201171874998,
 1064687: -97.88431701660156,
 1069866: 9.896029663085937,
 1069057: -30.444461059570312,
 1069759: 16.585693359375004}

In [237]:
# `payments` is the DataFrame holding the payment records of the loans; it is
# expected to be loaded already by an earlier cell (the original load is kept
# below, commented out).
#payments = load_data_from_s3('cleaned_payments_data.pkl.bz2', format='pkl.bz2')
# Loan amount per loan id; read as a module-level global by get_roi_for_loan_id.
loans = df_predictions_in_sample[['loan_amnt']]

def get_one_loan_payment_data(payments, loan_id):
    """Return the payment amounts and payment months recorded for one loan.

    Parameters
    ----------
    payments : pandas.DataFrame
        Payment records with 'LOAN_ID', 'RECEIVED_AMT_INVESTORS' and
        'mths_since_issue' columns.
    loan_id
        Value compared against the 'LOAN_ID' column.

    Returns
    -------
    pandas.DataFrame
        The 'RECEIVED_AMT_INVESTORS' and 'mths_since_issue' columns of the
        matching rows. An empty DataFrame is returned if the lookup fails
        (for example when an expected column is missing).
    """
    try:
        return payments[payments['LOAN_ID'] == loan_id][['RECEIVED_AMT_INVESTORS', 'mths_since_issue']]
    except Exception:
        # Best-effort fallback kept from the original, but no longer a bare
        # `except:` so KeyboardInterrupt / SystemExit still propagate.
        return pd.DataFrame()
    
def calculative_npv_payments(loans_payments, r_guess):
    """Sum a loan's payments after discounting each one at rate ``r_guess``.

    Each value in ``RECEIVED_AMT_INVESTORS`` is divided by
    ``(1 + r_guess) ** (mths_since_issue / 12)`` and the discounted values
    are added together.

    Parameters
    ----------
    loans_payments : pandas.DataFrame
        Must expose ``RECEIVED_AMT_INVESTORS`` and ``mths_since_issue``.
    r_guess : float
        Discount rate applied per 12 months.

    Returns
    -------
    The sum of the discounted payments.
    """
    years_since_issue = loans_payments.mths_since_issue / 12
    discount_factor = (1 + r_guess) ** years_since_issue
    discounted_amounts = loans_payments.RECEIVED_AMT_INVESTORS / discount_factor
    return sum(discounted_amounts)

def adjust_estimated_roi(roi_guess, roi_min, roi_max, npv):
    """Perform one bisection step on the ROI search interval.

    Parameters
    ----------
    roi_guess : float
        Current rate estimate.
    roi_min, roi_max : float
        Current lower and upper bounds of the search interval.
    npv : float
        Signed error for ``roi_guess``. A positive value moves the guess
        toward ``roi_min``; a negative value moves it toward ``roi_max``.

    Returns
    -------
    tuple
        ``(new_guess, new_min, new_max)``. When ``npv`` is neither positive
        nor negative (exactly zero, or NaN) the inputs are returned
        unchanged as a 3-tuple, so callers can always unpack three values.
    """
    if npv > 0:
        # Guess too high: bisect the lower half, the guess becomes the new max.
        return ((roi_guess + roi_min) / 2, roi_min, roi_guess)
    if npv < 0:
        # Guess too low: bisect the upper half, the guess becomes the new min.
        return ((roi_guess + roi_max) / 2, roi_guess, roi_max)
    # Previously this branch returned the bare `roi_guess`, which broke
    # callers that unpack three values.
    return (roi_guess, roi_min, roi_max)

def get_roi_for_loan_id(loan_id):
    """Estimate the yearly rate of return of one loan, in percent.

    Looks up the loan amount in the module-level ``loans`` frame and the
    loan's payments in the module-level ``payments`` frame, then bisects for
    the rate at which the discounted payments equal the loan amount.

    Parameters
    ----------
    loan_id : int or str
        Loan identifier; converted with ``int()`` before the lookups.

    Returns
    -------
    float or int
        The rate estimate multiplied by 100, or ``-100`` when no payments
        are found for the loan.

    Raises
    ------
    KeyError
        If ``loan_id`` is not in the index of ``loans``.
    """
    loan_id = int(loan_id)
    loan_size = loans['loan_amnt'].loc[loan_id]
    loan_payments = get_one_loan_payment_data(payments, loan_id)

    # No recorded payments: treat the loan as a total loss.
    if loan_payments.empty:
        return -100

    # Initial guess and search interval for the rate (as fractions, not %).
    r_guess = .10
    r_min = -.999
    r_max = .50

    # Fixed number of bisection steps; the interval halves on every step.
    for _ in range(15):
        npv_payments = calculative_npv_payments(loan_payments, r_guess)
        npv = loan_size - npv_payments
        if npv == 0:
            # Exact root found: stop here instead of asking for another
            # adjustment (there is nothing left to refine).
            break
        r_guess, r_min, r_max = adjust_estimated_roi(r_guess, r_min, r_max, npv)
    return r_guess*100

def get_rois_for_loans(loan_ids):
    """Map every loan id in ``loan_ids`` to its ROI from ``get_roi_for_loan_id``.

    Parameters
    ----------
    loan_ids : iterable
        Loan identifiers to evaluate, processed one after another.

    Returns
    -------
    dict
        ``{loan_id: roi}`` for each id in ``loan_ids``.
    """
    roi_by_loan = {}
    for current_id in loan_ids:
        roi_by_loan[current_id] = get_roi_for_loan_id(current_id)
    return roi_by_loan