In [1]:
import pandas as pd
from io import BytesIO
import boto3

def load_data_from_s3(filename, format='csv'):
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket='loan-analysis-data', Key=filename)
    data = obj['Body'].read()
    f = BytesIO(data)
    if format=='csv':
        df = pd.read_csv(f, low_memory=False)
    if format=='pkl.bz2':
        df = pd.read_pickle(f, compression='bz2')
    return df

df = load_data_from_s3('predictions_in_sample.pkl.bz2', format='pkl.bz2')

In [2]:
df.issue_d.dt.year.value_counts()

2014    129394
2013    100342
2015     89344
2012     43419
2011     14077
2010      8450
2009      4704
2008      1561
2007       251
Name: issue_d, dtype: int64

In [3]:
df_payments = load_data_from_s3('cleaned_payments_data.pkl.bz2', format='pkl.bz2')
df_predictions_in_sample = load_data_from_s3('predictions_in_sample.pkl.bz2', format='pkl.bz2')
df_predictions_out_of_sample = load_data_from_s3('predictions_out_of_sample.pkl.bz2', format='pkl.bz2')

In [4]:
df_payments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36168827 entries, 0 to 37323066
Data columns (total 5 columns):
LOAN_ID                      int64
RECEIVED_D                   datetime64[ns]
RECEIVED_AMT_INVESTORS       float32
PBAL_END_PERIOD_INVESTORS    float32
mths_since_issue             uint8
dtypes: datetime64[ns](1), float32(2), int64(1), uint8(1)
memory usage: 1.1 GB


In [5]:
df_payments.head()

Unnamed: 0,LOAN_ID,RECEIVED_D,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue
0,54734,2009-09-01,632.770996,18636.410156,1
1,54734,2009-10-01,632.770996,18188.363281,2
2,54734,2009-11-01,632.770996,17735.876953,3
3,54734,2009-12-01,632.770996,17278.90625,4
4,54734,2010-01-01,632.770996,16817.404297,5


In [6]:
df_payments.LOAN_ID.nunique()

2000997

In [7]:
# Objective is to calculate ROI for a loan.

def get_one_loan_payment_data(payments, loan_id):
    return payments[payments['LOAN_ID'] == loan_id]

In [8]:
df = get_one_loan_payment_data(df_payments, 54734)
df

Unnamed: 0,LOAN_ID,RECEIVED_D,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue
0,54734,2009-09-01,632.770996,18636.410156,1
1,54734,2009-10-01,632.770996,18188.363281,2
2,54734,2009-11-01,632.770996,17735.876953,3
3,54734,2009-12-01,632.770996,17278.90625,4
4,54734,2010-01-01,632.770996,16817.404297,5
5,54734,2010-02-01,632.770996,16351.329102,6
6,54734,2010-03-01,632.770996,15880.633789,7
7,54734,2010-04-01,632.770996,15405.273438,8
8,54734,2010-05-01,632.770996,14925.200195,9
9,54734,2010-06-01,632.770996,14440.369141,10


In [9]:
r_guess = .13
r_min = -.999
r_max = .50
initial_investment = 19000

def calculative_npv_payments(payments_row, month_row, r_guess):
    return sum(payments_row/(1+r_guess)**(month_row/12))

In [10]:
calculative_npv_payments(df['RECEIVED_AMT_INVESTORS'], df['mths_since_issue'], r_guess)

19000.239430725473

In [11]:
df['npv_payment'] = df['RECEIVED_AMT_INVESTORS'] / (1+r_guess)**(df['mths_since_issue']/12)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,LOAN_ID,RECEIVED_D,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue,npv_payment
0,54734,2009-09-01,632.770996,18636.410156,1,626.359056
1,54734,2009-10-01,632.770996,18188.363281,2,620.012089
2,54734,2009-11-01,632.770996,17735.876953,3,613.729436
3,54734,2009-12-01,632.770996,17278.90625,4,607.510446
4,54734,2010-01-01,632.770996,16817.404297,5,601.354474
5,54734,2010-02-01,632.770996,16351.329102,6,595.260881
6,54734,2010-03-01,632.770996,15880.633789,7,589.229035
7,54734,2010-04-01,632.770996,15405.273438,8,583.25831
8,54734,2010-05-01,632.770996,14925.200195,9,577.348088
9,54734,2010-06-01,632.770996,14440.369141,10,571.497754


In [12]:
df['npv_payment'].sum()

19000.23943072547

In [13]:
# Value < 0 means r_estimate is too low
# Value > 0 means r_estimate is too high
npv = initial_investment - df['npv_payment'].sum()
npv

-0.23943072546899202

In [14]:
def adjust_estimated_roi(roi_guess, roi_min, roi_max, npv):
    if npv > 0:
        new_guess = (roi_guess + roi_min)/2
        new_min = roi_min
        new_max = roi_guess
    elif npv < 0:
        new_guess = (roi_guess + roi_max)/2
        new_min = roi_guess
        new_max = roi_max
    else:
        return roi_guess
    
    return (new_guess, new_min, new_max)

In [15]:
adjust_estimated_roi(.07, r_min, r_max, npv)

(0.28500000000000003, 0.07, 0.5)

In [16]:
r_guess = .07
r_min = -.999
r_max = .60
initial_investment = 19000
payments = df

for _ in range(15):
    npv_payments = calculative_npv_payments(payments['RECEIVED_AMT_INVESTORS'], payments['mths_since_issue'], r_guess)
    total_npv = initial_investment - npv_payments
    r_guess, r_min, r_max = adjust_estimated_roi(r_guess, r_min, r_max, total_npv)
print(r_guess)

0.13002288818359373


In [17]:
payments.head()

Unnamed: 0,LOAN_ID,RECEIVED_D,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue,npv_payment
0,54734,2009-09-01,632.770996,18636.410156,1,626.359056
1,54734,2009-10-01,632.770996,18188.363281,2,620.012089
2,54734,2009-11-01,632.770996,17735.876953,3,613.729436
3,54734,2009-12-01,632.770996,17278.90625,4,607.510446
4,54734,2010-01-01,632.770996,16817.404297,5,601.354474


In [18]:
df_predictions_in_sample.head(10)

Unnamed: 0,id,issue_d,loan_amnt,predicted_roi
0,1077501,2011-12-01,5000.0,-2.406683
1,1077175,2011-12-01,2400.0,-6.132463
2,1076863,2011-12-01,10000.0,-0.148904
3,1075269,2011-12-01,5000.0,-1.621698
4,1072053,2011-12-01,3000.0,-1.602118
5,1069908,2011-12-01,12000.0,2.122347
6,1064687,2011-12-01,9000.0,-2.561695
7,1069866,2011-12-01,3000.0,-2.374362
8,1069057,2011-12-01,10000.0,1.063048
9,1069759,2011-12-01,1000.0,-4.626377


In [19]:
for test in df_predictions_in_sample['id'].head():
    print(test)

1077501
1077175
1076863
1075269
1072053


In [20]:
rows = df_predictions_in_sample.to_dict(orient='records')

In [21]:
rows[:5]

[{'id': 1077501,
  'issue_d': Timestamp('2011-12-01 00:00:00'),
  'loan_amnt': 5000.0,
  'predicted_roi': -2.4066829681396484},
 {'id': 1077175,
  'issue_d': Timestamp('2011-12-01 00:00:00'),
  'loan_amnt': 2400.0,
  'predicted_roi': -6.132463455200195},
 {'id': 1076863,
  'issue_d': Timestamp('2011-12-01 00:00:00'),
  'loan_amnt': 10000.0,
  'predicted_roi': -0.14890408515930176},
 {'id': 1075269,
  'issue_d': Timestamp('2011-12-01 00:00:00'),
  'loan_amnt': 5000.0,
  'predicted_roi': -1.6216981410980225},
 {'id': 1072053,
  'issue_d': Timestamp('2011-12-01 00:00:00'),
  'loan_amnt': 3000.0,
  'predicted_roi': -1.6021175384521484}]

In [None]:
r_guess = .13
r_min = -.999
r_max = .50

def get_one_loan_payment_data(payments, loan_id):
    return payments[payments['LOAN_ID'] == loan_id]

def calculative_npv_payments(payments_row, month_row, r_guess):
    return sum(payments_row/(1+r_guess)**(month_row/12))

def adjust_estimated_roi(roi_guess, roi_min, roi_max, npv):
    if npv > 0:
        new_guess = (roi_guess + roi_min)/2
        new_min = roi_min
        new_max = roi_guess
    elif npv < 0:
        new_guess = (roi_guess + roi_max)/2
        new_min = roi_guess
        new_max = roi_max
    else:
        return roi_guess
    
    return (new_guess, new_min, new_max)

rois_completed_loans = {}

for row in rows:
    loan_id = row['id']
    loan_size =  row['loan_amnt']
    try:
        payments = get_one_loan_payment_data(df_payments, loan_id)
    except:
        payments = []
        
    if len(payments) == 0:
        rois_completed_loans.update({loan_id:-1*100})
    else:    
        r_guess = .13
        r_min = -.999
        r_max = .50

        for _ in range(15):
            npv_payments = calculative_npv_payments(payments['RECEIVED_AMT_INVESTORS'],
                                                    payments['mths_since_issue'], r_guess)
            total_npv = loan_size - npv_payments
            r_guess, r_min, r_max = adjust_estimated_roi(r_guess, r_min, r_max, total_npv)
        rois_completed_loans.update({loan_id:r_guess*100})

In [None]:
len(rois_completed_loans)

In [35]:
len(rois_completed_loans)

29043