In [1]:
from io import BytesIO
import boto3
import multiprocessing as mp
import pickle
import pandas as pd

def load_data_from_s3(filename, format='csv'):
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket='loan-analysis-data', Key=filename)
    data = obj['Body'].read()
    f = BytesIO(data)
    if format=='csv':
        df = pd.read_csv(f, low_memory=False)
    if format=='pkl.bz2':
        df = pd.read_pickle(f, compression='bz2')
    return df    

In [2]:
# Read in loan ROIs
with open('data/loan_rois.pickle', 'rb') as handle:
    loan_rois = pickle.load(handle)

In [3]:
# Read in training data
training_loans = pd.read_pickle('data/cleaned_training_loans.pkl.bz2')

In [4]:
# Read in testing data
testing_loans = pd.read_pickle('data/cleaned_testing_loans.pkl.bz2')

In [5]:
training_loans.head()

Unnamed: 0,id,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
0,1077501,5000,10.65,162.869995,10.0,24000.0,2011-12-01,27.65,0,735.0,...,0,0,0,0,0,0,0,0,0,0
1,1077175,2400,15.96,84.330002,10.0,12252.0,2011-12-01,8.72,0,735.0,...,0,0,0,0,0,0,0,1,0,0
2,1076863,10000,13.49,339.309998,10.0,49200.0,2011-12-01,20.0,0,690.0,...,0,0,0,0,0,1,0,0,0,0
3,1075269,5000,7.9,156.460007,3.0,36000.0,2011-12-01,11.2,0,730.0,...,0,0,0,0,0,0,0,0,0,1
4,1072053,3000,18.639999,109.43,9.0,48000.0,2011-12-01,5.35,0,660.0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
testing_loans.head()

Unnamed: 0,id,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
335281,68407277,3600,13.99,123.029999,10.0,55000.0,2015-12-01,5.91,0,675.0,...,0,0,0,0,0,0,0,0,0,0
335282,68495092,8650,19.889999,320.98999,8.0,55000.0,2015-12-01,25.49,0,675.0,...,0,0,0,0,0,0,0,0,0,0
335283,68466961,28000,6.49,858.049988,10.0,92000.0,2015-12-01,21.6,0,720.0,...,0,0,0,0,0,0,0,0,0,0
335284,68466916,25000,7.49,777.549988,10.0,109000.0,2015-12-01,26.02,0,745.0,...,0,0,0,0,0,0,0,0,0,0
335285,68354783,9600,7.49,298.579987,8.0,60000.0,2015-12-01,22.440001,0,695.0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
training_loans.set_index('id', inplace=True)

In [7]:
#training_loans['roi'].loc[1077501] = 34827943

In [8]:
training_loans.index[:10]

Index([1077501, 1077175, 1076863, 1075269, 1072053, 1069908, 1064687, 1069866,
       1069057, 1069759],
      dtype='object', name='id')

In [9]:
training_loans['roi'].head()

id
1077501    5.300597
1077175    9.418818
1076863    6.704011
1075269    3.937061
1072053    9.234829
Name: roi, dtype: float32

In [10]:
len(loan_rois)

529507

In [37]:
roi_col = pd.DataFrame.from_dict(loan_rois, orient='index')
roi_col.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 529507 entries, 1077501 to 36271262
Data columns (total 1 columns):
0    529507 non-null float64
dtypes: float64(1)
memory usage: 8.1 MB


In [54]:
training_loans['roi'] = roi_col
training_loans['roi'].head(10)

id
1077501    10.289307
1077175    16.248779
1076863    13.707275
1075269     7.783084
1072053    19.114990
1069908    12.606201
1064687   -97.884317
1069866     9.896030
1069057   -30.444461
1069759    16.585693
Name: roi, dtype: float64

In [None]:
pd.read_pickle('data/cleaned_training_loans.pkl.bz2')

In [57]:
training_loans.to_pickle('data/cleaned_training_loans.pkl.bz2', compression='bz2')

In [58]:
testing_loans.set_index('id', inplace=True)
testing_loans.head(5)

Unnamed: 0_level_0,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,fico_range_high,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
68407277,3600,13.99,123.029999,10.0,55000.0,2015-12-01,5.91,0,675.0,679.0,...,0,0,0,0,0,0,0,0,0,0
68495092,8650,19.889999,320.98999,8.0,55000.0,2015-12-01,25.49,0,675.0,679.0,...,0,0,0,0,0,0,0,0,0,0
68466961,28000,6.49,858.049988,10.0,92000.0,2015-12-01,21.6,0,720.0,724.0,...,0,0,0,0,0,0,0,0,0,0
68466916,25000,7.49,777.549988,10.0,109000.0,2015-12-01,26.02,0,745.0,749.0,...,0,0,0,0,0,0,0,0,0,0
68354783,9600,7.49,298.579987,8.0,60000.0,2015-12-01,22.440001,0,695.0,699.0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
testing_loans.to_pickle('data/cleaned_testing_loans.pkl.bz2', compression='bz2')

In [61]:
payments = load_data_from_s3('cleaned_payments_data.pkl.bz2', format='pkl.bz2')

In [62]:
payments.head()

Unnamed: 0,LOAN_ID,RECEIVED_D,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue
0,54734,2009-09-01,632.770996,18636.410156,1
1,54734,2009-10-01,632.770996,18188.363281,2
2,54734,2009-11-01,632.770996,17735.876953,3
3,54734,2009-12-01,632.770996,17278.90625,4
4,54734,2010-01-01,632.770996,16817.404297,5


In [71]:
test = payments.set_index(['RECEIVED_D', 'LOAN_ID'])
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue
RECEIVED_D,LOAN_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-09-01,54734,632.770996,18636.410156,1
2009-10-01,54734,632.770996,18188.363281,2
2009-11-01,54734,632.770996,17735.876953,3
2009-12-01,54734,632.770996,17278.90625,4
2010-01-01,54734,632.770996,16817.404297,5


In [67]:
test.loc[54734, '2009-09-01']

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue
LOAN_ID,RECEIVED_D,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
54734,2009-09-01,632.770996,18636.410156,1


In [74]:
from datetime import datetime

In [93]:
test.loc[datetime(2007,7,1)]

Unnamed: 0_level_0,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue
LOAN_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
72176,7.189307,219.55983,1
73582,7.289357,219.637436,1
74505,7.25625,219.611313,1
77792,3.975833,121.962997,1
81085,9.03231,268.539795,1
81085,9.03231,262.019165,1
83185,19.331249,609.375916,1
83489,18.119135,560.89679,1
83979,7.769166,243.779495,1
84098,6.9921,219.401703,1


In [98]:
test[['RECEIVED_AMT_INVESTORS', 'PBAL_END_PERIOD_INVESTORS']].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS
RECEIVED_D,LOAN_ID,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-09-01,54734,632.770996,18636.410156
2009-10-01,54734,632.770996,18188.363281
2009-11-01,54734,632.770996,17735.876953
2009-12-01,54734,632.770996,17278.90625
2010-01-01,54734,632.770996,16817.404297


In [96]:
test.to_pickle('data/cleaned_payments_data_indexed.pkl.bz2', compression='bz2')