In [1]:
import datetime
import pickle
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd
from datetime import datetime as dt

%run src/columns.py
%run src/data-cleaning.py
%run src/feature-engineering.py

In [2]:
loans = pd.read_pickle('data/raw_dataframe.pkl.bz2')
loans = drop_loan_status(loans)
loans = drop_joint_applicant_loans(loans)
loans = fix_rate_cols(loans)
loans.dropna(subset=['issue_d'], inplace=True)
loans = fix_date_cols(loans)
loans = clean_loan_term_col(loans)
loans = only_include_36_month_loans(loans)
loans = clean_employment_length(loans)
loans = create_missing_data_boolean_columns(loans)
loans = fill_nas(loans, value=-99)
loans = add_issue_date_and_month(loans)
loans = add_supplemental_rate_data(loans)
loans = create_rate_difference_cols(loans)
loans = create_months_since_earliest_cl_col(loans)
loans = create_loan_life_months_col(loans)
loans = change_data_types(loans)
loans = create_dummy_cols(loans)
loans = drop_unnecessary_cols(loans)

In [4]:
loans.head(5)

Unnamed: 0,id,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
0,1077501,5000.0,10.65,162.869995,10.0,24000.0,2011-12-01,27.65,0,735.0,...,0,0,0,0,0,0,0,0,0,0
1,1077175,2400.0,15.96,84.330002,10.0,12252.0,2011-12-01,8.72,0,735.0,...,0,0,0,0,0,0,0,1,0,0
2,1076863,10000.0,13.49,339.309998,10.0,49200.0,2011-12-01,20.0,0,690.0,...,0,0,0,0,0,1,0,0,0,0
3,1075269,5000.0,7.9,156.460007,3.0,36000.0,2011-12-01,11.2,0,730.0,...,0,0,0,0,0,0,0,0,0,1
4,1072053,3000.0,18.639999,109.43,9.0,48000.0,2011-12-01,5.35,0,660.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
loans.set_index('id', inplace=True)

In [8]:
loans.issue_d.max()

Timestamp('2018-06-01 00:00:00')

In [None]:
def get_training_and_testing_data(df, split_date='2015-09-01'):
    '''
    Loans before the cutoff date will be used for training the model, the others will be used for
    simulating and evaluating the model's performance.
    '''
    training_loans = df[df['issue_d'].isin(pd.date_range('2010-01-01', split_date)) == True]
    testing_loans = df[df['issue_d'].isin(pd.date_range('2010-01-01', split_date)) == False]
    return training_loans, testing_loans

In [9]:
cutoff_date = '2015-09-01'
training_loans, testing_loans = get_training_and_testing_data(loans, cutoff_date)

In [10]:
len(training_loans)

529507

In [12]:
with open('data/loan_rois.pickle', 'rb') as handle:
    loan_rois = pickle.load(handle)
    
len(loan_rois)

529507

In [13]:
training_loans.index = training_loans.index.astype(int)
testing_loans.index = testing_loans.index.astype(int)

In [14]:
roi_col = pd.DataFrame.from_dict(loan_rois, orient='index')
roi_col.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 529507 entries, 1077501 to 36271262
Data columns (total 1 columns):
0    529507 non-null float64
dtypes: float64(1)
memory usage: 8.1 MB


In [15]:
training_loans['roi'] = roi_col
training_loans['roi'].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


id
1077501    10.289307
1077175    16.248779
1076863    13.707275
1075269     7.783084
1072053    19.114990
1069908    12.606201
1064687   -97.884317
1069866     9.896030
1069057   -30.444461
1069759    16.585693
Name: roi, dtype: float64

In [20]:
training_loans.to_pickle('data/cleaned_training_loans.pkl.bz2', compression='bz2')
testing_loans.to_pickle('data/cleaned_testing_loans.pkl.bz2', compression='bz2')