In [1]:
import datetime
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd
from datetime import datetime as dt
from xgboost import XGBRegressor
from math import sqrt
%run src/columns.py
%run src/data-cleaning.py
%run src/feature-engineering.py
%run src/modeling.py

ModuleNotFoundError: No module named 'xgboost'

In [2]:
csv_file_names = ('LoanStats3a_securev1', 'LoanStats3b_securev1', 'LoanStats3c_securev1', 'LoanStats3d_securev1',
                  'LoanStats_securev1_2016Q1', 'LoanStats_securev1_2016Q2', 'LoanStats_securev1_2016Q3',
                  'LoanStats_securev1_2016Q4', 'LoanStats_securev1_2017Q1', 'LoanStats_securev1_2017Q2', 
                  'LoanStats_securev1_2017Q3', 'LoanStats_securev1_2017Q4', 'LoanStats_securev1_2018Q1')

loans = load_data(csv_file_names, columns_to_use, number_of_rows=5000)
loans = drop_loans_not_complete(loans)
loans = drop_loan_status(loans)
loans = drop_joint_applicant_loans(loans)
loans = fix_rate_cols(loans)
loans = fix_date_cols(loans)
loans = clean_loan_term_col(loans)
loans = only_include_36_month_loans(loans)
loans = clean_employment_length(loans)
loans = create_missing_data_boolean_columns(loans)
loans = fill_nas(loans, value=-99)
loans = add_issue_date_and_month(loans)
loans = add_supplemental_rate_data(loans)
loans = create_rate_difference_cols(loans)
loans = create_months_since_earliest_cl_col(loans)
loans = create_loan_life_months_col(loans)
loans['roi'] = create_roi_col(loans['total_rec_prncp'], loans['total_rec_int'], loans['loan_amnt'], loans['loan_life_months'])
loans['roi'].replace(np.inf, 0, inplace=True)
loans = change_data_types(loans)
loans = create_dummy_cols(loans)
loans = drop_unnecessary_cols(loans)
# Loans before this date will be used for training.
# Loans after this date used for testing.
cutoff_date = '2015-09-01'
training_loans, testing_loans = get_training_and_testing_data(loans, cutoff_date)
X_train, y_train = split_data_into_labels_and_target(training_loans)
X_test, y_test = split_data_into_labels_and_target(testing_loans)

In [4]:
model = XGBRegressor()
fit_model = train_model(model, X_train, y_train)

test_predictions = get_predictions(fit_model, X_test)

In [5]:
test_predictions = get_predictions(fit_model, X_test)
test_predictions

array([ -6.660845  ,   4.1249332 ,   2.5834808 , ...,   3.256811  ,
       -17.233112  ,   0.31343883], dtype=float32)

Get Payment History Data

In [6]:
testing_loans.head()

Unnamed: 0,id,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
9043,68495092,8650,19.889999,320.98999,8.0,55000.0,2015-12-01,25.49,0,675,...,0,0,0,0,0,0,0,0,0,0
9044,68466961,28000,6.49,858.049988,10.0,92000.0,2015-12-01,21.6,0,720,...,0,0,0,0,0,0,0,0,0,0
9045,68466916,25000,7.49,777.549988,10.0,109000.0,2015-12-01,26.02,0,745,...,0,0,0,0,0,0,0,0,0,0
9046,68355089,24700,11.99,820.280029,10.0,65000.0,2015-12-01,16.059999,1,715,...,0,0,0,0,0,0,0,1,0,0
9047,67275481,20000,8.49,631.26001,10.0,85000.0,2015-12-01,17.610001,1,705,...,0,0,1,0,0,0,0,0,0,0


In [7]:
test_predictions

array([ -6.660845  ,   4.1249332 ,   2.5834808 , ...,   3.256811  ,
       -17.233112  ,   0.31343883], dtype=float32)

In [8]:
df_loans_available = testing_loans[['id', 'issue_d', 'loan_amnt']]
df_loans_available['predicted_roi'] = test_predictions
df_loans_available.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,id,issue_d,loan_amnt,predicted_roi
9043,68495092,2015-12-01,8650,-6.660845
9044,68466961,2015-12-01,28000,4.124933
9045,68466916,2015-12-01,25000,2.583481
9046,68355089,2015-12-01,24700,1.311146
9047,67275481,2015-12-01,20000,3.582185
9048,68476668,2015-12-01,20000,2.919997
9049,68338832,2015-12-01,1400,1.552664
9050,68616873,2015-12-01,8000,-0.659829
9051,68426831,2015-12-01,11950,-1.68711
9052,68607141,2015-12-01,17600,-0.311338


In [23]:
%run src/portfolio.py

In [12]:
myPortfolio = Portfolio(10000, 50, datetime.date(2015,12,1))

In [13]:
myPortfolio.date

datetime.date(2015, 12, 1)

In [14]:
myPortfolio.increment_date_by_one_month()
myPortfolio.date

datetime.date(2016, 1, 1)

In [15]:
myPortfolio.active_loans

[]

In [16]:
myPortfolio.date

datetime.date(2016, 1, 1)

In [19]:
df_payments = pd.read_pickle('data/cleaned_payments_data', compression='bz2')

In [20]:
df_loans_available.head(10)

Unnamed: 0,id,issue_d,loan_amnt,predicted_roi
9043,68495092,2015-12-01,8650,-6.660845
9044,68466961,2015-12-01,28000,4.124933
9045,68466916,2015-12-01,25000,2.583481
9046,68355089,2015-12-01,24700,1.311146
9047,67275481,2015-12-01,20000,3.582185
9048,68476668,2015-12-01,20000,2.919997
9049,68338832,2015-12-01,1400,1.552664
9050,68616873,2015-12-01,8000,-0.659829
9051,68426831,2015-12-01,11950,-1.68711
9052,68607141,2015-12-01,17600,-0.311338


In [25]:
### def simulation:
%run src/portfolio.py
myPortfolio.get_payments_for_given_date(df_loans_available)

AttributeError: 'Portfolio' object has no attribute 'get_payments_for_given_date'

In [None]:
def convert_df_rows_to_loans(df, investment):
    loans = []
    rows = df.to_dict(orient='records')
    for row in rows:
        loans.append(Loan(row['id'], row['loan_amnt'], investment))
    return loans

def get_loans_available_for_given_date(loans_df):
    '''
    date parameter needs to be of type datetime.date
    '''
    return loans_df[(loans_df['issue_d'].dt.year == self.date.year) & (loans_df['issue_d'].dt.month == self.date.month)]

def get_top_n_loans_to_buy(loans, n):
    loans.sort_values(by='predicted_roi', ascending=False, inplace=True)
    return loans.head(n)

def get_loans_over_required_roi_threshold(df, min_roi):
    return df[df['predicted_roi'] >= min_roi]

def get_payments_for_date(payments_df, date):
    '''
    date parameter needs to be of type datetime.date
    '''
    return payments_df[(payments_df['RECEIVED_D'].dt.year == date.year) & (payments_df['RECEIVED_D'].dt.month == date.month)]

def get_top_n_loans_to_buy(loans, n):
    loans.sort_values(by='predicted_roi', ascending=False, inplace=True)
    return loans.head(n)

def add():
    pass