In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from ggplot import *
%matplotlib inline

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


In [2]:
data_path = '/Users/erourke/Desktop'
loans_full = pd.read_csv(data_path+'/loans_full.csv', low_memory=False)
loans_details = pd.read_csv(data_path+'/loans_details.csv', low_memory=False)

In [4]:
additional_columns =  loans_full.columns.difference(loans_details.columns).tolist() + ['id']
df = loans_details.merge(loans_full[['id', 'borrower_count']], left_on='id', right_on='id')

In [5]:
df = df.rename(columns={'terms.disbursal_date': 'disbursal_date'})

## convert columns to datetime format
df['funded_date_cln'] = pd.to_datetime(df.funded_date.str[:10], infer_datetime_format=True)
df['post_date_cln'] = pd.to_datetime(df.posted_date.str[:10], infer_datetime_format=True)
df['disb_date_cln'] = pd.to_datetime(df.disbursal_date.str[:10], infer_datetime_format=True)
df['exp_date_cln'] = pd.to_datetime(df.planned_expiration_date.str[:10], infer_datetime_format=True)

## Time to fund is the funded date minus the posted date
df['time_to_fund'] =df['funded_date_cln'] - df['post_date_cln']
df['days_to_fund'] = df.time_to_fund.dt.days

## Time to dispursement is the Disbursed date minus the Posted Date
df['time_to_disb'] =df['disb_date_cln'] - df['post_date_cln']
df['days_to_disb'] = df.time_to_disb.dt.days

## Time to expiration is the expiration date minus the Posted Date
df['time_to_exp'] =df['exp_date_cln'] - df['post_date_cln']
df['days_to_exp'] = df.time_to_exp.dt.days

In [6]:
import statsmodels.formula.api as sm

# Set training-test split %
split_pct = 0.80

# the pandas command "sample" already randomizes its selection. 
loans_full_shuffled = df.sample(frac=1)

train_set = loans_full_shuffled[:int((len(loans_full_shuffled)+1*split_pct))] 
test_set = loans_full_shuffled[int(len(loans_full_shuffled)*split_pct+1):]  

# Now we have two separate datasets: train_set and test_set. We will run the regression on train_set
# using statsmodel's "ols" function. Let's try a simple example to start:

model1 = sm.ols(formula="days_to_fund ~ loan_amount", data=train_set).fit()
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:           days_to_fund   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     2448.
Date:                Fri, 26 May 2017   Prob (F-statistic):               0.00
Time:                        10:19:26   Log-Likelihood:            -4.8066e+05
No. Observations:              122331   AIC:                         9.613e+05
Df Residuals:                  122329   BIC:                         9.613e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       6.9034      0.043    161.535      

In [7]:
import matplotlib.pyplot as plt
y_pred['predictions'] = model1.predict(test_set)
y_pred

fig, ax = plt.subplots()
ax.plot(test_set['loan_amount'], test_set['days_to_fund'], 'o', label="Data")
ax.plot(test_set['loan_amount'], y_pred['predictions'], 'r--.', label="Predicted")
axes = plt.gca()
axes.set_xlim([0,2000])
axes.set_ylim([0,100])
ax.legend(loc="best");

NameError: name 'y_pred' is not defined