In [27]:
import pandas as pd
import numpy as np
from datetime import date

In [28]:
# import sys
# sys.path.append('.../Jane')

import extract_data_ez
import transformers_ez

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
zillow_full_initial = extract_data_ez.zillow_full
zillow_full_initial = pd.read_csv('zillow_full_imputed.csv')

In [30]:
cluster_df_initial = pd.read_csv('all_areas_clusters.csv')
cluster_df = cluster_df_initial.copy()
cluster_df.set_index('Date',inplace = True)

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [32]:
drop_cols = ['City','State','Metro','County','Zipcode','SizeRank','Year','State-County']
for col in drop_cols:
    if col in cluster_df:
        del cluster_df[col]

In [33]:
dict_of_dummy_var = {'Clusters':'Cluster','Zipcode':'Zip'}

for var, prefix in dict_of_dummy_var.items():
    if var in cluster_df:
        dummy_df = pd.get_dummies(cluster_df[var], prefix=prefix, drop_first=True)
        cluster_df = pd.concat([cluster_df, dummy_df], axis=1)

In [34]:
exclude_cols = ['Clusters','Rent']
reg_vars = []
for col in cluster_df.columns:
    if col not in exclude_cols:
        reg_vars.append(col)

In [35]:
def train_test(X,y):
    '''
    Input the output of the preProc function. Make sure 'Date' is the index of X dataframe.
    '''
    Xtrain = X.loc[X.index <'2019-01-01']
    train_index = Xtrain.shape[0]
    Xtest = X[train_index:]
    ytrain = y[:train_index]
    ytest = y[train_index:]
    return Xtrain,Xtest,ytrain,ytest

In [36]:
cluster_df['avg_commute_time'] = cluster_df['avg_commute_time'] / 60

In [37]:
X = cluster_df[reg_vars]
y = cluster_df['Rent']

X_train, X_test, y_train, y_test = train_test(X, y)

In [38]:
lasso = Lasso()

In [39]:
train_R2 = []
test_R2  = []
train_predictions = []
test_predictions = []

alphaRange = np.linspace(1e-10,0.003,200)
#alphaRange = [.0001]
for alpha in alphaRange:
    lasso.set_params(alpha=alpha, normalize=True, max_iter=10000)  
    lasso.fit(X, y)
    train_R2.append(lasso.score(X_train, y_train))
    test_R2.append(lasso.score(X_test, y_test))
    
    test_prediction = lasso.predict(X_test)
    train_prediction = lasso.predict(X_train)
    
    train_predictions.append(abs(train_prediction-y_train).mean())
    test_predictions.append(abs(test_prediction-y_test).mean())

In [40]:
train_R2

[0.6693826873748288,
 0.6693828301618084,
 0.6693829472313182,
 0.6693830385476958,
 0.6693831041882079,
 0.6693831440881443,
 0.669383158291154,
 0.6693831468129965,
 0.6693831095316887,
 0.6693830465813431,
 0.6693829579650558,
 0.6693828436074745,
 0.6693827035538673,
 0.6693825378621272,
 0.6693823463946846,
 0.6693821292275144,
 0.6693818864279142,
 0.6693816178882377,
 0.6693813237445985,
 0.669381003838472,
 0.6693806582415511,
 0.6693802869882277,
 0.6693798900550474,
 0.6693794674415006,
 0.6693790191604155,
 0.6693785451956806,
 0.6693780455684947,
 0.6693775202625278,
 0.669376969253155,
 0.6693763925509713,
 0.6693757902911841,
 0.6693751622868324,
 0.6693745086845186,
 0.6693738288655566,
 0.669373040747606,
 0.6693725162991355,
 0.6693720003592671,
 0.6693714821671395,
 0.6693709588571835,
 0.669370430425722,
 0.6693698968689823,
 0.6693693581830904,
 0.6693688143640661,
 0.6693682654078211,
 0.6693677113101565,
 0.6693671520667552,
 0.6693665876731859,
 0.669366018124892

In [41]:
test_R2

[0.6475048446554118,
 0.6475042002906533,
 0.6475035277739873,
 0.6475028272862827,
 0.6475020984729939,
 0.6475013416628779,
 0.6475005566399981,
 0.6474997432980965,
 0.6474989024457597,
 0.6474980332550303,
 0.6474971356559371,
 0.6474962102568327,
 0.6474952567314072,
 0.6474942744550585,
 0.6474932649275904,
 0.6474922274665562,
 0.6474911610635521,
 0.6474900674136551,
 0.6474889438202973,
 0.6474877937556847,
 0.6474866161183516,
 0.6474854095004893,
 0.647484174254143,
 0.647482913069989,
 0.6474816224895482,
 0.6474803032446298,
 0.6474789564871359,
 0.6474775813564501,
 0.6474761774217154,
 0.6474747451689948,
 0.6474732868998183,
 0.647471799190966,
 0.6474702847106064,
 0.647468741649418,
 0.6474677858014226,
 0.6474686401848571,
 0.6474698008018149,
 0.6474709497319042,
 0.6474720932695829,
 0.6474732314159393,
 0.6474743641720573,
 0.6474754915390193,
 0.647476613517913,
 0.6474777301098277,
 0.647478841315855,
 0.6474799471370942,
 0.6474810475746424,
 0.6474821426296065

In [None]:
def get_mll_vals(temp_df,sale_prices):
    lm = LinearRegression()
    lm.fit(temp_df, sale_prices)

    r2 = lm.score(temp_df, sale_prices)
    residuals = sale_prices - lm.predict(temp_df)
    rss = np.sum((residuals)**2)

    coef = lm.coef_
    intercept = lm.intercept_
    prfx_vals = {'r^2':r2,
                 #'coef':coef,
                 #'intercept':intercept,
                 'rss':rss
                }
    return prfx_vals

In [42]:
lm = LinearRegression()
lm.fit(X_train, y_train)

LinearRegression()

In [43]:
r2 = lm.score(X_train, y_train)

In [44]:
r2

0.6702175865327185

In [45]:
r2_test = lm.score(X_test, y_test)

In [46]:
r2_test

0.639827350938885