In [1]:
import wrangle
import pandas as pd
import numpy as np
import modeling
import explore
import prepare

import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_selection import SelectKBest, f_regression, RFE

from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb

from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.metrics import explained_variance_score

import matplotlib.pyplot as plt

In [2]:
train, validate, test = wrangle.wrangle_zillow()

In [3]:
train.shape, validate.shape, test.shape

((25908, 21), (8637, 21), (8637, 21))

In [4]:
train.tract

50660    4300
14557    5508
47007     758
26586      66
50383     423
         ... 
637        76
37673     759
32816     996
23077    1375
34188     994
Name: tract, Length: 25908, dtype: int64

In [5]:
train.describe().astype(int).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
parcelid,25908,13007162,2586803,10711855,11503757,12653312,14128263,162960814
bathrooms,25908,2,0,0,2,2,2,5
bedrooms,25908,3,0,0,3,3,4,6
sqft,25908,1722,666,128,1240,1573,2073,5256
fireplacecnt,25908,0,0,0,0,0,0,4
garagecarcnt,25908,0,0,0,0,0,2,9
lotsize,25908,7160,3127,236,5457,6624,8039,21663
poolcnt,25908,0,0,0,0,0,0,1
lat,25908,34,0,33,33,34,34,34
long,25908,-118,0,-119,-118,-118,-117,-117


In [6]:
train.tract.nunique()

1324

In [37]:
train.columns

Index(['parcelid', 'bathrooms', 'bedrooms', 'sqft', 'county', 'fireplacecnt',
       'garagecarcnt', 'lotsize', 'poolcnt', 'lat', 'long', 'logerror',
       'tract', 'tax_value', 'Los Angeles', 'Orange', 'Ventura', 'age',
       '4plusBath', '3to5garage', '3plusBR', 'low_tract', 'mid_tract',
       'high_tract'],
      dtype='object')

In [7]:
features = ['bathrooms', 'bedrooms', 'sqft', 'fireplacecnt',
       'garagecarcnt', 'lotsize', 'poolcnt', 
       'age',
       '4plusBath', '3to5garage', '3plusBR']

In [48]:
# create X and y
X_train = train[features]
y_train = train['tax_value']

X_validate = validate[features]
y_validate = validate['tax_value']

In [49]:
# create MinMaxScaler and fit to train
scaler = MinMaxScaler()
X_train[features] = scaler.fit_transform(X_train[features])
X_validate[features] = scaler.transform(X_validate[features])

# make a dataframe
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)



In [50]:
def predict_baseline(y_train):
# 1. Predict value_pred_mean
    value_pred_mean = y_train['tax_value'].mean()
    y_train['value_pred_mean'] = value_pred_mean
    y_validate['value_pred_mean'] = value_pred_mean

    # 2. compute value_pred_median
    value_pred_median = y_train['tax_value'].median()
    y_train['value_pred_median'] = value_pred_median
    y_validate['value_pred_median'] = value_pred_median

    # 3. RMSE of value_pred_mean
    rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_mean)**(1/2)
    rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_mean)**(1/2)

    print("RMSE using Mean\nTrain/In-Sample:        ", round(rmse_train, 2), 
          "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

    # 4. RMSE of value_pred_median
    rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_median)**(1/2)
    rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_median)**(1/2)

    print("\nRMSE using Median\nTrain/In-Sample:        ", round(rmse_train, 2), 
          "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

In [51]:
predict_baseline(y_train)

RMSE using Mean
Train/In-Sample:         233115.06 
Validate/Out-of-Sample:  234810.22

RMSE using Median
Train/In-Sample:         235121.87 
Validate/Out-of-Sample:  236574.11


In [44]:
# scale y
# target_scaler = QuantileTransformer()
# y_train.tax_value = target_scaler.fit_transform(y_train[['tax_value']])
# y_validate.tax_value = target_scaler.transform(y_validate[['tax_value']])

In [52]:
# create the model object
lm = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm.fit(X_train, y_train['tax_value'])

# predict train
y_train['value_pred_lm'] = lm.predict(X_train)
y_train['value_pred_lm'] = target_scaler.inverse_transform(y_train[['value_pred_lm']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_lm)**(1/2)

# predict validate
y_validate['value_pred_lm'] = lm.predict(X_validate)
y_validate['value_pred_lm'] = target_scaler.inverse_transform(y_validate[['value_pred_lm']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_lm)**(1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for OLS using LinearRegression
Training/In-Sample:  672949.4732126321 
Validation/Out-of-Sample:  675190.5314208487


In [None]:
# create the model object
lars = LassoLars(alpha=0)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lars.fit(X_train, y_train['tax_value'])

# predict train
y_train['value_pred_llars'] = lars.predict(X_train)
#y_train['value_pred_llars'] = target_scaler.inverse_transform(y_train[['value_pred_llars']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_llars)**(1/2)

# predict validate
y_validate['value_pred_llars'] = lars.predict(X_validate)
#y_validate['value_pred_llars'] = target_scaler.inverse_transform(y_validate[['value_pred_llars']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_llars)**(1/2)

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

In [34]:
train['low_tract'] = np.where(train['tract'] < 3000 ,1,0)
train['mid_tract'] = np.where((train['tract'] >= 3000) & (train['tract'] < 6200), 1, 0)
train['high_tract'] = np.where(train['tract'] > 6200, 1 , 0)

In [38]:
validate['low_tract'] = np.where(validate['tract'] < 3000 ,1,0)
validate['mid_tract'] = np.where((validate['tract'] >= 3000) & (validate['tract'] < 6200), 1, 0)
validate['high_tract'] = np.where(validate['tract'] > 6200, 1 , 0)

In [18]:
train[features]

Unnamed: 0,bathrooms,bedrooms,sqft,fireplacecnt,garagecarcnt,lotsize,poolcnt,lat,long,tract,Los Angeles,Orange,age,4plusBath,3to5garage,3plusBR
50660,2.0,4.0,1444.0,0.0,0.0,7490.0,1.0,34.148374,-117.932549,4300,1,0,58.0,0,0,0
14557,2.0,3.0,1545.0,0.0,0.0,8807.0,0.0,33.952336,-118.132573,5508,1,0,71.0,0,0,0
47007,3.0,4.0,2286.0,0.0,2.0,8400.0,0.0,33.775387,-117.824133,758,0,1,57.0,0,0,0
26586,2.0,3.0,1431.0,1.0,2.0,11100.0,0.0,34.213986,-118.883992,66,0,0,59.0,0,0,0
50383,2.5,3.0,1463.0,0.0,2.0,4400.0,0.0,33.520144,-117.700476,423,0,1,29.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637,3.0,4.0,2694.0,2.0,2.0,10148.0,0.0,34.263311,-118.871266,76,0,0,36.0,0,0,0
37673,2.5,3.0,1385.0,0.0,2.0,2032.0,0.0,33.790663,-117.859077,759,0,1,38.0,0,0,0
32816,1.5,2.0,1350.0,1.0,2.0,2062.0,0.0,33.754233,-118.038927,996,0,1,49.0,0,0,0
23077,2.0,3.0,1266.0,0.0,0.0,13351.0,0.0,34.163516,-118.599202,1375,1,0,63.0,0,0,0


In [33]:
features = ['4plusBath', 'bedrooms', 'sqft']

In [41]:
# create array of models trained by tract
model_dict = {}
tract_frame = train[train.groupby('tract')['tract'].transform('size') > 15]
tract_set = set(tract_frame.tract.tolist())

# create dictionary of models for most common tracts
for tract in tract_set:
    #get subset of data in that tract
    tract_data = train[train.tract == tract]
    
    # create X and y
    tract_X = tract_data[features]
    tract_y = tract_data['tax_value']
    
    # scale
    scaler = MinMaxScaler()
    tract_X = scaler.fit_transform(tract_X)
    
    # create and fit model
    model = LassoLars(alpha=0)
    model.fit(tract_X, tract_y)
    
    # add fit model to dictionary with key of tract
    model_dict[tract] = model, scaler
   
# create generic model
X_gen = train[features]
y_gen = train.tax_value

gen_scaler = MinMaxScaler()
X_gen = gen_scaler.fit_transform(X_gen)

gen_model = LassoLars(normalize=True)

gen_model.fit(X_gen, y_gen)
model_dict['generic'] = gen_model, gen_scaler

In [42]:
model_dict

{3: (LassoLars(alpha=0), MinMaxScaler()),
 8: (LassoLars(alpha=0), MinMaxScaler()),
 9: (LassoLars(alpha=0), MinMaxScaler()),
 10: (LassoLars(alpha=0), MinMaxScaler()),
 11: (LassoLars(alpha=0), MinMaxScaler()),
 12: (LassoLars(alpha=0), MinMaxScaler()),
 13: (LassoLars(alpha=0), MinMaxScaler()),
 14: (LassoLars(alpha=0), MinMaxScaler()),
 15: (LassoLars(alpha=0), MinMaxScaler()),
 16: (LassoLars(alpha=0), MinMaxScaler()),
 17: (LassoLars(alpha=0), MinMaxScaler()),
 18: (LassoLars(alpha=0), MinMaxScaler()),
 19: (LassoLars(alpha=0), MinMaxScaler()),
 28: (LassoLars(alpha=0), MinMaxScaler()),
 29: (LassoLars(alpha=0), MinMaxScaler()),
 31: (LassoLars(alpha=0), MinMaxScaler()),
 33: (LassoLars(alpha=0), MinMaxScaler()),
 36: (LassoLars(alpha=0), MinMaxScaler()),
 45: (LassoLars(alpha=0), MinMaxScaler()),
 47: (LassoLars(alpha=0), MinMaxScaler()),
 50: (LassoLars(alpha=0), MinMaxScaler()),
 52: (LassoLars(alpha=0), MinMaxScaler()),
 53: (LassoLars(alpha=0), MinMaxScaler()),
 54: (LassoLar

In [60]:
# make predicitons:

def get_predicitons(df, features, model_dict):
    
    tract_df = df[df['tract'].isin(list(model_dict.keys()))]
    non_df = df[~df['tract'].isin(list(model_dict.keys()))]

    tract_df
    non_df
#     gen_model = model_dict.get('generic')[0]
#     gen_scaler = model_dict.get('generic')[1]
    
#     y_features = ['tax_value', 'parcelid']

#     X = non_df[features]
#     y = non_df[y_features]
    
#     # transform X
#     gen_scaler.transform(X)
#     # make non_df predictions
#     y['predicted'] = gen_model.predict(X)
    
#     error = mean_squared_error(y['tax_value'], y['predicted'], squared=False)
#     r2_score = (y['tax_value'], y['predicted'])
    
#     for row in tract_df.index:
#         print(row)
#         X_tract = row[features]
#         y_tract = row[y_features]
        
#         model = model_dict.get(tract)[0]
#         scaler = model_dict.get(tract)[1]
        
#         X_tract = scaler.transform(X_tract)
#         y_tract['predicted'] = model.predict(X_tract)
        
#         y = pd.concat([y,y_tract])
    
#
    #return y, error     
    
    
    
    

In [62]:
y_preget_predicitons(validate, features, model_dict)

In [56]:
print(RMSE)

1384936166.6036766


In [57]:
y_pred

Unnamed: 0,tax_value,parcelid,predicted
31596,45155.0,12370886,1.245888e+09
27163,814837.0,11551468,1.169814e+09
18690,371363.0,14260387,1.424818e+09
22090,66884.0,12503139,1.129430e+09
27573,498697.0,12061796,8.226748e+08
...,...,...,...
44395,53172.0,10966115,8.032764e+08
43867,280224.0,10966081,1.344534e+09
12926,336932.0,11410188,1.412899e+09
46367,403130.0,11999403,1.046523e+09


In [31]:
tract_df = validate[~validate['tract']= 3]
X = tract_df[features]
y = tract_df['tax_value']

model = model_dict.get('generic')[0]
scaler = model_dict.get('generic')[1]

X = scaler.transform(X)
y_pred = model.predict(X)

RMSE = mean_squared_error(y, y_pred, squared=False)


In [32]:
RMSE

197572.161491384

In [20]:
print(model_dict.get('4300'))

None


In [None]:
# create the model object
glm = TweedieRegressor(power=0, alpha=0) # changed power to 0 since we normalized the target

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
glm.fit(X_train, y_train['tax_value'])

# predict train
y_train['value_pred_glm'] = glm.predict(X_train)
#y_train['value_pred_glm'] = target_scaler.inverse_transform(y_train[['value_pred_glm']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_glm)**(1/2)

# predict validate
y_validate['value_pred_glm'] = glm.predict(X_validate)
#y_validate['value_pred_glm'] = target_scaler.inverse_transform(y_validate[['value_pred_glm']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_glm)**(1/2)

print("RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

# Polynomial Regression

In [None]:
# create the model object
pf = PolynomialFeatures(degree=2)

# create the X and y datasets
X2_train = train[features]

X2_validate = validate[features]

 

In [None]:
# fit the model to our training data. We must specify the column in y_train, 
lm2 = LinearRegression(normalize=True)

# since we have converted it to a dataframe from a series! 
lm2.fit(X2_train, y_train.tax_value)

# predict train
y_train['value_pred_lm2'] = lm2.predict(X2_train)
#y_train['value_pred_lm2'] = target_scaler.inverse_transform(y_train[['value_pred_lm2']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_lm2)**(1/2)

# predict validate
y_validate['value_pred_lm2'] = lm2.predict(X2_validate)
#y_validate['value_pred_lm2'] = target_scaler.inverse_transform(y_validate[['value_pred_lm2']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_lm2)**(1/2)

print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

In [None]:
y_train

In [None]:
def zillow_model(model, X_train, y_train, X_val, y_val, features):
    
    
    model.fit(X_train[features], y_train)
    
    yhat = model.predict(X_train[features])
    yhat_val = model.predict(X_val[features])
    
    rmse = mean_squared_error(y_train, yhat, squared=False)
    rmse_val = mean_squared_error(y_val, yhat_val, squared=False)
    print("RMSE for",model,"\nTraining/In-Sample: ", rmse,
          "\nValidation/Out-of-Sample: ", rmse_val)
    return model, rmse, rmse_val, features
      


In [None]:
results = pd.DataFrame(columns = ['model', 'RMSE_train', 'RMSE_val', 'features'])

In [None]:
model_list = [LinearRegression(normalize=True), LassoLars(alpha=0), TweedieRegressor(power=0, alpha=0) ]

In [None]:
zillow_model(LinearRegression(normalize=True), train, train.tax_value, validate, validate.tax_value, features)

In [None]:
results.loc[len(results)] = newresult

In [None]:
results

In [None]:
model_list = [LinearRegression(normalize=False), LassoLars(alpha=0), TweedieRegressor(power=0, alpha=0) ]

In [None]:
for model in model_list:
    newresult = zillow_model(model, train, train.tax_value, validate, validate.tax_value, features)
    results.loc[len(results)] = newresult

In [None]:
train.columns

In [46]:
features = ['bathrooms',
  'bedrooms',
  'sqft',
  'fireplacecnt',
  'garagecarcnt',
  'lotsize',
  'poolcnt',
  'lat',
  'long',
  'tract',
  'Los Angeles',
  'Orange',
  'age',
  '4plusBath',
  '3to5garage']

In [None]:
results[results.RMSE_val == results.RMSE_val.min()].features.tolist()

In [None]:
results.RMSE_val.min()