## Loading Dataset

In [295]:
import pickle
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, LassoLars, BayesianRidge, TweedieRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [296]:
# Load the post-EDA dataset. The context manager guarantees the file handle is
# closed even if unpickling raises (a bare open() inside pickle.load() leaks it).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted pipeline.
with open("datafiles/after-EDA.ft", "rb") as dataset_file:
    df = pickle.load(dataset_file)

# Drop the free-text description and its subjectivity score; the numeric
# desc_sentiment column stays in the frame.
df = df.drop(columns=["description", "desc_subjectivity"])
df.head()

Unnamed: 0,totPurchaseAmt,bathrooms,bedrooms,livingArea,yearBuilt,lotSize,averageSchoolRating,zip,zestimate,desc_sentiment
1,198000.0,2.0,4.0,1716.0,1996,10716,5.333333,55303,285985.0,-0.017857
2,130000.0,2.0,4.0,864.0,1963,12100,5.666667,55448,132773.0,-0.094286
4,329900.0,3.0,4.0,1814.0,2001,10018,6.5,55038,318162.0,-0.094286
5,262000.0,2.0,3.0,2158.0,1985,12632,4.666667,55043,272169.0,0.296383
7,237000.0,2.0,3.0,2058.0,1959,10890,3.666667,55432,246844.0,0.383032


## Data Scaling 

In [297]:
# Min-max scale the continuous columns in place; the remaining columns are
# left untouched here.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later in this notebook, so the min/max statistics
# include rows that end up in the test set. Consider fitting on training rows only.
scaler = MinMaxScaler()
attributes = [
    'livingArea',
    'yearBuilt',
    'lotSize',
    'averageSchoolRating',
    "desc_sentiment",
]
scaler.fit(df[attributes])
df[attributes] = scaler.transform(df[attributes])
df.head()

Unnamed: 0,totPurchaseAmt,bathrooms,bedrooms,livingArea,yearBuilt,lotSize,averageSchoolRating,zip,zestimate,desc_sentiment
1,198000.0,2.0,4.0,0.133289,0.858824,0.089136,0.541667,55303,285985.0,0.418367
2,130000.0,2.0,4.0,0.039591,0.664706,0.103295,0.583333,55448,132773.0,0.374694
4,329900.0,3.0,4.0,0.144067,0.888235,0.081996,0.6875,55038,318162.0,0.374694
5,262000.0,2.0,3.0,0.181898,0.794118,0.108738,0.458333,55043,272169.0,0.597933
7,237000.0,2.0,3.0,0.170901,0.641176,0.090917,0.333333,55432,246844.0,0.647447


## Converting Categorical Variables 

In [298]:
# One-hot encode the columns that are treated as categories; each distinct
# value becomes its own indicator column (e.g. bathrooms_1.0, zip_55708).
categorical_columns = ["bathrooms", "bedrooms", "zip"]
df = pd.get_dummies(df, columns=categorical_columns)
df.head()

Unnamed: 0,totPurchaseAmt,livingArea,yearBuilt,lotSize,averageSchoolRating,zestimate,desc_sentiment,bathrooms_0.5,bathrooms_0.75,bathrooms_1.0,...,zip_55708,zip_55732,zip_55733,zip_55906,zip_55929,zip_55992,zip_56071,zip_56097,zip_56434,zip_56751
1,198000.0,0.133289,0.858824,0.089136,0.541667,285985.0,0.418367,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,130000.0,0.039591,0.664706,0.103295,0.583333,132773.0,0.374694,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,329900.0,0.144067,0.888235,0.081996,0.6875,318162.0,0.374694,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,262000.0,0.181898,0.794118,0.108738,0.458333,272169.0,0.597933,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,237000.0,0.170901,0.641176,0.090917,0.333333,246844.0,0.647447,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Train Test Split 

In [299]:
# Separate the regression target from the predictors.
X = df.drop(columns=["totPurchaseAmt"])
y = df["totPurchaseAmt"]

# Fixed seed: without it every Restart & Run All draws a different 70/30
# partition, so none of the metrics reported below could be reproduced.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# zestimate is pulled out of the feature matrices and kept aside per split;
# later cells compare it directly against the target as a baseline.
X_train_zestimate = X_train["zestimate"]
X_train = X_train.drop(columns=["zestimate"])

X_test_zestimate = X_test["zestimate"]
X_test = X_test.drop(columns=["zestimate"])

In [300]:
# Preview the first five rows of the training feature matrix.
X_train.head(n=5)

Unnamed: 0,livingArea,yearBuilt,lotSize,averageSchoolRating,desc_sentiment,bathrooms_0.5,bathrooms_0.75,bathrooms_1.0,bathrooms_1.2,bathrooms_1.25,...,zip_55708,zip_55732,zip_55733,zip_55906,zip_55929,zip_55992,zip_56071,zip_56097,zip_56434,zip_56751
2169,0.091939,0.764706,0.068635,0.583333,0.760335,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20363,0.202353,0.770588,0.090917,0.5,0.418367,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15653,0.276476,0.841176,0.094927,0.5,0.685055,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,0.12977,0.647059,0.073085,0.291667,0.62038,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23221,0.031783,0.329412,0.095377,0.25,0.418367,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Models

In [301]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` over all elements.

    Both inputs are flattened before they are compared. Without that step a
    column vector of predictions with shape ``(n, 1)`` subtracted from targets
    with shape ``(n,)`` is broadcast by NumPy into an ``(n, n)`` matrix and the
    function silently returns a wrong number.

    Parameters
    ----------
    y_true : array-like
        Observed target values. Entries equal to zero make the result
        ``inf`` or ``nan`` because each error is divided by its target.
    y_pred : array-like
        Predicted values; must contain the same number of elements as
        ``y_true``.

    Returns
    -------
    float
        The mean absolute percentage error, e.g. ``25.2`` for 25.2 %.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of
        elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, got "
            "%d and %d" % (y_true.size, y_pred.size)
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

### Multiple Linear Regression Models

In [302]:
def _print_regression_report(actual, predicted):
    """Print MAE, MSE, R^2 and percentage error for one set of predictions."""
    print('Mean absolute error: %.2f' % mean_absolute_error(actual, predicted))

    print('Mean squared error: %.2f' % mean_squared_error(actual, predicted))

    print('Coefficient of determination: %.2f'% r2_score(actual, predicted))

    print ("% Error", mean_absolute_percentage_error(actual, predicted))
    print ("")


# Candidate linear-family regressors, all fitted on the same training split
# and scored on the same test split.
models = [
    LinearRegression(),
    Ridge(),
    RidgeCV(),
    LassoLars(),
    BayesianRidge(),
    TweedieRegressor(max_iter = 10000),
]

for x in models:
    # The estimator's repr serves as the heading for its metrics.
    print (x)

    lreg = x.fit(X_train, y_train)
    y_pred = lreg.predict(X_test)

    _print_regression_report(y_test, y_pred)

LinearRegression()
Mean absolute error: 10524213903709.16
Mean squared error: 611877736092332179099425439744.00
Coefficient of determination: -18839405132495114240.00
% Error 7488711544.516325

Ridge()
Mean absolute error: 58131.35
Mean squared error: 11350288265.22
Coefficient of determination: 0.65
% Error 25.21497892139944

RidgeCV(alphas=array([ 0.1,  1. , 10. ]))
Mean absolute error: 58131.35
Mean squared error: 11350288265.23
Coefficient of determination: 0.65
% Error 25.21497892139416

LassoLars()
Mean absolute error: 58313.18
Mean squared error: 11571862055.32
Coefficient of determination: 0.64
% Error 25.22404698946394

BayesianRidge()
Mean absolute error: 58215.48
Mean squared error: 11385979351.49
Coefficient of determination: 0.65
% Error 25.234575798408716

TweedieRegressor(max_iter=10000)
Mean absolute error: 98752.57
Mean squared error: 28130832680.59
Coefficient of determination: 0.13
% Error 44.43375468025835



In [303]:
# Baseline: percentage error of the held-out zestimate column against the
# actual purchase amount on the test split.
mean_absolute_percentage_error(y_true=y_test, y_pred=X_test_zestimate)

25.450395024040045

In [304]:
# Same zestimate baseline, measured on the training split.
mean_absolute_percentage_error(y_true=y_train, y_pred=X_train_zestimate)

23.301211136427895

In [305]:
# Absolute error of the zestimate baseline on the test split, in the same
# units as the target column.
mean_absolute_error(y_true=y_test, y_pred=X_test_zestimate)

47075.68992922144

### Neural Networks 