In [None]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mpl_toolkits
%matplotlib inline
import re
import tensorflow as tf
from keras.callbacks import ModelCheckpoint

In [None]:
# Load the two Zillow CSV exports (wake_co and durham_co files) into DataFrames
data_w, data_d = map(pd.read_csv, ("zillow_wake_co.csv", "zillow_durham_co_df.csv"))

In [None]:
# Stack the two frames row-wise into a single DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own row labels, so labels repeat in the combined frame and
# any later label-based lookup or alignment becomes ambiguous.
data = pd.concat([data_w, data_d], axis=0, ignore_index=True)
data

In [None]:
# Normalise lot sizes: rows whose props.lotAreaUnit is 'sqft' have their
# props.lotAreaValue divided by the number of square feet in one acre.
SQFT_PER_ACRE = 43560
sqft_rows = data['props.lotAreaUnit'] == 'sqft'
data.loc[sqft_rows, 'props.lotAreaValue'] /= SQFT_PER_ACRE
# Relabel the converted rows so that re-running this cell cannot divide the
# same values a second time.
# NOTE(review): assumes 'acres' is the label the remaining rows already use - confirm.
data.loc[sqft_rows, 'props.lotAreaUnit'] = 'acres'
data

In [None]:
# Extract the zip code: the comma-free text that follows "NC " and runs to the
# end of the address string. Addresses that do not match yield NaN.
data['zip'] = data['props.address'].str.extract('(?<=NC )([^,]*)(?=$)', expand=False)
# The former `data['zip'].astype(str).astype(int)` statement was removed: its
# result was discarded, and it raised ValueError whenever an address failed to
# match (the string 'nan' cannot be cast to int). The integer cast is applied
# further down, once rows with missing values have been dropped.
data['zip'].head()

In [None]:
# Extract the city: the first comma-free run of text that sits between ", " and
# the next comma in the address.
city_pattern = '(?<=, )([^,]*)(?=,)'
data['city'] = data['props.address'].str.extract(city_pattern, expand=False)
data['city'].head()

In [None]:
# Columns carried forward into the modelling frame
model_columns = [
    'city', 'zip', 'props.zestimate', 'props.propertyType',
    'props.bedrooms', 'props.bathrooms', 'props.livingArea',
    'props.lotAreaValue', 'props.latitude', 'props.longitude',
]
# Columns in which a value of 0 disqualifies the row
size_columns = ['props.bedrooms', 'props.bathrooms', 'props.livingArea']

data_df = data[model_columns]

# Keep rows where none of bedrooms / bathrooms / livingArea equals 0
# (missing values pass this test and are removed by dropna right after).
no_zero_sizes = data_df[size_columns].ne(0).all(axis=1)
data_df = data_df[no_zero_sizes].dropna()

data_df.head()

In [None]:
# Cast zip, bedrooms and bathrooms to integers, producing a new frame.
# NOTE(review): astype(int) truncates fractional values, so any half-bath
# counts in props.bathrooms would be rounded toward zero - confirm intended.
data_df = data_df.assign(**{
    'zip': data_df['zip'].astype(str).astype(int),
    'props.bedrooms': data_df['props.bedrooms'].astype(int),
    'props.bathrooms': data_df['props.bathrooms'].astype(int),
})
data_df.dtypes

In [None]:
# Summary statistics for the columns of data_df
data_df.describe()

In [None]:
# Number of distinct values in each column of data_df
data_df.nunique()

In [None]:
# Bar chart of how many rows have each bedroom count
data_df['props.bedrooms'].value_counts().plot(kind='bar')
plt.title('number of Bedroom')
plt.xlabel('Bedrooms')
plt.ylabel('Count')
# despine has to be called to remove the top/right spines; the bare name
# `sns.despine` only echoed the function object and changed nothing.
sns.despine()

In [None]:
# Scatter of the zestimate (x axis) against living area (y axis).
# Axis labels are set explicitly so the figure states which axis is which.
plt.scatter(data_df["props.zestimate"], data_df["props.livingArea"])
plt.title("Price vs Square Feet")
plt.xlabel("Zestimate (price)")
plt.ylabel("Living area (square feet)")

In [None]:
# Feature frame: data_df without the target (zestimate) and the coordinates
excluded_columns = ['props.zestimate', 'props.longitude', 'props.latitude']
train_data = data_df.drop(columns=excluded_columns)
train_data

In [None]:
# One-hot encode the categorical feature columns
categorical_cols = ['city', 'props.propertyType']
train_data = pd.get_dummies(data=train_data, columns=categorical_cols)
train_data

In [None]:
# Create an unfitted linear regression estimator; it is trained in a later cell
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

In [None]:
# Train-Test-Split data
from sklearn.model_selection import train_test_split

# Target vector: the zestimate column of data_df
labels = data_df['props.zestimate']
# Hold out 20% of the rows for testing; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    labels,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Fit the linear regression on the training split
reg.fit(X_train,y_train)

In [None]:
# Score the fitted linear regression on the held-out test split
reg.score(X_test,y_test)

In [None]:
####################
## LGBMRegressor  ##
####################

In [None]:
import lightgbm as ltb

# Fit a LightGBM regressor with its default settings on the training split
model = ltb.LGBMRegressor()
model.fit(X_train, y_train)

# Blank line, then the estimator's printed representation
print()
print(model)

# Ground truth and model predictions for the held-out test split
expected_y = y_test
predicted_y = model.predict(X_test)

In [None]:
from sklearn import metrics
# R^2 of the predictions against the test targets
r2_value = metrics.r2_score(expected_y, predicted_y)
print(r2_value)
# Mean squared logarithmic error of the same predictions
# NOTE(review): this metric is expected to reject negative inputs - confirm the
# model never predicts a negative price.
msle_value = metrics.mean_squared_log_error(expected_y, predicted_y)
print(msle_value)

In [None]:
    # Regression plot of predicted values against the expected test targets
    plt.figure(figsize=(10,10))
    # x and y are passed by keyword: seaborn 0.12+ made them keyword-only, so
    # the positional form raises TypeError there (and warned on 0.11).
    sns.regplot(x=expected_y, y=predicted_y, fit_reg=True, scatter_kws={"s": 100})

In [None]:
####################
##  XGBRegressor  ##
####################

In [None]:


from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor


# XGBoost regressor with default settings
model = XGBRegressor()
# Evaluation scheme: 10-fold cross-validation repeated 3 times, fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
# The scorer returns negated MAE values, so take absolute values right away
scores = absolute(
    cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
    )
)
mae_mean, mae_std = scores.mean(), scores.std()
print('Mean MAE: %.3f (%.3f)' % (mae_mean, mae_std))

In [None]:
# Fit `model` on the full training split
model.fit(X_train, y_train)

In [None]:
# Score the fitted `model` on the held-out test split
model.score(X_test,y_test)

In [None]:
################################
##  Random Forest Regression  ##
################################

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor

In [None]:
# Build a 100-tree random forest with a fixed seed, then fit it on the training split
rf_params = {"n_estimators": 100, "random_state": 42}
rf_regressor = RandomForestRegressor(**rf_params)
rf_regressor.fit(X_train, y_train)

In [None]:
# R^2 of the random forest on the data it was trained on
y_pred_train = rf_regressor.predict(X_train)
r2_score(y_true=y_train, y_pred=y_pred_train)

In [None]:
# R^2 of the random forest on the held-out test split
y_pred = rf_regressor.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
##################################
## Gradient Boosting Regressor  ##
##################################

In [None]:
from sklearn import ensemble
# Hyper-parameters for the gradient boosting regressor
gbr_params = {
    "n_estimators": 400,
    "max_depth": 5,
    "min_samples_split": 2,
    "learning_rate": 0.1,
    "loss": 'squared_error',
}
clf = ensemble.GradientBoostingRegressor(**gbr_params)

In [None]:
# Fit the gradient boosting regressor on the training split
clf.fit(X_train, y_train)

In [None]:
# Score the fitted gradient boosting regressor on the held-out test split
clf.score(X_test,y_test)

In [None]:
## Train the model with PMML wrapper
from sklearn2pmml.pipeline import PMMLPipeline

# Gradient boosting regressor that becomes the single step of the pipeline
boosting_step = ensemble.GradientBoostingRegressor(
    n_estimators=400,
    max_depth=5,
    min_samples_split=2,
    learning_rate=0.1,
    loss='squared_error',
)
# Rebinds `clf` to a PMMLPipeline whose only step is named "classifier"
clf = PMMLPipeline([("classifier", boosting_step)])

In [None]:
# Fit the PMML pipeline on the training split
clf.fit(X_train, y_train)

In [None]:
# Score the fitted PMML pipeline on the held-out test split
clf.score(X_test,y_test)

In [None]:
## Save model to PMML
from sklearn2pmml import sklearn2pmml

# Export the fitted pipeline `clf` to the file "boosting_model.pmml"
sklearn2pmml(clf, "boosting_model.pmml", with_repr=True)

In [None]:
# Convert the fitted random forest (rf_regressor) to pure JavaScript source code
# NOTE(review): the earlier comment called this the "best model", but no cell in
# this notebook selects one - confirm rf_regressor is the model meant for export.
import m2cgen as m2c 

model_to_javascript = m2c.export_to_javascript(rf_regressor) 
model_to_javascript