In [1]:
# Import Dependencies
import pandas as pd
import sqlite3
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot
from scipy.stats.stats import pearsonr
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import matplotlib.pyplot as plt
import xgboost

In [2]:
# Create connection with the existing database file.
# A plain sqlite3.connect(path) silently creates a new, empty database when the
# path does not exist, which only shows up later as an empty table list and a
# "no such table" error. Opening through a URI with mode=rw never creates the
# file, so a wrong path raises sqlite3.OperationalError right here instead.
dbfile = '../housedata.db'
con = sqlite3.connect(f'file:{dbfile}?mode=rw', uri=True)

In [3]:
# Open a cursor and collect one row per table registered in sqlite_master
cur = con.cursor()
cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
table_list = list(cur)
# Show the table names that were found
print(table_list)

[]


In [4]:
# Load every row and column of joined_table into a DataFrame
db_df = pd.read_sql_query(sql='SELECT * FROM joined_table', con=con)

DatabaseError: Execution failed on sql 'SELECT * FROM joined_table': no such table: joined_table

In [None]:
# Preview the first 10 rows of the loaded data
db_df.head(10)

In [None]:
# List the column names of the loaded data
db_df.columns

In [None]:
# Check the datatype of each column
db_df.dtypes

In [None]:
# Remove the columns that are not used in the rest of the analysis
unused_columns = [
    'type', 'decommissioned', 'acceptable_cities', 'unacceptable_cities',
    'state', 'county', 'timezone', 'area_codes', 'world_region', 'country',
    'latitude', 'longitude', 'irs_estimated_population_2015', 'id',
    'sqft_above', 'sqft_basement', 'lat', 'long', 'sqft_living15',
    'sqft_lot15', 'zip',
]
db_df = db_df.drop(columns=unused_columns)

In [None]:
# Confirm which columns remain after the drop
db_df.columns

In [None]:
# Gather every column name, then count the distinct values held by each column
db_list = db_df.columns.tolist()
db_df.loc[:, db_list].nunique()

In [None]:
# Report how many null values each column contains
null_counts = db_df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

In [None]:
# Keep only the first four characters of the date string (the year)
db_df['date'] = db_df['date'].str.slice(0, 4)

In [None]:
# Preview the first 10 rows after trimming the date column
db_df.head(10)

In [None]:
# Cast the sale year and the build year to numbers,
# then derive the age of each house at the time of sale
for year_col in ('date', 'yr_built'):
    db_df[year_col] = pd.to_numeric(db_df[year_col])
db_df['house_age'] = db_df['date'].sub(db_df['yr_built'])
db_df.head(5)

In [None]:
# Custom encode for primary city:
# replace each city name with the rank of its average house price
# (ascending=False, so rank 1 is the city with the highest average price).
# NOTE(review): the rank is derived from price over the full dataset, before the
# train/test split further down, so this feature carries target information from
# rows that later end up in the test set. Consider ranking on the training rows only.
average_price = db_df.groupby("primary_city")["price"].mean().rank(axis=0, ascending=False)
# Convert to dataframe
average_price_df = average_price.to_frame(name="city_rank")

# Merge rank with housedata and drop primary city.
# `columns=` is used because the positional axis argument of DataFrame.drop
# (the former `.drop("primary_city", 1)`) was removed in pandas 2.0.
db_df = db_df.merge(average_price_df, on="primary_city", how="left").drop(columns="primary_city")
db_df.head()

In [None]:
# Coerce every column to a numeric dtype; values that cannot be parsed become NaN
db_df = db_df.apply(pd.to_numeric, errors='coerce')
# Inspect the resulting dtypes
db_df.dtypes

In [None]:
# Save DataFrame to csv in the current working directory.
# The row index is written too (as the first, unnamed column) because index=False is not passed.
db_df.to_csv('merged_data.csv')

In [None]:
# Treat houses priced at 4,000,000 or more as outliers and keep only the rest
is_below_cap = db_df['price'].lt(4000000)
db_df = db_df.loc[is_below_cap]
db_df.head(10)

In [None]:
# Pearson correlation matrix, reduced to the absolute correlation of each
# feature with price, sorted from strongest to weakest, and shown as a bar chart
corr_full = db_df.corr(method="pearson")
corr = (
    corr_full[["price"]]
    .abs()
    .drop(index="price")
    .sort_values(by=["price"], ascending=False)
)
corr.plot(kind="bar", y="price")

In [None]:
# Select features whose absolute correlation with price is strictly greater than 0.05
# NOTE(review): the threshold in the code is 0.05, not 0.5 — confirm which cutoff is intended.
sel_feature_pearsons = corr[corr["price"]>0.05].index.values.tolist()
sel_feature_pearsons

In [None]:
# Restrict the full correlation matrix to the columns of the selected features
corr_test = corr_full.filter(items=sel_feature_pearsons, axis="columns")
corr_test

In [None]:
# Create independent (X) and dependent (y) variables
X = db_df[sel_feature_pearsons]
y = db_df['price']

# Split the data into training/testing sets.
# random_state is fixed so the split, and therefore every metric reported
# below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Transform y_train to log scale; y_test stays in original price units, so
# predictions must be passed through np.exp before being compared with it.
y_train = np.log(y_train)

# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data only.
X_scaler = scaler.fit(X_train)
# Scaling both splits with the statistics learned from the training split.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create linear regression object
regr = LinearRegression()
# Train the model using the training sets (target is log(price))
regr = regr.fit(X_train_scaled, y_train)
# Make predictions using the testing set and map them back from log scale to price
y_pred = regr.predict(X_test_scaled)
y_pred = np.exp(y_pred)

# The mean absolute error (previously printed under the "Root Mean Squared Error" label)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
# The root mean squared error: square root of the mean squared error
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))

In [None]:
# Linear Regression: predicted vs. actual prices, with the actual prices
# plotted against themselves in red as the perfect-prediction reference
fig, ax = plt.subplots(figsize=(15, 5))
ax.scatter(y_test, y_pred)
ax.scatter(y_test, y_test, color="red")

In [None]:
# Create Random Forest Regressor object with 500 trees
rf_reg = RandomForestRegressor(random_state=0, n_estimators=500)
# Train on the scaled training data (target is log(price))
rf_reg = rf_reg.fit(X_train_scaled, y_train)
# Predict on the test set and map the predictions back from log scale to price
rf_pred_y = rf_reg.predict(X_test_scaled)
rf_pred_y = np.exp(rf_pred_y)

# The mean absolute error (previously printed under the "Root Mean Squared Error" label)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_pred_y))
# The root mean squared error: square root of the mean squared error
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, rf_pred_y)))
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, rf_pred_y))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, rf_pred_y))

In [None]:
rf_reg = RandomForestRegressor(random_state=0, n_estimators=1000)
rf_reg = rf_reg.fit(X_train_scaled, y_train)
rf_pred_y = rf_reg.predict(X_test_scaled)
rf_pred_y = np.exp(rf_pred_y)

# The root mean squared error
print('Root Mean Squared Error:', metrics.mean_absolute_error(y_test, rf_pred_y))
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, rf_pred_y))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, rf_pred_y))

In [None]:
# Random Forest: predicted vs. actual prices, with the actual prices
# plotted against themselves in red as the perfect-prediction reference
fig, ax = plt.subplots(figsize=(15, 5))
ax.scatter(y_test, rf_pred_y)
ax.scatter(y_test, y_test, color="red")

In [None]:
# Create an XGBoost regressor model.
xgb_model = xgboost.XGBRegressor(learning_rate=0.05, max_depth=5,
                                 min_child_weight=1.5,n_estimators=500,subsample=0.8,seed=42)
# Fitting the model on the scaled training data (target is log(price))
xgb_model = xgb_model.fit(X_train_scaled, y_train)
# Making predictions using the testing data and mapping them back from log scale to price
y_pred_xgb = xgb_model.predict(X_test_scaled)
y_pred_xgb = np.exp(y_pred_xgb)
# The mean absolute error (previously printed under the "Root Mean Squared Error" label)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_xgb))
# The root mean squared error: square root of the mean squared error
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_xgb)))
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred_xgb))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred_xgb))

# Predicted vs. actual prices, with the actual prices plotted against
# themselves in red as the perfect-prediction reference
plt.figure(figsize=(18,8))
plt.scatter(y_test, y_pred_xgb)
plt.scatter(y_test,y_test,color="red")