<a href="https://colab.research.google.com/github/D-Studios/Predicting-Median-Home-Prices-And-Inflation-Adjusted-Prices-In-The-USA-Given-A-Date/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Project: Predicting Median Home Prices And Inflation Adjusted Prices In The USA Given A Date
#Created By: Devang Bhatnagar

import requests
import pandas as pd
import datetime as dt
import numpy as np
from datetime import datetime
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import math

#Download the historical home price table from the website and store it as a csv file.
url = 'https://dqydj.com/historical-home-prices/'
#A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
#Fail loudly on an HTTP error status instead of trying to parse an error page as the table.
response.raise_for_status()
html = response.content
df_list = pd.read_html(html)
#The first table found on the page is taken as the price table.
df = df_list[0]
print(df)
#The default index column is written as well; the cleaning cell reads the data columns back by the names '0', '1' and '2'.
df.to_csv('data.csv')

In [None]:
#This function will aid in determining what century the year is in.
def to_datetime(dates, historical_data=True, pivot=21):
  """Expand two-digit years in "month/day/year" strings to four digits.

  The list is modified in place and also returned.

  Args:
    dates: list of strings in month/day/year format.
    historical_data: when True, a two-digit year greater than ``pivot`` is
      placed in the 1900s and any other two-digit year in the 2000s. When
      False the year is kept as typed.
    pivot: largest two-digit year that still belongs to the 2000s. Raise it
      when the data reaches past 2021.

  Returns:
    The same list object, each entry rewritten as "month/day/year".

  Raises:
    IndexError: if an entry has fewer than three "/"-separated fields.
    ValueError: if the year field is not an integer.
  """
  for i, date in enumerate(dates):
    fields = date.split("/")
    year = int(fields[2])
    #Years that already carry a century (e.g. 1953) must not be shifted a second time.
    if historical_data and 0 <= year < 100:
      year += 1900 if year > pivot else 2000
    dates[i] = "/".join((fields[0], fields[1], str(year)))
  return dates


In [None]:
#The csv file is loaded and every table value is converted from a string to a number that can be used for computation.
Housing = pd.read_csv('data.csv')
#Row 0 is skipped (presumably the scraped table's own header row - confirm against data.csv).
#.copy() makes the slice an independent frame so the column assignments below do not raise SettingWithCopyWarning.
Housing = Housing.iloc[1: , :].copy()
housing_array = to_datetime(list(Housing['0']))
Housing['0'] = pd.to_datetime(housing_array)
#Dates become day counts; 712954 is datetime.date(1953, 1, 1).toordinal(), so 1953-01-01 maps to 0.
Housing['0'] = Housing['0'].map(dt.datetime.toordinal) - 712954
#Strip dollar signs and thousands separators; a raw-string pattern avoids the invalid "\$" and "\," escape sequences.
for price_column in ('1', '2'):
  Housing[price_column] = Housing[price_column].replace({r'[$,]': ''}, regex=True)
Housing['0'] = Housing['0'].astype(int)
Housing['1'] = Housing['1'].astype(float)
Housing['2'] = Housing['2'].astype(float)
Housing

In [None]:
#The non-linear prices are log-transformed so that a linear regression can fit them more accurately.
#Iterating over the column values avoids depending on the frame's index labels running from 1 to len(Housing).
#Plain lists are kept on purpose: the residuals cell indexes Y_test1 and Y_test2 by position.
Y1 = [math.log(price) for price in Housing['1']]
Y2 = [math.log(price) for price in Housing['2']]
print(Y1)
Y2


In [None]:
#X holds the date column, where each date is stored as an ordinal-based day number.
X = Housing.loc[:, '0']
X

In [None]:
#train_test_split is imported to split the data into training and testing datasets.
from sklearn.model_selection import train_test_split

In [None]:
#Training and testing datasets are created for both the Median Home Price (NSA) and the Inflation Adjusted Price.
#A fixed random_state makes the 80/20 split, and every metric and plot derived from it, reproducible across runs.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X, Y1, test_size=0.2, random_state=42)
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X, Y2, test_size=0.2, random_state=42)

In [None]:
#Each target gets its own linear regression model.
model1, model2 = linear_model.LinearRegression(), linear_model.LinearRegression()

In [None]:
#Each X training set is turned into a single-column 2D array and fitted against its matching Y_train.
model1.fit(np.array(X_train1)[:, np.newaxis], Y_train1)
model2.fit(np.array(X_train2)[:, np.newaxis], Y_train2)

In [None]:
#Each fitted model predicts the outputs for the inputs held out in its test set.
Y_pred1 = model1.predict(np.array(X_test1)[:, np.newaxis])
Y_pred2 = model2.predict(np.array(X_test2)[:, np.newaxis])
Y_pred1

In [None]:
#For each model, print its equation (coefficients and intercept), mean squared error, and r^2 score on the test set.
for heading, model, actual, predicted in (
    ("Median Home Price (NSA)", model1, Y_test1, Y_pred1),
    ("\n\nInflation Adjusted Price", model2, Y_test2, Y_pred2),
):
  print(heading)
  print('Coefficients:', model.coef_)
  print('Intercept:', model.intercept_)
  print('Mean squared error (MSE): %.2f' % mean_squared_error(actual, predicted))
  print('Coefficient of determination (R^2): %.2f' % r2_score(actual, predicted))

In [None]:
#Seaborn is used for statistical data visualization.
import seaborn as sns

In [None]:
#The output values of both testing datasets are shown, separated by blank lines.
print(Y_test1, "\n\n\n", Y_test2, sep="\n")

In [None]:
#The outputs predicted for the x-values of both testing datasets are shown, separated by blank lines.
print(Y_pred1, "\n\n\n", Y_pred2, sep="\n")

In [None]:
#Scatterplot with the testing outputs (Y_test1) along the x-axis and the model's predictions (Y_pred1) along the y-axis.
sns.scatterplot(y=Y_pred1, x=Y_test1)

In [None]:
#Y_test1 (x-axis) against Y_pred1 (y-axis) again, with the points drawn as + signs instead of dots.
sns.scatterplot(marker="+", x=np.asarray(Y_test1), y=np.asarray(Y_pred1))

In [None]:
#Y_test1 (x-axis) against Y_pred1 (y-axis) again, with the points made mostly transparent (alpha=0.2) instead of changing the marker.
sns.scatterplot(alpha=0.2, x=np.asarray(Y_test1), y=np.asarray(Y_pred1))

In [None]:
#Scatterplot with the testing outputs (Y_test2) along the x-axis and the model's predictions (Y_pred2) along the y-axis.
sns.scatterplot(y=np.asarray(Y_pred2), x=np.asarray(Y_test2))

In [None]:
#Y_test2 (x-axis) against Y_pred2 (y-axis) again, with the points drawn as + signs instead of dots.
sns.scatterplot(marker="+", x=np.asarray(Y_test2), y=np.asarray(Y_pred2))

In [None]:
#Y_test2 (x-axis) against Y_pred2 (y-axis) again, with the points made mostly transparent (alpha=0.2) instead of changing the marker.
sns.scatterplot(alpha=0.2, x=np.asarray(Y_test2), y=np.asarray(Y_pred2))

In [None]:
#Residuals are the actual test values minus the predicted values, paired up position by position.
residuals_median_home_price = [
    actual - predicted for actual, predicted in zip(Y_test1, Y_pred1)
]
residuals_inflation_adjusted_price = [
    actual - predicted for actual, predicted in zip(Y_test2, Y_pred2)
]

In [None]:
#Residuals of the Median Home Price model (y-axis) plotted against the tested x-points (x-axis).
sns.scatterplot(y=residuals_median_home_price, x=X_test1)

In [None]:
#Median Home Price residuals against the tested x-points again, with the points drawn as + signs instead of dots.
sns.scatterplot(marker='+', x=X_test1, y=residuals_median_home_price)

In [None]:
#Median Home Price residuals against the tested x-points again, with the points made mostly transparent (alpha=0.2) instead of changing the marker.
sns.scatterplot(alpha=0.2, x=X_test1, y=residuals_median_home_price)

In [None]:
#Residuals of the Inflation Adjusted Price model (y-axis) plotted against the tested x-points (x-axis).
sns.scatterplot(y=residuals_inflation_adjusted_price, x=X_test2)

In [None]:
#Inflation Adjusted Price residuals against the tested x-points again, with the points drawn as + signs instead of dots.
sns.scatterplot(marker='+', x=X_test2, y=residuals_inflation_adjusted_price)

In [None]:
#Inflation Adjusted Price residuals against the tested x-points again, with the points made mostly transparent (alpha=0.2) instead of changing the marker.
sns.scatterplot(alpha=0.2, x=X_test2, y=residuals_inflation_adjusted_price)

In [None]:
#In this block of code, the program will try to predict the Median Home Price and Inflation Adjusted Price given user entered dates.
print("Predicting Home Prices Using Dates: ")
while True:
  #input() stays outside the try block so that interrupting the kernel (KeyboardInterrupt) or a
  #closed stdin (EOFError) ends the loop instead of being swallowed and re-prompting forever.
  date=input("Enter in a date in month/day/year format. Type STOP to stop. ").strip()
  if date=="STOP":
    break
  try:
    #historical_data=False: the year is used as typed instead of being moved into the 1900s.
    date_array=to_datetime([date], False)
    #Same day numbering as the training data: ordinal minus 712954 (the ordinal of 1953-01-01).
    date_array=pd.to_datetime(date_array).map(dt.datetime.toordinal)-712954
    x = np.array(date_array).reshape(-1,1)
    #The models were fitted on log prices, so exp() converts each prediction back to dollars.
    y=int(math.exp(model1.predict(x)[0]))
    print("The Median Home Price (NSA) on this date will probably be around " + str(y)+" dollars")
    y2=int(math.exp(model2.predict(x)[0]))
    print("The Inflation Adjusted Price on this date will probably be around " + str(y2) + " dollars")
    print("--------------------------------------------------------------------------------------------")
  except Exception:
    #Bad dates (unparseable text, missing fields, out-of-range values) land here; Exception leaves KeyboardInterrupt alone.
    print("An error has occurred. Please try again.")

Predicting Home Prices Using Dates: 
