<a href="https://colab.research.google.com/github/AymanTawfeeq01/426-House-price-Prediction-app/blob/main/436_Project_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Let's fit a model on the housing price data!
1. Import packages
2. Load Data
3. Fit a LR model


Note: For the model and the app to work, run all the cells below in order, from top to bottom.

# Import Packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, chi2

# Load Data

In [None]:
# Location of the training CSV; it is fetched over HTTP each time this cell runs.
DATA_URL = 'https://raw.githubusercontent.com/jmpark0808/pl_mnist_example/main/train_hp_msci436.csv'
raw_data_train = pd.read_csv(DATA_URL)
# Peek at the last rows as a quick check that the load worked.
raw_data_train.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [None]:
# Print the (rows, columns) shape, then display the dtype of every column.
frame_shape = raw_data_train.shape
print(frame_shape)
raw_data_train.dtypes

(1460, 81)


Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [None]:
# Keep only the float64/int64 columns, then replace every missing value with 0.
numeric_only = raw_data_train.select_dtypes(include=['float64', 'int64'])
df = numeric_only.fillna(0)
df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [None]:
def get_pearsons(df, target='SalePrice'):
    """Return the absolute Pearson correlation of every feature with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame containing the `target` column plus the candidate
        feature columns.
    target : str, default 'SalePrice'
        Name of the column every other column is correlated against.

    Returns
    -------
    dict
        Maps each non-target column name to |r|, the magnitude of its Pearson
        correlation coefficient with the target. Columns for which the
        coefficient is undefined (for example a constant column) map to 0.0
        so that the result can always be sorted.
    """
    # The target array is the same for every feature, so extract it once.
    tar_arr = df[target].values
    results = {}
    for col in df.drop(columns=[target]):
        # np.corrcoef divides by the standard deviation; a zero-variance
        # column produces NaN plus a RuntimeWarning, which is silenced here.
        with np.errstate(divide='ignore', invalid='ignore'):
            r = np.corrcoef(df[col].values, tar_arr)[0, 1]
        # NaN never compares as greater or smaller, so it would make a later
        # sort by value unreliable; treat "undefined" as "no correlation".
        results[col] = 0.0 if np.isnan(r) else float(abs(r))
    return results


# Score every feature against SalePrice and order them strongest first.
pearsons = get_pearsons(df)
sorted_terms = sorted(pearsons.items(), key=lambda item: item[1], reverse=True)

# Names of the eight best-ranked features; SalePrice is appended last so the
# target ends up as the final column when the frame is reduced to top_cols.
top_cols = [sorted_terms[rank][0] for rank in range(8)]
top_cols.append('SalePrice')

top_cols

['OverallQual',
 'GrLivArea',
 'GarageCars',
 'GarageArea',
 'TotalBsmtSF',
 '1stFlrSF',
 'FullBath',
 'TotRmsAbvGrd',
 'SalePrice']

In [None]:
# Reduce the frame to the selected features plus the target, in top_cols order.
df = df.loc[:, top_cols]
df.head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,SalePrice
0,7,1710,2,548,856,856,2,8,208500
1,6,1262,2,460,1262,1262,2,6,181500
2,7,1786,2,608,920,920,2,6,223500
3,7,1717,3,642,756,961,1,7,140000
4,8,2198,3,836,1145,1145,2,9,250000


In [None]:
# Training split: the first 80% of the rows, taken in file order (no shuffling).
values_arr = df.values
train_rows = int(0.8 * len(values_arr))
X = values_arr[:train_rows, :-1]   # every column except the last one
y = values_arr[:train_rows, -1]    # the last column

In [None]:
# Display the training targets (last column of the first 80% of the rows).
y

array([208500, 181500, 223500, ..., 233170, 245350, 173000])

# Fit a Linear Regression Model

In [None]:
# Fit a linear regression on the training split.
reg = LinearRegression()
reg.fit(X, y)
# Show the coefficients twice: as a NumPy array, then as a plain Python list.
for coef_view in (reg.coef_, reg.coef_.tolist()):
    print(coef_view)


[2.17368456e+04 4.96131681e+01 5.46839472e+03 4.43471064e+01
 3.63953996e+01 5.86326356e+00 4.30316755e+02 1.54346211e+02]
[21736.845557646797, 49.61316809228731, 5468.39471698553, 44.34710636332585, 36.395399643820774, 5.863263562950706, 430.31675484702197, 154.34621144382183]


In [None]:
# Hold-out split: every row after the first 80%, sliced once and reused below.
split_idx = int(0.8 * len(df.values))
X_val = df.values[split_idx:, 0:-1]
y_val = df.values[split_idx:, -1]

test_pred = reg.predict(X_val)
# scikit-learn metrics take (y_true, y_pred). MSE happens to be symmetric, so
# the printed value is unchanged, but r2_score is not, so keep one order
# throughout.
mse_test = mean_squared_error(y_val, test_pred)
r2 = r2_score(y_val, test_pred)
print(mse_test)
print(r2)

2435963567.118162
0.6367144928782085


# Try Standardizing (min-max scaling to [0, 1])

In [None]:
# Min-max scale every column: (value - column min) / (column max - column min).
# NOTE(review): min and max are taken over all rows, so the hold-out rows
# influence the scaling of the training rows.
col_min = df.min()
col_span = df.max() - col_min
df_standardized = (df - col_min) / col_span
# Training features from the scaled frame: first 80% of rows, all but the last column.
X = df_standardized.values[:int(0.8 * len(df.values)), :-1]





In [None]:
# Hold-out features from the scaled frame: rows after the 80% mark, all but the last column.
val_start = int(0.8 * len(df.values))
X_val = df_standardized.values[val_start:, :-1]

In [None]:
# Refit on the current X (the min-max-scaled features when the notebook is run
# top to bottom). y is still the unscaled SalePrice from the earlier split.
reg = LinearRegression().fit(X, y)
test_pred = reg.predict(X_val)

# True hold-out prices, taken from the unscaled frame.
y_val = df.values[int(0.8 * len(df.values)):, -1]

# scikit-learn metrics take (y_true, y_pred). MSE is symmetric, so the printed
# value is unchanged, but the order is kept consistent with r2_score.
mse_test = mean_squared_error(y_val, test_pred)
r2 = r2_score(y_val, test_pred)
print(mse_test)
print(r2)

2435963567.1181464
0.6367144928782109


In [None]:
# Coefficients of the most recently fitted model.
print(reg.coef_)

[195631.61001882 263346.69623386  21873.57886794  62884.1968232
 222375.89182373  25552.10260735   1290.95026454   1852.15453733]


In [None]:
# This is how to use the model: pass one row whose values follow the feature
# order of top_cols (everything except the trailing 'SalePrice').
sample = np.array([[7, 1710, 2, 548, 856, 856, 2, 8]])

# `reg` was last fitted on the min-max-scaled features, so raw values must be
# scaled with the same per-column min and max before predicting. Passing raw
# values straight in yields a meaningless price (around 7e8).
feature_min = df.iloc[:, :-1].min().values
feature_span = (df.iloc[:, :-1].max() - df.iloc[:, :-1].min()).values
reg.predict((sample - feature_min) / feature_span)

array([6.98376982e+08])

# User Interface


The code below builds the streamlit application. Ensure that you run all the cells for the app to run.

In [None]:
!pip install -q streamlit
!npm install localtunnel

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.8/164.8 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for validators (setup.py) ... [?25l[?25hdone
[K[?25h[37;40mnpm[0m [0m

In [None]:
%%writefile app.py
import numpy as np
import pandas as pd
import streamlit as st
import altair as alt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns


# Load data
raw_data_train = pd.read_csv('https://raw.githubusercontent.com/jmpark0808/pl_mnist_example/main/train_hp_msci436.csv')

# Keep only the float64/int64 columns and replace missing values with 0.
# (The bare `raw_data_train.tail()` that used to sit here was a notebook
# leftover: in a script its result is simply discarded.)
df = raw_data_train.select_dtypes(include = ['float64', 'int64']).fillna(0)

def get_pearsons(df, target='SalePrice'):
    """Return the absolute Pearson correlation of every feature with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame containing the `target` column plus the candidate
        feature columns.
    target : str, default 'SalePrice'
        Name of the column every other column is correlated against.

    Returns
    -------
    dict
        Maps each non-target column name to |r|, the magnitude of its Pearson
        correlation coefficient with the target. Columns for which the
        coefficient is undefined (for example a constant column) map to 0.0
        so that the result can always be sorted.
    """
    # The target array is the same for every feature, so extract it once.
    tar_arr = df[target].values
    results = {}
    for col in df.drop(columns=[target]):
        # np.corrcoef divides by the standard deviation; a zero-variance
        # column produces NaN plus a RuntimeWarning, which is silenced here.
        with np.errstate(divide='ignore', invalid='ignore'):
            r = np.corrcoef(df[col].values, tar_arr)[0, 1]
        # NaN never compares as greater or smaller, so it would make a later
        # sort by value unreliable; treat "undefined" as "no correlation".
        results[col] = 0.0 if np.isnan(r) else float(abs(r))
    return results


# Rank the features by |Pearson r| against SalePrice and keep the eight
# strongest; SalePrice is appended last so the target is the final column.
pearsons = get_pearsons(df)

sorted_terms = sorted(pearsons.items(), key=lambda x: x[1], reverse=True)
top_cols = [name for name, _ in sorted_terms[:8]]
top_cols.append('SalePrice')

df = df[top_cols]

# 80/20 split in row order: the first 80% trains, the remainder validates.
split_idx = int(0.8 * len(df.values))
X = df.values[:split_idx, 0:-1]
y = df.values[:split_idx, -1]
X_val = df.values[split_idx:, 0:-1]
y_val = df.values[split_idx:, -1]

# Fit once, on the raw (unscaled) features: the form further down passes the
# user's raw inputs straight to reg.predict. The previous version repeated
# this fit and evaluation a second time and then rebound X / X_val to
# min-max-scaled arrays that nothing used; both have been removed.
reg = LinearRegression().fit(X, y)

# Hold-out metrics; scikit-learn metrics take (y_true, y_pred).
test_pred = reg.predict(X_val)
mse_test = mean_squared_error(y_val, test_pred)
r2 = r2_score(y_val, test_pred)


## User interface

# Page-level settings: browser tab title and wide layout.
st.set_page_config(page_title="House Price Prediction", layout="wide")

# Title banner: a centred dark box with the app name and a short welcome text.
title_banner_html = """
    <div style='text-align: center; background-color: #333333; padding: 10px; margin-bottom: 20px;'>
        <h1 style='color: white;'>Housing Price Predictor</h1>
        <h4 style='color: white; text-align: center; font-size: 17px;'>Welcome to your house price prediction ML app! You can fill out the form below with your house information to obtain an estimate of the price for the house.</h4>
    </div>
    """
st.markdown(title_banner_html, unsafe_allow_html=True)

# Input form: one numeric field per model feature. Nothing is predicted until
# the submit button is pressed.
with st.form("my_form"):
    OverallQual = st.number_input("Overall quality of the house: Between 1 - 10", min_value=1, max_value=10, step=1)
    GrLivArea = st.number_input("Living area square feet above ground: E.x. 2000",min_value=0)
    GarageCars = st.number_input("How many cars does the garage fit: E.x. 2",min_value=0)
    GarageArea = st.number_input("Size of garage in square feet: E.x. 2000",min_value=0)
    TotalBsmtSF = st.number_input("Basement squarefootage: E.x. 1500",min_value=0)
    FirstFlrSF = st.number_input("First floor squarefootage E.x 2000:",min_value=0)
    FullBath = st.number_input("Number of full bathrooms: E.x. 2",min_value=0)
    TotRmsAbvGrd = st.number_input("Number of full rooms above ground (not including bathrooms): E.x. 2",min_value=0)

    submit_button = st.form_submit_button("Predict Price")

    # Perform actions when the submit button is clicked
    if submit_button:
        # One row, one value per feature. NOTE(review): this order is
        # hard-coded and must match the feature columns the model was fitted
        # on (top_cols without 'SalePrice'); confirm if the data changes.
        features = np.array([[OverallQual, GrLivArea, GarageCars, GarageArea,
                              TotalBsmtSF, FirstFlrSF, FullBath, TotRmsAbvGrd]])
        result = reg.predict(features)

        # Display the result. The div style previously read "#F08080;..."
        # with no property name, which is invalid CSS, so the background
        # colour was never applied.
        prediction_html = f"""
            <div style="background-color:#F08080;padding:10px">
                <h2 style="color:black;text-align:center;">The predicted sale price of your house is ${result[0]:,.2f}</h2>
            </div>
            """
        st.markdown(prediction_html,unsafe_allow_html=True)


# Section banner for the visualisations. The heading tags are now closed with
# matching tags (they were <h1>...</h3> and <h4>...</h3>) and the stray
# double semicolon in the div style is gone.
st.markdown(
    """
    <div style="text-align: center; background-color: #333333; padding: 10px;">
        <h1 style="color:white; background-color: #333333;">Prediction Insight</h1>
        <h4 style="color: white;background-color: #333333;font-size: 17px">The following Visualizations can be used to get a better understanding of the price predictions. </h4>
    </div>
    """,
    unsafe_allow_html=True
)

st.write(
    """
#### Select the variables you want to compare
"""
)
# Get variable options
variable_options = df.columns.tolist()

# Set default variables
default_x_variable = "OverallQual"
default_y_variable = "SalePrice"

# Select variables for the chart. Each default is only used when it is
# actually among the options; otherwise the first option is preselected
# instead of letting list.index() raise ValueError.
if default_x_variable in variable_options:
    x_default_index = variable_options.index(default_x_variable)
else:
    x_default_index = 0
x_variable = st.selectbox('Select X Variable', variable_options, index=x_default_index)

# The Y choices exclude the selected X, so X and Y can never be equal.
# When the user picks the default Y ("SalePrice") as X, it is missing from
# this list; previously that crashed the app with ValueError.
variable_options_y = [var for var in variable_options if var != x_variable]
if default_y_variable in variable_options_y:
    y_default_index = variable_options_y.index(default_y_variable)
else:
    y_default_index = 0
y_variable = st.selectbox('Select Y Variable', variable_options_y, index=y_default_index)

# Create the chart based on user-selected variables: the mean of Y for each
# distinct value of X, drawn as a line.
chart_data = df.groupby([x_variable])[y_variable].mean().sort_values(ascending=True)
chart = alt.Chart(chart_data.reset_index()).mark_line().encode(
    x=alt.X(x_variable, axis=alt.Axis(title=x_variable)),
    y=alt.Y(y_variable, axis=alt.Axis(title=y_variable))
).interactive()

# Display the chart using Streamlit
st.altair_chart(chart, use_container_width=True)

# Correlation heatmap section
st.write(
    """
#### Heatmap showing the correlation
"""
)

# Pairwise correlations between the columns of df.
corr_matrix = df.corr()

# Explicit figure/axes so the figure can be handed to Streamlit and then closed.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
heatmap = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=heatmap_ax)

# Title, tick-label rotation and layout
heatmap_ax.set_title('Correlation Heatmap')
plt.setp(heatmap_ax.get_xticklabels(), rotation=45)
plt.setp(heatmap_ax.get_yticklabels(), rotation=0)
heatmap_fig.tight_layout()

# Display the heatmap using Streamlit
st.pyplot(heatmap_fig)
# Streamlit re-executes this script on every interaction; without an explicit
# close, each run would leave another open figure registered with pyplot.
plt.close(heatmap_fig)

st.write(
    """
#### Insight on the affect of categorical features against the sales price
"""
)

# Categorical candidates: every object-dtype column of the raw data.
categorical_variable_options = raw_data_train.select_dtypes(include=['object']).columns.tolist()


# Select a categorical variable to compare against SalePrice
categorical_variable = st.selectbox('Select Categorical Variable', categorical_variable_options, key="categorical_variable")

if categorical_variable:
    # Boxen plot of SalePrice for each level of the chosen categorical column.
    boxen_data = raw_data_train[[categorical_variable, 'SalePrice']]
    fig, ax = plt.subplots(figsize=(10, 8))
    # Draw on the axes created above rather than on pyplot's implicit
    # "current axes".
    sns.boxenplot(data=boxen_data, x=categorical_variable, y='SalePrice', ax=ax)
    ax.set_xlabel(categorical_variable)
    ax.set_ylabel('SalePrice')
    ax.set_title('Correlation between Categorical Variable and Sale Price (Boxen Plot)')
    plt.setp(ax.get_xticklabels(), rotation=45)
    st.pyplot(fig)
    # Streamlit re-executes this script on every interaction; close the figure
    # so open figures do not pile up in pyplot.
    plt.close(fig)




Writing app.py


In [None]:
# Start the Streamlit app in the background (trailing &); stdout and stderr
# are both redirected to /content/logs.txt.
!streamlit run app.py &>/content/logs.txt &

Run the cell below to obtain the endpoint IP and the app URL. Copy the IP address from the first line of the output, then click the URL in the output. On the page you are directed to, paste the endpoint IP and submit it. This will allow you to run the application.

In [None]:
# Print the endpoint IP (the address returned by ipv4.icanhazip.com).
!curl ipv4.icanhazip.com
# Open a localtunnel to local port 8501 and print the public URL for it.
!npx localtunnel --port 8501

34.80.130.131
[K[?25hnpx: installed 22 in 1.323s
your url is: https://polite-garlics-count.loca.lt
