In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path
import numpy as np
import hvplot.pandas
from sklearn.linear_model import LinearRegression

In [2]:
# Location of the housing dataset, relative to this notebook.
# The segments are joined with pathlib's "/" operator, so the same line works
# on macOS, Linux, and Windows without hand-editing any slashes.
housing_csv = Path("..") / "Resources" / "housing_price_dataset.csv"

In [3]:
# Read the CSV at `housing_csv` into a DataFrame and display it as the cell output
housing_df = pd.read_csv(housing_csv)
housing_df

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065
...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080.865895
49996,2854,2,2,Suburb,1988,374507.656727
49997,2979,5,3,Suburb,1962,384110.555590
49998,2596,5,2,Rural,1984,380512.685957


In [4]:
# Rank the homes by Price, highest to lowest, with a fresh 0..n-1 index.
# The frame is named for the order it actually holds (descending).
# NOTE(review): the low end of this ranking shows negative prices in the
# dataset — confirm whether those rows should be cleaned before modelling.
price_descending = housing_df.sort_values("Price", ascending=False, ignore_index=True)

# Backward-compatible alias: this frame was previously called `price_ascending`,
# even though it is sorted in descending order.
price_ascending = price_descending

price_descending

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2758,5,2,Rural,1967,492195.259972
1,2957,5,1,Suburb,2001,482577.163405
2,2995,5,2,Urban,1954,476671.733263
3,2901,2,3,Urban,1996,470989.679074
4,2998,3,2,Urban,1995,468493.877841
...,...,...,...,...,...,...
49995,1140,4,1,Urban,2020,-23911.003119
49996,1235,3,1,Urban,1952,-24183.000515
49997,1024,2,2,Urban,2006,-24715.242482
49998,1080,5,1,Rural,1955,-28774.998022


In [5]:
# Adapted from class activity 2 (class 20.1).
# Scatter plot of each home's SquareFeet (x-axis) against its Price (y-axis)
scatter_options = {
    "x": "SquareFeet",
    "y": "Price",
    "title": "Home Prices Evaluated by Square Footage",
}
sqfootage__price_plot = housing_df.hvplot.scatter(**scatter_options)
sqfootage__price_plot

## Prepare the Data to Fit the Linear Regression Model

In [6]:
# Build the X set: selecting SquareFeet with a list of column names keeps the
# result two-dimensional, so it converts straight to a single-column array.
X = housing_df[["SquareFeet"]].to_numpy()

# Preview the first five rows of X
X[:5]

array([[2126],
       [2459],
       [1860],
       [2294],
       [2130]])

In [7]:
# Select the dependent variable y: the Price column of housing_df
y = housing_df["Price"]

## Build the Linear Regression Model

In [8]:
# Create a scikit-learn LinearRegression model (fitted in a later step)
model = LinearRegression()

In [9]:
# Fit the model using X (SquareFeet) as the feature and y (Price) as the target
model.fit(X, y)

In [10]:
# Display the slope; `coef_` is printed as an array, here holding the single SquareFeet coefficient
print(f"Model's slope: {model.coef_}")

Model's slope: [99.32210752]


In [11]:
# Display the y-intercept of the fitted line
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 25549.963448873197


In [12]:
# Display the model's best fit line formula (intercept plus slope times X),
# taking the slope as the first element of `coef_`
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 25549.963448873197 + 99.32210752486034X


## Plot the Best Fit Line for the Home Price Prediction Model

In [13]:
# Compute the model's predicted price for every row of the X set
predicted_y_values = model.predict(X)

In [14]:
# Build a new DataFrame holding the original data plus a Prices_Predicted
# column with the model's predictions. `assign` returns a new frame, so
# housing_df itself is left unchanged.
housing_prices_predicted = housing_df.assign(Prices_Predicted=predicted_y_values)

# Preview the first rows
housing_prices_predicted.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price,Prices_Predicted
0,2126,4,1,Rural,1969,215355.283618,236708.764047
1,2459,3,2,Rural,1980,195014.221626,269783.025853
2,1860,2,1,Suburb,1970,306891.012076,210289.083445
3,2294,2,1,Urban,1996,206786.787153,253394.878111
4,2130,5,2,Suburb,2001,272436.239065,237106.052477


In [15]:
# Create a line plot of square footage versus the predicted home prices.
# A line plot joins points in row order, so the rows are sorted by SquareFeet
# first; the line is then traced once from left to right instead of doubling
# back over itself between unsorted points.
best_fit_line = housing_prices_predicted.sort_values("SquareFeet").hvplot.line(
    x="SquareFeet",
    y="Prices_Predicted",
    color="red"
)
best_fit_line

In [16]:
# Overlay the scatter plot of the original data and the best fit line in one figure
sqfootage__price_plot * best_fit_line

## Make Manual Predictions

In [17]:
# Square footage to price by hand. Defined once so the formula, the
# calculation, and the printed message always agree.
manual_square_feet = 2000

# Display the formula to predict the price of a home with that square footage
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {manual_square_feet}")

# Predict the home price manually: intercept + slope * square feet
y_2000 = model.intercept_ + model.coef_[0] * manual_square_feet

# Display the prediction
print(f"Predicted price of a home with {manual_square_feet} square feet: ${y_2000:.2f}")

Model's formula: y = 25549.963448873197 + 99.32210752486034 * 2000
Predicted price of a home with 2000 square feet: $224194.18


## Make Predictions Using the `predict` Function

In [18]:
# Square footages to predict home prices for: 1000 through 3000 in steps of
# 500, arranged as a one-column array (one row per home)
X_square_feet = np.arange(1000, 3001, 500).reshape(-1, 1)

# Display the array
X_square_feet

array([[1000],
       [1500],
       [2000],
       [2500],
       [3000]])

In [19]:
# Predict the home prices for the square footages held in X_square_feet
# (1000, 1500, 2000, 2500, and 3000 square feet)
predicted_home_prices = model.predict(X_square_feet)

In [20]:
# Tabulate each square footage next to its predicted home price.
# `ravel` flattens the one-column array into the 1-D sequence a DataFrame column needs.
prediction_columns = {
    "SquareFeet": X_square_feet.ravel(),
    "predicted_home_prices": predicted_home_prices,
}
df_predicted_home_prices = pd.DataFrame(prediction_columns)

# Display data
df_predicted_home_prices

Unnamed: 0,SquareFeet,predicted_home_prices
0,1000,124872.070974
1,1500,174533.124736
2,2000,224194.178499
3,2500,273855.232261
4,3000,323516.286023


## Assess the Linear Regression Model

In [21]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [22]:
# Evaluate the fitted line against the same X and y it was trained on.
# `score` and `r2` report the same quantity (R²) through two routes: the
# estimator's own method and the sklearn.metrics helper.
score = model.score(X, y)
r2 = r2_score(y_true=y, y_pred=predicted_y_values)
mse = mean_squared_error(y_true=y, y_pred=predicted_y_values)
rmse = np.sqrt(mse)  # square root of the mean squared error
std = np.std(y)  # spread of the actual prices, for comparison with rmse

# Report each metric
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.5635801965977383.
The r2 is 0.5635801965977383.
The mean squared error is 2530128229.453327.
The root mean squared error is 50300.380013011105.
The standard deviation is 76141.08154380998.
