### ECON 2350: Regression Project - Economics of Real Estate Pricing
##### Arnav Mehta | 08/19/2025

In [2]:
# Import necessary libraries
import pandas as pd
import statsmodels.api as sm

##### Step 1: Load the dataset "ArlHome5.xlsx"

In [4]:
# Read the workbook into a DataFrame. The path is relative, so "ArlHome5.xlsx"
# must sit in the notebook's working directory.
# Headers are then normalised (surrounding whitespace trimmed, lowercased) so
# later cells can refer to columns such as "price" and "sqft" directly.
df = pd.read_excel("ArlHome5.xlsx").rename(
    columns=lambda header: header.strip().lower()
)

In [5]:
# Quick sanity check: show the first five records of the loaded data
print("First 5 rows of the dataset:")
display(df.head(5))

First 5 rows of the dataset:


Unnamed: 0,price,sqft,beds,baths
0,728000,2399,4,2.5
1,569077,1731,3,1.5
2,831833,2800,4,3.0
3,689000,2200,3,2.5
4,685000,2716,3,3.5


##### Step 2: Exploratory Analysis

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
print("Descriptive Statistics:")
descriptive_stats = df.describe()
display(descriptive_stats)


Descriptive Statistics:


Unnamed: 0,price,sqft,beds,baths
count,36.0,36.0,36.0,36.0
mean,538270.111111,1987.277778,3.222222,2.055556
std,165309.213916,703.280298,0.831904,0.860048
min,247500.0,850.0,1.0,1.0
25%,413625.0,1567.5,3.0,1.5
50%,537375.0,1921.5,3.0,2.0
75%,651100.0,2403.75,4.0,3.0
max,838500.0,3964.0,5.0,3.5


##### Step 3: Define helper function for regression

In [9]:
def run_regression(y, X):
    """Fit an ordinary least squares regression of ``y`` on ``X`` with an intercept.

    This is a small project-level helper built on ``statsmodels.api``: it adds
    a constant column to the regressors and fits ``sm.OLS``.

    Parameters
    ----------
    y : array-like
        Dependent variable handed to ``sm.OLS``.
    X : array-like or DataFrame
        Explanatory variables, without an intercept column.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(...).fit()``.
    """
    design = sm.add_constant(X)  # regressors plus the intercept term
    return sm.OLS(y, design).fit()

##### Step 4: Build out the four models

In [11]:
# Dependent variable shared by all four models: the sale price column
y = df.loc[:, "price"]

###### Model 1: Effect of SqFt on Price

In [44]:
# Model 1: regress price on square footage alone (Price ~ SqFt)
model1 = run_regression(y=y, X=df[["sqft"]])

# Print the heading followed by the full regression table
print("Model 1 Summary", model1.summary(), sep="\n")

Model 1 Summary
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.713
Model:                            OLS   Adj. R-squared:                  0.705
Method:                 Least Squares   F-statistic:                     84.50
Date:                Wed, 20 Aug 2025   Prob (F-statistic):           9.64e-11
Time:                        08:17:15   Log-Likelihood:                -460.66
No. Observations:                  36   AIC:                             925.3
Df Residuals:                      34   BIC:                             928.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.438e+05   4.54e+04    

In [34]:
# Standard error of the regression for Model 1:
# the square root of the residual mean squared error
pow(model1.mse_resid, 0.5)

89842.11675944798

###### Model 2: Effect of SqFt + Beds on Price

In [15]:
# Model 2: add bedroom count to square footage (Price ~ SqFt + Beds)
model2 = run_regression(y=y, X=df[["sqft", "beds"]])

# Print the heading followed by the full regression table
print("Model 2 Summary", model2.summary(), sep="\n")

Model 2 Summary
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.717
Model:                            OLS   Adj. R-squared:                  0.700
Method:                 Least Squares   F-statistic:                     41.81
Date:                Tue, 19 Aug 2025   Prob (F-statistic):           8.99e-10
Time:                        16:12:00   Log-Likelihood:                -460.41
No. Observations:                  36   AIC:                             926.8
Df Residuals:                      33   BIC:                             931.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.162e+05   6.12e+04    

In [36]:
# Standard error of the regression for Model 2:
# the square root of the residual mean squared error
pow(model2.mse_resid, 0.5)

90562.3779213671

###### Model 3: Effect of SqFt + Baths on Price

In [17]:
# Model 3: add bathroom count to square footage (Price ~ SqFt + Baths)
model3 = run_regression(y=y, X=df[["sqft", "baths"]])

# Print the heading followed by the full regression table
print("Model 3 Summary", model3.summary(), sep="\n")

Model 3 Summary
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.783
Model:                            OLS   Adj. R-squared:                  0.770
Method:                 Least Squares   F-statistic:                     59.43
Date:                Tue, 19 Aug 2025   Prob (F-statistic):           1.15e-11
Time:                        16:12:00   Log-Likelihood:                -455.66
No. Observations:                  36   AIC:                             917.3
Df Residuals:                      33   BIC:                             922.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.328e+05   4.03e+04    

In [38]:
# Standard error of the regression for Model 3:
# the square root of the residual mean squared error
pow(model3.mse_resid, 0.5)

79359.43161736199

###### Model 4: Effect of SqFt + Beds + Baths on Price

In [19]:
# Model 4: full specification (Price ~ SqFt + Beds + Baths)
model4 = run_regression(y=y, X=df[["sqft", "beds", "baths"]])

# Print the heading followed by the full regression table
print("Model 4 Summary", model4.summary(), sep="\n")

Model 4 Summary
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.785
Model:                            OLS   Adj. R-squared:                  0.765
Method:                 Least Squares   F-statistic:                     38.97
Date:                Tue, 19 Aug 2025   Prob (F-statistic):           8.54e-11
Time:                        16:12:00   Log-Likelihood:                -455.46
No. Observations:                  36   AIC:                             918.9
Df Residuals:                      32   BIC:                             925.3
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.115e+05   5.42e+04    

In [42]:
# Standard error of the regression for Model 4:
# the square root of the residual mean squared error
pow(model4.mse_resid, 0.5)

80147.13928829394

##### Step 6: Predictions for 2000 sq ft, 3 bed, 2 bath 

In [21]:
# Characteristics of the hypothetical house to be priced
new_house = pd.DataFrame({"sqft": [2000], "beds": [3], "baths": [2]})

# Build the design rows for each model. has_constant="add" forces the intercept
# column to be added: in a one-row frame every column is constant, so the
# default setting would skip adding it.
X1 = sm.add_constant(new_house[["sqft"]], has_constant="add")  # Model 1: sqft only
X4 = sm.add_constant(
    new_house[["sqft", "beds", "baths"]], has_constant="add"
)  # Model 4: sqft + beds + baths

# Point predictions from the two fitted models
pred1 = model1.predict(X1)
pred4 = model4.predict(X4)

print("\nPredicted Price (Model 1):", round(pred1[0], 2))
print("Predicted Price (Model 4):", round(pred4[0], 2))



Predicted Price (Model 1): 540795.32
Predicted Price (Model 4): 532035.81
