In [1]:
from loguru import logger
import pandas as pd
import statsmodels.api as sm
from scipy import stats

In [2]:
houses = pd.read_excel("Houses.xlsx")
trad_homes = houses[houses['Traditional'] == 1]

### 1. Linear regression: SalePrice ~ SquareFeet for traditional homes 


H0: slope <= 0, H1: slope > 0, alpha = 0.01
Reject H0 if p-value < 0.01

In [3]:
X = trad_homes['SquareFeet']
X = sm.add_constant(X) 
y = trad_homes['SalePrice']
model = sm.OLS(y, X).fit()
logger.success(f"\n{model.summary()}")

[32m2025-09-06 20:55:19.920[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [32m[1m
                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.639
Model:                            OLS   Adj. R-squared:                  0.639
Method:                 Least Squares   F-statistic:                     1028.
Date:                Sat, 06 Sep 2025   Prob (F-statistic):          1.56e-130
Time:                        20:55:19   Log-Likelihood:                -7106.4
No. Observations:                 582   AIC:                         1.422e+04
Df Residuals:                     580   BIC:                         1.423e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------

### 2. Hypothesis Test on Expected Sale Price for Non-Modern Homes:

##### Test the null hypothesis that the expected price of a nonmodern home with 1500 square feet is less than or equal to $100,000.

In [4]:
X2 = pd.DataFrame({'SquareFeet': [1500]})
X2 = sm.add_constant(X2, has_constant='add')
pred = model.get_prediction(X2)
pred_summary = pred.summary_frame(alpha=0.01)
logger.info(f"\n{pred_summary}")
pred_mean = pred.predicted_mean[0]
pred_se = pred.se_mean[0]
t_stat = (pred_mean - 100000) / pred_se
df = model.df_resid
p_value = 1 - stats.t.cdf(t_stat, df)
logger.info(f'p value = {p_value}')
if p_value < 0.1:
    logger.success('The hypothesis is rejected')
else:
    logger.error('Failed to reject the hypothesis')

[32m2025-09-06 20:55:19.943[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1m
          mean    mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  \
0  82250.36379  2778.1901   75070.597006   89430.130575 -43767.615305   

    obs_ci_upper  
0  208268.342885  [0m
[32m2025-09-06 20:55:19.945[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mp value = 0.9999999998284276[0m
[32m2025-09-06 20:55:19.946[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [31m[1mFailed to reject the hypothesis[0m


### 3. Construct a 95% interval estimate of the expected price of traditional house of 1500 square feet

In [5]:
X_new = pd.DataFrame({'SquareFeet': [1500]})
X_new = sm.add_constant(X_new, has_constant='add')  
prediction = model.get_prediction(X_new)
pred_summary = prediction.summary_frame(alpha=0.05)  
logger.success(f"\n{pred_summary}")

[32m2025-09-06 20:55:19.962[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [32m[1m
          mean    mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  \
0  82250.36379  2778.1901   76793.824768   87706.902813 -13521.830643   

    obs_ci_upper  
0  178022.558223  [0m


### 4. Quadratic Regression for Traditional style homes

In [6]:
trad_homes['SquareFeet2'] = trad_homes['SquareFeet']**2
X = trad_homes[['SquareFeet', 'SquareFeet2']]
X = sm.add_constant(X)  
y = trad_homes['SalePrice']
quad_model = sm.OLS(y, X).fit()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trad_homes['SquareFeet2'] = trad_homes['SquareFeet']**2


#### Test 1: at 1500 sq ft

In [7]:
L_1500 = [0, 1, 2*1500] 
test_1500 = quad_model.t_test((L_1500, 60))  # H0: marginal effect = 60
t_stat_1500 = test_1500.tvalue[0][0]
p_value_1500 = stats.t.cdf(t_stat_1500, quad_model.df_resid)  # one-sided (Ha: < 60)

logger.info(f"At 1500 sq ft: t={t_stat_1500}, p={p_value_1500}")
if p_value_1500 < 0.01:
    logger.success("Reject H0: Marginal effect is significantly less than $60")
else:
    logger.error("Fail to reject H0")

[32m2025-09-06 20:55:19.994[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mAt 1500 sq ft: t=-4.009546354737429, p=3.439825307923216e-05[0m
[32m2025-09-06 20:55:19.995[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [32m[1mReject H0: Marginal effect is significantly less than $60[0m


#### Test 2: at 3000 sq ft

In [8]:
L_3000 = [0, 1, 2*3000]
test_3000 = quad_model.t_test((L_3000, 60))
t_stat_3000 = test_3000.tvalue[0][0]
p_value_3000 = stats.t.cdf(t_stat_3000, quad_model.df_resid)

logger.info(f"At 3000 sq ft: t={t_stat_3000}, p={p_value_3000}")
if p_value_3000 < 0.01:
    logger.success("Reject H0: Marginal effect is significantly less than $60")
else:
    logger.error("Fail to reject H0")

[32m2025-09-06 20:55:20.011[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mAt 3000 sq ft: t=6.158236925290441, p=0.9999999993118289[0m
[32m2025-09-06 20:55:20.013[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [31m[1mFail to reject H0[0m
