# Ames Housing Price Prediction
## Final Exam - GSB 544

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.impute import SimpleImputer

## Data Loading

In [2]:
train = pd.read_csv(r"C:\Users\spink\OneDrive\Desktop\Machine Learning\Data\train_new.csv")
test = pd.read_csv(r"C:\Users\spink\OneDrive\Desktop\Machine Learning\Data\test_new.csv")

train.shape, test.shape

((2197, 25), (605, 24))

In [3]:
train.head()

Unnamed: 0,SalePrice,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Roof Style,Heating,Central Air,Electrical,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,159000,531363010,80.0,9605,Pave,SawyerW,1Fam,1Story,7,6,2007,Gable,GasA,Y,SBrkr,1,1,3,6,1218,Typ,0,0,2009,WD
1,271900,906203120,90.0,14684,Pave,SawyerW,1Fam,1Story,7,7,1990,Hip,GasA,Y,SBrkr,2,0,3,7,2196,Typ,0,0,2009,WD
2,137500,916176030,,14375,Pave,Timber,1Fam,SLvl,6,6,1958,Gable,GasA,Y,FuseA,1,0,3,7,1344,Typ,233,0,2009,COD
3,248500,528180130,48.0,6472,Pave,NridgHt,TwnhsE,1Story,9,5,2008,Hip,GasA,Y,SBrkr,2,0,2,6,1456,Typ,0,0,2009,WD
4,167000,528290030,61.0,9734,Pave,Gilbert,1Fam,SLvl,7,5,2004,Gable,GasA,Y,SBrkr,2,1,3,7,1374,Typ,0,0,2009,WD


In [4]:
train['SalePrice'].describe()

count      2197.000000
mean     182376.851161
std       81168.157405
min       13100.000000
25%      130000.000000
50%      163500.000000
75%      215000.000000
max      755000.000000
Name: SalePrice, dtype: float64

## Data Prep

In [5]:
X_train = train.drop(columns=['SalePrice', 'PID'])
y_train = np.log(train['SalePrice'])
X_test = test.drop(columns=['PID'])

In [6]:
num_cols = ['Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built', 
            'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'TotRms AbvGrd', 'Gr Liv Area',
            'Screen Porch', 'Pool Area', 'Yr Sold']
cat_cols = ['Street', 'Neighborhood', 'Bldg Type', 'House Style', 'Roof Style', 
            'Heating', 'Central Air', 'Electrical', 'Functional', 'Sale Type']

num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

In [7]:
def rmse_scorer(estimator, X, y):
    preds = estimator.predict(X)
    return -np.sqrt(np.mean((preds - y)**2))

## Model 1: Ridge

In [8]:
ridge_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', Ridge())
])

param_grid_ridge = {'model__alpha': [0.1, 1, 10, 100]}
grid_ridge = GridSearchCV(ridge_pipe, param_grid_ridge, cv=5, scoring=rmse_scorer)
grid_ridge.fit(X_train, y_train)

grid_ridge.best_params_, -grid_ridge.best_score_



({'model__alpha': 10}, np.float64(0.1488202317913189))

## Model 2: Lasso

In [9]:
lasso_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', Lasso(max_iter=10000))
])

param_grid_lasso = {'model__alpha': [0.0001, 0.001, 0.01, 0.1]}
grid_lasso = GridSearchCV(lasso_pipe, param_grid_lasso, cv=5, scoring=rmse_scorer)
grid_lasso.fit(X_train, y_train)

grid_lasso.best_params_, -grid_lasso.best_score_



({'model__alpha': 0.0001}, np.float64(0.1487657234687381))

## Model 3: ElasticNet

In [10]:
en_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', ElasticNet(max_iter=10000))
])

param_grid_en = {'model__alpha': [0.001, 0.01, 0.1], 'model__l1_ratio': [0.2, 0.5, 0.8]}
grid_en = GridSearchCV(en_pipe, param_grid_en, cv=5, scoring=rmse_scorer)
grid_en.fit(X_train, y_train)

grid_en.best_params_, -grid_en.best_score_



({'model__alpha': 0.001, 'model__l1_ratio': 0.2},
 np.float64(0.14834391962818863))

## Model Comparison

In [11]:
results = pd.DataFrame({
    'Model': ['Ridge', 'Lasso', 'ElasticNet'],
    'CV RMSE': [-grid_ridge.best_score_, -grid_lasso.best_score_, -grid_en.best_score_]
})
results

Unnamed: 0,Model,CV RMSE
0,Ridge,0.14882
1,Lasso,0.148766
2,ElasticNet,0.148344


ElasticNet preformed the best so I will use that for my final predictions.

## Final Predictions

In [12]:
best_model = grid_en

log_preds = best_model.predict(X_test)
preds = np.exp(log_preds)

In [13]:
submission = pd.DataFrame({
    'PID': test['PID'],
    'SalePrice': preds
})

submission.to_csv('submission_regression.csv', index=False)
submission

Unnamed: 0,PID,SalePrice
0,907135180,127052.450255
1,528181040,220595.588185
2,528175010,219889.132676
3,531379030,185807.570642
4,923275090,129045.549606
...,...,...
600,528174060,181826.698880
601,903400180,175747.372553
602,903227150,131658.644025
603,909250070,160545.394327
