In [47]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    BaggingRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


In [48]:
# Read the labelled training split and the unlabelled test split, in that order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("05_data/train_data.csv", "05_data/test_data.csv")
)

In [49]:
# Preview the first five training rows to sanity-check the parsed columns.
df_train.head(n=5)

Unnamed: 0,ID,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,Footage_to_Lot_Ratio,Total_Rooms,Age_of_House,Garage_to_Footage_Ratio,Avg_Room_Size,Price,House_Orientation_Angle,Street_Alignment_Offset,Solar_Exposure_Index,Magnetic_Field_Strength,Vibration_Level
0,1,2028,2,3,1967,1.78479,2,2,1136.268444,5,58,0.000986,405.6,11184.929934,16.722149,298.409571,235.502857,227.621575,129.770822
1,2,3519,5,3,1966,4.009947,0,10,877.567605,8,59,0.0,439.875,13941.315383,340.115663,43.878994,300.292055,46.684432,211.676987
2,3,4507,2,3,2014,4.122337,0,7,1093.311933,5,11,0.0,901.4,19686.885572,219.823215,24.542031,186.851621,10.837394,316.769266
3,4,3371,4,2,2000,1.580318,0,1,2133.114532,6,25,0.0,561.833333,20964.530841,10.361763,147.970249,107.843644,175.620355,244.463978
4,5,2871,5,1,1974,3.426914,2,6,837.78009,6,51,0.000697,478.5,12180.466278,329.344524,46.114469,357.571806,335.719756,135.850744


In [50]:
# Schema check: info() prints dtypes and non-null counts to stdout.
df_train.info()
# Summary statistics for every column; as the cell's last expression this is
# the value rendered below the printed schema.
df_train.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       800 non-null    int64  
 1   Square_Footage           800 non-null    int64  
 2   Num_Bedrooms             800 non-null    int64  
 3   Num_Bathrooms            800 non-null    int64  
 4   Year_Built               800 non-null    int64  
 5   Lot_Size                 800 non-null    float64
 6   Garage_Size              800 non-null    int64  
 7   Neighborhood_Quality     800 non-null    int64  
 8   Footage_to_Lot_Ratio     800 non-null    float64
 9   Total_Rooms              800 non-null    int64  
 10  Age_of_House             800 non-null    int64  
 11  Garage_to_Footage_Ratio  800 non-null    float64
 12  Avg_Room_Size            800 non-null    float64
 13  Price                    800 non-null    float64
 14  House_Orientation_Angle  8

Unnamed: 0,ID,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,Footage_to_Lot_Ratio,Total_Rooms,Age_of_House,Garage_to_Footage_Ratio,Avg_Room_Size,Price,House_Orientation_Angle,Street_Alignment_Offset,Solar_Exposure_Index,Magnetic_Field_Strength,Vibration_Level
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,400.5,2813.9,2.96875,1.97125,1986.9025,2.772345,1.015,5.59625,1386.217438,4.94,38.0975,0.00048,663.178961,15255.03564,178.313854,185.194401,182.648299,175.105709,175.631046
std,231.0844,1252.769853,1.421372,0.816245,20.809536,1.283141,0.820437,2.91528,1248.816283,1.671045,20.809536,0.00057,443.41883,8567.717106,105.401913,105.105681,104.217502,103.351843,103.057842
min,1.0,503.0,1.0,1.0,1950.0,0.514134,0.0,1.0,126.641221,2.0,3.0,0.0,63.125,2033.91717,1.667528,1.158575,0.530576,0.717169,0.011059
25%,200.75,1746.75,2.0,1.0,1969.0,1.689746,0.0,3.0,628.038058,4.0,20.0,0.0,343.485714,8966.532577,86.399887,92.250029,99.078754,86.950004,85.921421
50%,400.5,2855.5,3.0,2.0,1987.0,2.808737,1.0,6.0,995.099133,5.0,38.0,0.000383,574.1,14259.530183,180.956901,190.076148,181.800427,174.241789,173.498254
75%,600.25,3858.75,4.0,3.0,2005.0,3.87673,2.0,8.0,1638.312977,6.0,56.0,0.000653,827.6,19426.249787,269.526559,275.735429,274.256675,263.691198,262.882291
max,800.0,4999.0,5.0,3.0,2022.0,4.989303,2.0,10.0,9036.553315,8.0,75.0,0.00361,2477.5,57187.683656,359.898362,359.788941,358.357506,359.60134,358.464403


In [51]:
# Engineered answer columns for subtasks 1-4, written onto df_test.
# Denominators have their zeros replaced by NaN before dividing, so rows with a
# zero denominator yield NaN.

# Subtask 1: sum of the footage, garage-size and lot-size columns.
df_test["Total_Area"] = (
    df_test["Square_Footage"].add(df_test["Garage_Size"]).add(df_test["Lot_Size"])
)

# Subtask 2: garage size per room.
rooms_or_nan = df_test["Total_Rooms"].replace(0, np.nan)
df_test["Garage_to_Room_Ratio"] = df_test["Garage_Size"].div(rooms_or_nan)

# Subtask 3: (solar exposure - vibration) relative to magnetic field strength.
magnetic_or_nan = df_test["Magnetic_Field_Strength"].replace(0, np.nan)
exposure_minus_vibration = df_test["Solar_Exposure_Index"].sub(
    df_test["Vibration_Level"]
)
df_test["Env_Stability_Index"] = exposure_minus_vibration.div(magnetic_or_nan)

# Subtask 4: absolute distance of each test house's footage from the mean
# footage of the *training* set.
mean_square_footage = df_train["Square_Footage"].mean()
df_test["Abs_Diff_Square_Footage"] = (
    df_test["Square_Footage"].sub(mean_square_footage).abs()
)

In [None]:
# Model selection: fit several regressors behind one shared preprocessing step,
# compare them on a hold-out split by mean absolute error (MAE), then refit the
# winner on all training rows and predict prices for the test set.

TARGET = "Price"
ID_COL = "ID"
# Every training column except the target and the row identifier is a predictor.
numeric_features = [c for c in df_train.columns if c not in (TARGET, ID_COL)]

numeric_preprocess = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)
# Only `numeric_features` are listed; ColumnTransformer's default
# remainder="drop" discards everything else (ID here, and any extra columns
# present on df_test when predicting).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_preprocess, numeric_features),
    ]
)

X = df_train.drop(columns=[TARGET])
y = df_train[TARGET]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(random_state=42),
    "Lasso": Lasso(random_state=42),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "SVR": SVR(),
    "Bagging": BaggingRegressor(random_state=42),
    "HistGradientBoosting": HistGradientBoostingRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}

# (name, train MAE, validation MAE) per candidate model.
results = []
for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)
    y_pred_train = pipe.predict(X_train)
    y_pred_val = pipe.predict(X_val)
    # The metric is mean absolute error, so it is named and labelled as MAE.
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mae_val = mean_absolute_error(y_val, y_pred_val)
    print(f"{name} – Train MAE: {mae_train:.3f}, Val MAE: {mae_val:.3f}")
    results.append((name, mae_train, mae_val))

# Lowest validation MAE wins.
best_name, _, _ = min(results, key=lambda x: x[2])
print(f"Best model on validation: {best_name}")
best_model = models[best_name]

# Refit the winning estimator on all training rows before predicting the test set.
pipeline_best = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_model)])
pipeline_best.fit(X, y)
price_predictions = pipeline_best.predict(df_test)

LinearRegression – Train MSE: 257.015, Val MSE: 253.748
Ridge – Train MSE: 257.980, Val MSE: 254.975
Lasso – Train MSE: 256.946, Val MSE: 254.379
DecisionTree – Train MSE: 0.000, Val MSE: 941.478
RandomForest – Train MSE: 218.081, Val MSE: 578.707
SVR – Train MSE: 6445.700, Val MSE: 6233.451
Bagging – Train MSE: 262.253, Val MSE: 718.191
HistGradientBoosting – Train MSE: 183.357, Val MSE: 340.659
GradientBoosting – Train MSE: 173.222, Val MSE: 323.497
Best model on validation: LinearRegression


In [53]:
# Build the long-format submission: one row per (subtask, test datapoint).
#
# `pipeline_best` was already fitted on the full training set in the modelling
# cell, so it is only used for prediction here (no redundant refit).
price_predictions = pipeline_best.predict(df_test)

# One answer column per subtask, listed in subtask order 1..5.
answer_columns = [
    df_test["Total_Area"],
    df_test["Garage_to_Room_Ratio"],
    df_test["Env_Stability_Index"],
    df_test["Abs_Diff_Square_Footage"],
    price_predictions,
]
n_subtasks = len(answer_columns)

# Everything is aligned by position rather than by index label, so this does
# not rely on df_test having a default RangeIndex. Taking the IDs straight from
# the column keeps their integer dtype; iterating with DataFrame.iterrows()
# would upcast each mixed int/float row to float and write IDs like "801.0".
# column_stack(...).ravel() yields datapoint-major order: the five subtask
# answers of the first datapoint, then those of the second, and so on.
submission = pd.DataFrame(
    {
        "subtaskID": np.tile(np.arange(1, n_subtasks + 1), len(df_test)),
        "datapointID": np.repeat(df_test[ID_COL].to_numpy(), n_subtasks),
        "answer": np.column_stack(answer_columns).ravel(),
    }
)
submission.to_csv("05_data/predictions.csv", index=False)