### Skeleton of Preprocessing pipeline 

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [7]:
train = pd.read_csv("train2.csv")
test = pd.read_csv("test2.csv")
y = train["SalePrice"]
X = train.drop("SalePrice", axis=1)

y_log = np.log1p(y)

num_feats = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_feats = X.select_dtypes(include=["object"]).columns.tolist()

### Preprocessors


In [8]:
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, num_feats),
    ("cat", cat_transformer, cat_feats)
])

## Ridge Regression

In [9]:
pipe_ridge = Pipeline(steps=[
    ("preproc", preprocessor),
    ("ridge", Ridge())
])

In [10]:
param_grid = {"ridge__alpha": [0.1, 1.0, 10.0, 50.0, 100.0, 200.0, 500.0]}
grid_ridge = GridSearchCV(pipe_ridge, param_grid, cv=5, scoring="neg_root_mean_squared_error", verbose=1)
grid_ridge.fit(X, y_log)  
print("Best ridge alpha:", grid_ridge.best_params_)
print("Best CV RMSE:", -grid_ridge.best_score_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best ridge alpha: {'ridge__alpha': 10.0}
Best CV RMSE: 0.13987861352192976


### Prediction

In [11]:
best_ridge = grid_ridge.best_estimator_
best_ridge.fit(X, y_log)

X_test = test.copy()
y_test_log_pred = best_ridge.predict(X_test)

y_test_pred = np.expm1(y_test_log_pred) # converting to the oginal scale

submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": y_test_pred
})
submission.to_csv("ridge_submission.csv", index=False)

## Lasso Regression

In [12]:
pipe_lasso = Pipeline(steps=[
    ("preproc", preprocessor),
    ("lasso", Lasso(max_iter=10000))
])

In [13]:
param_grid = {"lasso__alpha": [0.0001, 0.001, 0.01, 0.1, 1.0]}
grid_lasso = GridSearchCV(pipe_lasso, param_grid, cv=5, scoring="neg_root_mean_squared_error", verbose=1)
grid_lasso.fit(X, y_log)

print("Best lasso alpha:", grid_lasso.best_params_)
print("Best CV RMSE:", -grid_lasso.best_score_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best lasso alpha: {'lasso__alpha': 0.001}
Best CV RMSE: 0.13728117657977687


### Results

In [14]:
best_lasso = grid_lasso.best_estimator_
best_lasso.fit(X, y_log)
y_test_log_pred = best_lasso.predict(X_test)
y_test_pred = np.expm1(y_test_log_pred)

submission = pd.DataFrame({"Id": test["Id"], "SalePrice": y_test_pred})
submission.to_csv("lasso_submission.csv", index=False)

## Elastic Net

In [15]:
pipe_enet = Pipeline(steps=[
    ("preproc", preprocessor),
    ("enet", ElasticNet(max_iter=10000))
])

In [16]:
param_grid = {
    "enet__alpha": [0.0001, 0.001, 0.01, 0.1, 1.0],
    "enet__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9]
}

grid_enet = GridSearchCV(pipe_enet, param_grid, cv=5, scoring="neg_root_mean_squared_error", verbose=1)
grid_enet.fit(X, y_log)

print("Best enet params:", grid_enet.best_params_)
print("Best CV RMSE:", -grid_enet.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best enet params: {'enet__alpha': 0.001, 'enet__l1_ratio': 0.5}
Best CV RMSE: 0.13600570135674078


In [17]:
best_enet = grid_enet.best_estimator_
best_enet.fit(X, y_log)
y_test_log_pred = best_enet.predict(X_test)
y_test_pred = np.expm1(y_test_log_pred)

submission = pd.DataFrame({"Id": test["Id"], "SalePrice": y_test_pred})
submission.to_csv("enet_submission.csv", index=False)
