# Housing Price Prediction with Feature Selection, Linear Regression, Ridge Regression and Lasso Regression

#### Import All Required Libraries

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the training data; the path is relative to the current working directory.
train_df = pd.read_csv("Data/train.csv")
# Last expression of the cell: previews the first rows of the frame.
train_df.head()

In [None]:
# Show the column labels, then how many columns there are.
print(train_df.columns)
print(train_df.columns.size)

#### Drop columns in which more than 40% of the values are null

In [None]:
class DropOverFourtyPercentNaNColumns(BaseEstimator, TransformerMixin):
    """Drop columns whose fraction of missing values exceeds a threshold.

    The columns to remove are learned in ``fit`` and removed in ``transform``,
    so the column set chosen on the fitting data is the one applied to any
    data transformed afterwards.

    Parameters
    ----------
    threshold : float, default=0.4
        A column is dropped when its NaN fraction is strictly greater than
        this value.

    Attributes
    ----------
    cols_to_drop_ : list
        Labels of the columns selected for removal; set by ``fit``.
    """

    def __init__(self, threshold=0.4):
        # Store constructor parameters only; learned state (``cols_to_drop_``)
        # is created in ``fit``.
        self.threshold = threshold

    def fit(self, X_df, y=None):
        """Record which columns exceed the missing-value threshold.

        Parameters
        ----------
        X_df : DataFrame or array-like
            Data used to measure the per-column NaN fraction.
        y : ignored
            Accepted so the transformer can be used inside a Pipeline.

        Returns
        -------
        self
        """
        # Wrap the input the same way ``transform`` does so both methods
        # accept the same kinds of input.
        nan_ratio = pd.DataFrame(X_df).isna().mean()
        self.cols_to_drop_ = nan_ratio[nan_ratio > self.threshold].index.tolist()
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame without the columns selected in ``fit``.

        Columns recorded in ``cols_to_drop_`` that are absent from ``X`` are
        skipped silently (``errors="ignore"``).
        """
        return pd.DataFrame(X).drop(columns=self.cols_to_drop_, errors="ignore")

#### Separate the variable that needs to be predicted

In [None]:
# Hold the target column out as Y; every remaining column becomes a feature in X.
Y = train_df["SalePrice"]
X = train_df.drop("SalePrice", axis=1)

X.head()

#### Impute missing values in columns that contain nulls
#### Encode categorical data as numerical data

In [None]:
# Column labels grouped by dtype: int64/float64 columns vs. object columns.
# NOTE(review): neither variable is referenced again in the visible notebook; the
# ColumnTransformer defined later selects columns by dtype with its own selectors.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [None]:
# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then map
# categories to integer codes; categories not seen during fit are encoded as -1.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
])

# Route columns to a branch by dtype; columns matched by neither selector are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, selector(dtype_include=['int64', 'float64'])),
        ('cat', cat_pipeline, selector(dtype_include=['object'])),
    ],
    remainder='drop',
)

#### Model With Filtered Columns

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Steps: drop sparse columns -> preprocess -> keep the 40 features with the highest
# f_regression scores -> ordinary least squares.
model = Pipeline(
    [
        ('drop_over_40Percent_nan', DropOverFourtyPercentNaNColumns(threshold=0.4)),
        # Pipeline fits its steps in place, so use an unfitted copy rather than the
        # shared `preprocessor` object; otherwise every pipeline built from it would
        # share (and overwrite) one fitted state.
        ('preprocessor', clone(preprocessor)),
        ('feature_selection', SelectKBest(score_func=f_regression, k=40)),
        ('regressor', LinearRegression())
    ]
)

In [None]:
# Fit every step of the pipeline on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Score the hold-out predictions: R², MSE, and RMSE (the square root of MSE).
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

In [None]:
# Report the hold-out metrics for the feature-selected linear model.
print("Model Performance:")
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R² Score:", r2)

#### Model With all The Columns

In [None]:
# Same arguments and seed as the split in the previous section, so the partition is the same.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Steps: drop sparse columns -> preprocess -> ordinary least squares on all
# remaining features (no feature-selection step).
model_1 = Pipeline(
    [
        ('drop_over_40Percent_nan', DropOverFourtyPercentNaNColumns(threshold=0.4)),
        # Pipeline fits its steps in place, so use an unfitted copy rather than the
        # shared `preprocessor` object; otherwise every pipeline built from it would
        # share (and overwrite) one fitted state.
        ('preprocessor', clone(preprocessor)),
        ('regressor', LinearRegression())
    ]
)

In [None]:
# Fit every step of the pipeline on the training split.
model_1.fit(X_train_1, y_train_1)

In [None]:
# Predict the target for the held-out rows.
y_pred_1 = model_1.predict(X_test_1)

In [None]:
# Score the hold-out predictions: R², MSE, and RMSE (the square root of MSE).
r2_1 = r2_score(y_true=y_test_1, y_pred=y_pred_1)
mse_1 = mean_squared_error(y_true=y_test_1, y_pred=y_pred_1)
rmse_1 = np.sqrt(mse_1)

In [None]:
# Report the hold-out metrics for the linear model trained on all columns.
print("Model Performance:")
print("Mean Squared Error (MSE):", mse_1)
print("Root Mean Squared Error (RMSE):", rmse_1)
print("R² Score:", r2_1)

#### Model With Ridge Regression

In [None]:
# Same arguments and seed as the splits in the previous sections, so the partition is the same.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Steps: drop sparse columns -> preprocess -> Ridge regression with alpha=100.0.
model_2 = Pipeline(
    [
        ('drop_over_40Percent_nan', DropOverFourtyPercentNaNColumns(threshold=0.4)),
        # Pipeline fits its steps in place, so use an unfitted copy rather than the
        # shared `preprocessor` object; otherwise every pipeline built from it would
        # share (and overwrite) one fitted state.
        ('preprocessor', clone(preprocessor)),
        ('regressor', Ridge(alpha=100.0))
    ]
)

In [None]:
# Fit every step of the pipeline on the training split.
model_2.fit(X_train_2, y_train_2)

In [None]:
# Predict the target for the held-out rows.
y_pred_2 = model_2.predict(X_test_2)

In [None]:
# Score the hold-out predictions: R², MSE, and RMSE (the square root of MSE).
r2_2 = r2_score(y_true=y_test_2, y_pred=y_pred_2)
mse_2 = mean_squared_error(y_true=y_test_2, y_pred=y_pred_2)
rmse_2 = np.sqrt(mse_2)

In [None]:
# Report the hold-out metrics for the Ridge model.
print("Model Performance:")
print("Mean Squared Error (MSE):", mse_2)
print("Root Mean Squared Error (RMSE):", rmse_2)
print("R² Score:", r2_2)

#### Model With Lasso Regression

In [None]:
# Same arguments and seed as the splits in the previous sections, so the partition is the same.
X_train3, X_test3, y_train3, y_test3 = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Steps: drop sparse columns -> preprocess -> Lasso regression (alpha=0.5, at most
# 1000 iterations, fixed random_state).
model_3 = Pipeline(
    [
        ('drop_over_40Percent_nan', DropOverFourtyPercentNaNColumns(threshold=0.4)),
        # Pipeline fits its steps in place, so use an unfitted copy rather than the
        # shared `preprocessor` object; otherwise every pipeline built from it would
        # share (and overwrite) one fitted state.
        ('preprocessor', clone(preprocessor)),
        ('lasso_regressor', Lasso(alpha=0.5, max_iter=1000, random_state=42))
    ]
)

In [None]:
# Fit every step of the pipeline on the training split.
model_3.fit(X_train3, y_train3)

In [None]:
# Predict the target for the held-out rows.
y_pred3 = model_3.predict(X_test3)

In [None]:
# Score the hold-out predictions: R², MSE, and RMSE (the square root of MSE).
r2_3 = r2_score(y_true=y_test3, y_pred=y_pred3)
mse_3 = mean_squared_error(y_true=y_test3, y_pred=y_pred3)
rmse_3 = np.sqrt(mse_3)

In [None]:
# Report the hold-out metrics for the Lasso model.
print("Model Performance:")
print("Mean Squared Error (MSE):", mse_3)
print("Root Mean Squared Error (RMSE):", rmse_3)
print("R² Score:", r2_3)