# General

## Import Packages

In [None]:
#--Basics---------------
import pandas as pd
import numpy as np
#--Data Visualization----
import matplotlib.pyplot as plt
import seaborn as sns

#---Scikit-Learn--------
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.utils import estimator_html_repr

#---Misc------------------
import expectexception
from itertools import combinations

## Define Data

In [32]:
# Read ames.csv and remove five columns from the resulting frame in a single
# chained expression.
columns_dropped = ['Alley', 'Fireplace.Qu', 'Pool.QC', 'Fence', 'Misc.Feature']
data = pd.read_csv(r"ames.csv").drop(columns=columns_dropped)
# Columns that the cells below median-impute and pass through unencoded.
columns_numeric = [
    'Lot.Frontage',
    'Mas.Vnr.Area',
    'BsmtFin.SF.1',
    'BsmtFin.SF.2',
    'Bsmt.Unf.SF',
    'Total.Bsmt.SF',
    'Bsmt.Full.Bath',
    'Bsmt.Half.Bath',
    'Garage.Yr.Blt',
    'Garage.Cars',
    'Garage.Area',
]

# Columns that the cells below mode-impute and one-hot encode.
columns_categorical = [
    'Mas.Vnr.Type',
    'Bsmt.Qual',
    'Bsmt.Cond',
    'Bsmt.Exposure',
    'BsmtFin.Type.1',
    'BsmtFin.Type.2',
    'Electrical',
    'Garage.Type',
    'Garage.Finish',
    'Garage.Qual',
    'Garage.Cond',
]


## Handle Missing Values

In [36]:
# Impute missing values, writing the results back into `data`, then split the
# frame into features and target.
#
# The filled columns are assigned back explicitly instead of calling
# `data[col].fillna(..., inplace=True)`: that chained form acts on an
# intermediate object, raises a FutureWarning on recent pandas 2.x releases,
# and leaves `data` unchanged once Copy-on-Write is enabled.

# Numeric columns: replace NaN with the median of the respective column.
numeric_medians = data[columns_numeric].median()
data[columns_numeric] = data[columns_numeric].fillna(numeric_medians)

# Categorical columns: replace NaN with the mode of the respective column.
# `.mode()` lists each column's modes in sorted order, so row 0 holds the
# first mode of every column (same choice as `Series.mode()[0]`).
categorical_modes = data[columns_categorical].mode().iloc[0]
data[columns_categorical] = data[columns_categorical].fillna(categorical_modes)

# Every column except 'price' is a feature; 'price' is the target.
X, y = data.drop(columns='price'), data['price']

## Encoding

In [37]:
# One-hot encode the categorical columns. The encoder is configured to return
# a dense result wrapped in a pandas DataFrame.
encoder = OneHotEncoder(sparse_output=False)
encoder.set_output(transform="pandas")
X_ohe_cat = encoder.fit_transform(X[columns_categorical])

# Place the encoded columns first, followed by the numeric columns as-is.
X_numeric = X[columns_numeric]
X_ohe = pd.concat([X_ohe_cat, X_numeric], axis=1)

# Write the combined design matrix to disk.
X_ohe.to_csv("df.csv")


## Forward Selection

In [62]:
# Forward sequential feature selection scored with a linear regression.
#
# `n_features_to_select="auto"` is passed explicitly because `tol` is only
# honoured in that mode. On scikit-learn releases where "auto" is not yet the
# default, omitting it makes the selector ignore `tol` and keep half of the
# features instead.
estimator = LinearRegression()
sfs_forward = SequentialFeatureSelector(
    estimator=estimator,
    n_features_to_select="auto",
    direction="forward",
    tol=.001,  # stop once an added feature improves the score by less than this
).set_output(transform="pandas")
M_forward = sfs_forward.fit_transform(X_ohe, y)
print(M_forward.columns)

Index(['Bsmt.Qual_Ex', 'Bsmt.Qual_Gd', 'Bsmt.Exposure_Gd',
       'BsmtFin.Type.1_GLQ', 'Garage.Type_Attchd', 'Garage.Type_BuiltIn',
       'Garage.Finish_Fin', 'Garage.Qual_Gd', 'Mas.Vnr.Area', 'Total.Bsmt.SF',
       'Garage.Cars', 'Garage.Area'],
      dtype='object')


## Backward Selection

In [63]:
# Backward sequential feature selection, reusing the `estimator` created in
# the forward-selection cell.
#
# `n_features_to_select="auto"` is passed explicitly because `tol` is only
# honoured in that mode. On scikit-learn releases where "auto" is not yet the
# default, omitting it makes the selector ignore `tol` and keep half of the
# features instead.
sfs_backward = SequentialFeatureSelector(
    estimator=estimator,
    n_features_to_select="auto",
    direction="backward",
    tol=-.001,  # negative: keep removing features while the score drops by no more than .001
).set_output(transform="pandas")
M_backward = sfs_backward.fit_transform(X_ohe, y)
print(M_backward.columns)

Index(['Bsmt.Qual_Ex', 'Bsmt.Qual_Gd', 'Bsmt.Exposure_Gd',
       'Garage.Type_Attchd', 'Garage.Type_BuiltIn', 'Garage.Finish_RFn',
       'Garage.Finish_Unf', 'Garage.Qual_Gd', 'Mas.Vnr.Area', 'Total.Bsmt.SF',
       'Garage.Cars', 'Garage.Area'],
      dtype='object')
