# Get Data


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Show DataFrames in full: no cap on the number of columns or rows, no fixed
# display width, and no wrapping of wide frames across several blocks.
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.expand_frame_repr = False
pd.options.display.max_rows = None

In [None]:
# Load the training and test sets from CSV files under the relative data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [None]:
# Display the (rows, columns) shape of both frames side by side.
train.shape, test.shape

In [None]:
# Preview the first five rows of the training set.
train.head(5)

In [None]:
# Preview the first five rows of the test set.
test.head(5)

# Cleaning


Some columns contain NaN values, so we first get an overview of which columns are affected


In [None]:
# Per-column NaN counts for train, keeping only the columns that have at least one NaN.
train.isna().sum().loc[lambda nan_counts: nan_counts > 0]

## Remove Duplicates


In [None]:
# Keep only the first occurrence of each fully identical row in train.
train = train.drop_duplicates()

## MSSubClass

This is stored as a number, but behaves as a category


In [None]:
# Store MSSubClass as object dtype so that the dtype-based column selection
# further down treats it as a categorical column rather than a number.
train = train.astype({'MSSubClass': 'object'})
test = test.astype({'MSSubClass': 'object'})

## LotFrontage

The NaN values here mostly mean incomplete data, so they will be filled with the median of the training set (the median is used because this is continuous data)


In [None]:
# Impute missing LotFrontage values. The median is taken from train only and
# reused for test, so no statistic is computed from the test set.
LotFrontage_median = train['LotFrontage'].median()
train['LotFrontage'] = train['LotFrontage'].fillna(LotFrontage_median)
test['LotFrontage'] = test['LotFrontage'].fillna(LotFrontage_median)

## MasVnrType

NaN here means that it does not exist, so MasVnrArea should be 0


In [None]:
# Wherever MasVnrType is missing, force MasVnrArea to 0 (other rows are untouched).
train['MasVnrArea'] = train['MasVnrArea'].mask(train['MasVnrType'].isna(), 0)
test['MasVnrArea'] = test['MasVnrArea'].mask(test['MasVnrType'].isna(), 0)

## Electrical

NaN here likely means that it was not documented, so it will be set to the most common value


In [None]:
# Impute missing Electrical values with the most frequent value in train
# (the first entry of mode() when several values tie); test reuses the same value.
most_common_electrical = train['Electrical'].mode().iloc[0]
train['Electrical'] = train['Electrical'].fillna(most_common_electrical)
test['Electrical'] = test['Electrical'].fillna(most_common_electrical)

## GarageYrBlt

When a house does not have a garage, the year for it is NaN. Giving all of these values some year value would be wrong, so I decided to put these years into bins, where the NaNs will all go inside the same bin


In [None]:
# Bin GarageYrBlt into left-closed intervals [lower, upper) and keep missing
# years in their own 'None' category.
#
# The outer edges are open-ended so that every real year lands in a labelled
# bin. Previously the first bin [0, 1900) was itself labelled 'None', which
# lumped any garage built before 1900 together with "no garage", and any year
# >= 2020 fell outside all bins and silently became NaN.
bins = [float('-inf'), 1900, 1920, 1940, 1960, 1980, 2000, 2020, float('inf')]
labels = ['Pre-1900', '1900-1919', '1920-1939', '1940-1959', '1960-1979', '1980-1999', '2000-2019', '2020+']

# pd.cut leaves NaN inputs as NaN; only those rows are mapped to 'None'.
train['GarageYrBltInt'] = (
    pd.cut(train['GarageYrBlt'], bins=bins, labels=labels, right=False)
    .astype('object')
    .fillna('None')
)
test['GarageYrBltInt'] = (
    pd.cut(test['GarageYrBlt'], bins=bins, labels=labels, right=False)
    .astype('object')
    .fillna('None')
)

# The raw year is replaced by its binned version.
train.drop(columns=['GarageYrBlt'], inplace=True)
test.drop(columns=['GarageYrBlt'], inplace=True)


## Other Columns

In the remaining columns, NaN actually represents a valid category, so I will change it to 'None'


In [None]:
# Replace NaN with the explicit category 'None', but only in categorical
# (object-dtype) columns. A frame-wide fillna('None') would also write the
# string 'None' into any numeric column that still has NaN, turning it into a
# mixed object column that breaks later numeric steps such as .corr().
_none_fill_cols = train.select_dtypes(include=['object']).columns
train[_none_fill_cols] = train[_none_fill_cols].fillna('None')

# The column list comes from train; restrict it to columns that test also has.
_none_fill_cols_test = _none_fill_cols.intersection(test.columns)
test[_none_fill_cols_test] = test[_none_fill_cols_test].fillna('None')
# NOTE(review): numeric NaNs (if any remain, e.g. in test) are intentionally left
# untouched here and need an explicit numeric imputation — confirm against the data.

## Check cleaned


In [None]:
# Columns of train that still contain NaN after cleaning, with their NaN counts.
train.isna().sum().loc[lambda nan_counts: nan_counts > 0]

In [None]:
# Preview the first ten rows of the training set.
train.head(10)

# Feature Engineering


In [None]:
def check_columns(df):
    """Print how many columns of ``df`` are categorical vs. numerical.

    Columns with ``object`` dtype count as categorical; every other dtype
    counts as numerical. For each group the count is printed, followed by
    the column index itself. Nothing is returned.
    """
    categorical = df.select_dtypes(include=['object']).columns
    numerical = df.select_dtypes(exclude=['object']).columns

    # One print call per group: header line, then the column index on its own line.
    print(f"Categorical columns: {len(categorical)}", categorical, sep="\n")
    print(f"Numerical columns: {len(numerical)}", numerical, sep="\n")

In [None]:
# Print the categorical/numerical column breakdown of the training set.
check_columns(train)

In [None]:
# Names of the object-dtype (categorical) columns of train; the encoding cells iterate over these.
cat_cols = train.select_dtypes(include='object').columns

## Category Encoding


In [None]:
from sklearn.preprocessing import LabelEncoder

# Independent snapshot of train taken before any encoding is applied, so the
# original category values remain available as a reference frame.
train_ref = train.copy(deep=True)

### Definitions


In [None]:
def one_hot_encoding(df, cat_cols):
    """One-hot encode ``cat_cols`` and return the resulting frame.

    Delegates to ``pd.get_dummies`` with ``drop_first=True``, so the first
    level of every encoded column is dropped. The result of
    ``pd.get_dummies`` is returned directly.
    """
    return pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [None]:
def label_encoding(df, cat_cols):
    """Replace each column in ``cat_cols`` with integer labels, in place.

    Every column is fitted independently with ``LabelEncoder.fit_transform``,
    so the codes are specific to the values present in ``df`` at call time.
    The same (mutated) ``df`` is returned.
    """
    for column in cat_cols:
        # A fresh fit per column: no fitted state is carried between columns.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

In [None]:
def target_encoding(df, cat_cols, target_col, df_ref=None):
    """Replace each categorical column with the mean target of its category.

    Args:
        df: Frame to encode. Modified in place.
        cat_cols: Names of the columns to encode.
        target_col: Name of the target column in ``df_ref``.
        df_ref: Frame the per-category means are computed from. Defaults to
            ``df`` itself; pass the untouched training frame when encoding
            data that has no target column.

    Returns:
        The same ``df`` object, with ``cat_cols`` replaced by floats.

    Categories that do not occur in ``df_ref`` have no per-category mean;
    they fall back to the overall mean of ``df_ref[target_col]`` instead of
    becoming NaN.
    """
    if df_ref is None:
        df_ref = df

    fallback = df_ref[target_col].mean()
    for col in cat_cols:
        category_means = df_ref.groupby(col)[target_col].mean()
        # map() yields NaN for categories missing from df_ref; fill those with the fallback.
        df[col] = df[col].map(category_means).fillna(fallback)
    return df

In [None]:
def frequency_encoding(df, cat_cols, df_ref=None, def_ref=None):
    """Replace each categorical column with the relative frequency of its category.

    Args:
        df: Frame to encode. Modified in place.
        cat_cols: Names of the columns to encode.
        df_ref: Frame the frequencies are computed from. Defaults to ``df``.
        def_ref: Deprecated misspelling of ``df_ref``, still accepted as a
            keyword so existing calls keep working. Ignored when ``df_ref``
            is given.

    Returns:
        The same ``df`` object, with ``cat_cols`` replaced by floats in [0, 1].

    The original signature named the parameter ``def_ref`` while the body
    read ``df_ref``, so every call raised ``UnboundLocalError``. Values that
    do not occur in the reference frame get a frequency of 0.0 rather than
    NaN.
    """
    if df_ref is None:
        df_ref = df if def_ref is None else def_ref

    for col in cat_cols:
        freq = df_ref[col].value_counts(normalize=True)
        # map() yields NaN for values absent from the reference; their frequency is 0.
        df[col] = df[col].map(freq).fillna(0.0)
    return df

### Encoding

In [None]:
# Target-encode the categorical columns of train against its own SalePrice.
# target_encoding mutates and returns the frame it was given.
train = target_encoding(train, cat_cols=cat_cols, target_col='SalePrice')
train.head(10)

In [None]:
# Target-encode test using the per-category SalePrice means from train_ref,
# the pre-encoding snapshot of train.
test = target_encoding(test, cat_cols=cat_cols, target_col='SalePrice', df_ref=train_ref)
test.head(10)

In [None]:
# Print the categorical/numerical column breakdown of train again, after the encoding cells.
check_columns(train)

# Feature Selection


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
def correlation_filter(df, target_col, threshold=0.8):
    """Pick one column to drop from every pair of highly correlated features.

    The absolute pairwise correlation of all columns except ``target_col``
    is computed, and each pair above ``threshold`` is printed. From every
    such pair, the column whose absolute correlation with the target is not
    strictly greater than its partner's is marked for removal.

    Returns:
        List of column names to drop (built from a set, so without
        duplicates). ``df`` itself is not modified.
    """
    feature_corr = df.drop(columns=[target_col]).corr().abs()
    names = feature_corr.columns

    # Walk the strict lower triangle so each unordered pair is seen exactly once.
    flagged = [
        (names[row], names[col], feature_corr.iloc[row, col])
        for row in range(len(names))
        for col in range(row)
        if feature_corr.iloc[row, col] > threshold
    ]

    print("Highly correlated pairs:")
    for first, second, strength in flagged:
        print(f"{first} and {second}: {strength}")

    discarded = set()
    for first, second, _ in flagged:
        first_vs_target = abs(df[first].corr(df[target_col]))
        second_vs_target = abs(df[second].corr(df[target_col]))
        # Keep the member that tracks the target more closely; ties drop `first`.
        discarded.add(second if first_vs_target > second_vs_target else first)

    to_drop = list(discarded)

    print(f"Columns to drop: {to_drop}")

    return to_drop

In [None]:
def rfe(df, target_col, n_features_to_select=10):
    """Select features by recursive feature elimination with a linear model.

    Args:
        df: Frame holding the feature columns and the target column.
        target_col: Name of the target column; all other columns are
            treated as candidate features.
        n_features_to_select: Number of features to keep.

    Returns:
        List with the names of the selected feature columns. The list is
        also printed.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]

    # LinearRegression takes neither an iteration count nor a random seed;
    # passing `iter=` / `random_state=` raises TypeError at construction.
    estimator = LinearRegression()
    # Named `selector` so the local does not shadow this function's own name.
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector.fit(features, target)

    selected_features = features.columns[selector.support_].tolist()

    print(f"Selected features: {selected_features}")

    return selected_features

In [None]:
# Columns to remove because they are highly correlated with another feature,
# and the ten features picked by recursive feature elimination.
to_drop = correlation_filter(train, target_col='SalePrice', threshold=0.8)
to_stay = rfe(train, target_col='SalePrice', n_features_to_select=10)

In [None]:
# Remove the columns flagged by the correlation filter from both frames.
train = train.drop(to_drop, axis='columns')
test = test.drop(to_drop, axis='columns')