In [None]:
# Import libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LassoCV

import joblib

In [2]:
# Data loading

train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

In [3]:
# Separating lable and features

X_train = train_data.drop(columns= 'SalePrice')
y_train = train_data['SalePrice']
X_test = test_data.copy()

In [4]:
# Taking the logarithm of the lable column due to skewness

y_train = np.log1p(train_data['SalePrice'])

In [5]:
# Remove columns with a percentage of missing values ​​greater than 40

missing_values_Percentage = train_data.isnull().mean() * 100
columns_to_drop = missing_values_Percentage[missing_values_Percentage > 40].index
train_data = train_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)
print(f"Dropped columns: {list(columns_to_drop)}")

Dropped columns: ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']


In [6]:
# Select numeric and categorical columns

num_cols = X_train.select_dtypes(include=[np.number]).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

In [7]:
# Numeric pipeline: impute missing values with median

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [8]:
# Categorical pipeline: impute with most frequent value, then one-hot encode

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
])    

In [9]:
# Combine both pipelines into a single preprocessor

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

In [10]:
# Fit on train, transform train and test

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [11]:
# Train LassoCV on preprocessed data

lasso = LassoCV(cv=5, random_state=42, n_jobs=-1)
lasso.fit(X_train_processed, y_train)

# Get mask of important features (non-zero coefficients)

feature_mask = np.abs(lasso.coef_) > 1e-5
selected_features = feature_mask.sum()

print(f"Selected features: {selected_features} / {X_train_processed.shape[1]}")


Selected features: 84 / 245


In [12]:
X_train_selected = X_train_processed[:, feature_mask]
X_test_selected = X_test_processed[:, feature_mask]

In [None]:
# Load preprocessed data

joblib.dump(X_train_selected, '../data/processed/X_train_selected.pkl')
joblib.dump(X_test_selected, '../data/processed/X_test_selected.pkl')
joblib.dump(y_train, '../data/processed/y_train.pkl')

['../data/processed/y_train.pkl']