In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time 
import warnings; warnings.filterwarnings(action='ignore')

import os

In [2]:
# Read the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/playground-series-s4e9/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e9/test.csv")

# Set the test ids aside for the submission file; pop() also removes the
# column, leaving only feature columns in `test`.
test_id = test.pop('id')

In [3]:
# Number of distinct values in each text (object-dtype) column of the training data.
train.select_dtypes('object').nunique()

brand             57
model           1897
fuel_type          7
engine          1117
transmission      52
ext_col          319
int_col          156
accident           2
clean_title        1
dtype: int64

# ***PREPROCESSING***

In [4]:
import re 

def preprocess(dataframe):
    """Derive numeric engine features and drop the raw text columns.

    Two float64 columns are parsed out of the free-text ``engine`` column:

    * ``horsepower`` -- the number immediately preceding ``HP``
    * ``liter``      -- the number immediately preceding ``L``

    Rows whose ``engine`` value is missing, or does not contain the
    pattern, get NaN. The ``engine`` and ``model`` columns are then removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing at least the ``engine`` and ``model`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``horsepower`` and ``liter`` added and ``engine``
        and ``model`` removed. The input frame is left untouched.

    Raises
    ------
    KeyError
        If ``engine`` or ``model`` is not a column of ``dataframe``.
    """
    engine = dataframe['engine']

    # str.extract returns the first match of the capture group and yields NaN
    # for rows without a match or with a missing value, so no per-row Python
    # lambda (and no second regex search per row) is needed.
    horsepower = engine.str.extract(r'(\d+\.?\d*)HP', expand=False).astype('float64')
    liter = engine.str.extract(r'(\d+\.?\d*)L', expand=False).astype('float64')

    # assign() builds a new frame, so the caller's frame is never mutated.
    return (
        dataframe
        .assign(horsepower=horsepower, liter=liter)
        .drop(['engine', 'model'], axis=1)
    )

# Run the same feature extraction on both tables so their columns stay aligned.
train, test = preprocess(train), preprocess(test)

In [5]:
# Target first, then every remaining column except the row identifier as features.
y = train['price']
X = train.drop(columns=['price', 'id'])

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Split the feature columns by dtype.
# 'number' (rather than 'int64') is needed so that the float64 `horsepower`
# and `liter` columns produced by preprocess() reach the numeric pipeline.
# ColumnTransformer drops every column that is not listed in a transformer
# (default remainder='drop'), so an int64-only selection silently discarded
# those two features -- and left the mean imputer with nothing to impute,
# since int64 columns cannot contain NaN.
numerical = X.select_dtypes(include='number').columns
categorical = X.select_dtypes(include='object').columns

# Numeric columns: fill missing values with the column mean learned on the training data.
num_pipeline = Pipeline(steps=[
    ('mean_impute', SimpleImputer(strategy='mean'))
])

# Categorical columns: fill missing values with the most frequent category, then one-hot encode.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category
# unseen during fit is encoded as all zeros, i.e. the same as the dropped
# first category -- confirm that is acceptable.
cat_pipeline = Pipeline(steps=[
    ('mode_impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

transformer = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical),
    ('cat', cat_pipeline, categorical)])

# Fit on the training features only, then apply the fitted transform to the test features.
# Both names are rebound to the transformed matrices, so this cell must not be re-run
# without re-running the cells that build `X` and `test`.
X = transformer.fit_transform(X)
test = transformer.transform(test)

# ***MODEL SELECTION***

In [7]:
from sklearn.model_selection import KFold, StratifiedKFold

NUM_FOLDS = 10
# `price` is a regression target (it is scored with RMSE), so plain shuffled
# K-fold is used. StratifiedKFold is meant for class labels: applied to a
# price column it treats every distinct price as its own class, which gives
# no meaningful stratification.
# The name `skfolds` is kept because rmse_cv reads this splitter by that name.
skfolds = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

def rmse_cv(model, X, y, cv=None):
    """Cross-validate ``model`` and return its mean per-fold RMSE.

    For each fold the model is refitted on the training rows and scored on
    the held-out rows; the fold's RMSE, fold number and elapsed fit+predict
    time are printed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X : array-like, sparse matrix or pandas.DataFrame
        Feature rows. Rows are always selected by position.
    y : array-like or pandas.Series
        Target values, aligned with ``X`` by position.
    cv : splitter, optional
        Object exposing ``split(X, y)`` that yields ``(train_idx, test_idx)``
        pairs of positional indices. Defaults to the module-level ``skfolds``.

    Returns
    -------
    float
        The arithmetic mean of the per-fold RMSE values.

    Raises
    ------
    ValueError
        If the splitter yields no folds.
    """
    splitter = skfolds if cv is None else cv

    # The splitter yields positional indices. Indexing a pandas Series with
    # `y[idx]` is label-based and only works on a default RangeIndex, so the
    # target is converted to a plain array once and indexed by position.
    y_values = np.asarray(y)

    def take_rows(data, idx):
        # DataFrames need .iloc for positional row selection; ndarrays and
        # sparse matrices support it through plain indexing.
        return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]

    fold_rmses = []
    for fold, (train_idx, test_idx) in enumerate(splitter.split(X, y)):
        tic = time.time()  # Start timer

        model.fit(take_rows(X, train_idx), y_values[train_idx])
        y_pred_fold = model.predict(take_rows(X, test_idx))

        tictoc = time.time() - tic

        rmse = np.sqrt(np.mean((np.asarray(y_pred_fold) - y_values[test_idx]) ** 2))
        print(f"RMSE: {rmse:.5f}. FOLD: {fold+1}. TIME: {tictoc:.2f}.")

        fold_rmses.append(rmse)

    if not fold_rmses:
        raise ValueError("cv produced no folds")

    # Average over the folds that actually ran instead of a global constant,
    # so the result stays correct for any splitter.
    return sum(fold_rmses) / len(fold_rmses)

In [9]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

# Lasso regression with scikit-learn's default settings; this is the
# estimator fitted on the full training data further down.
model = Lasso()
# Cross-validated scoring is currently disabled; the snippet is kept for reference.
# cv_score = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=10)

# rmse = -cv_score.mean()
# rmse

In [10]:
# Train on the full transformed training matrix, then predict on the transformed test matrix.
y_pred = model.fit(X, y).predict(test)

# Pair each prediction with the test id that was set aside at load time.
submission = pd.DataFrame({'id': test_id, 'price': y_pred})

In [11]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)