In [None]:
import pandas as pd
import numpy as np
import json
import gzip
import pickle
import os

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error

In [None]:
def _read_and_prepare(zip_path):
    """Read one zipped CSV split and derive the modelling columns.

    The raw frame is read from ``zip_path``, an ``Age`` column is added as
    ``2021 - Year``, and the ``Year`` and ``Car_Name`` columns are removed.
    A new DataFrame is returned.
    """
    raw_frame = pd.read_csv(zip_path, index_col=False, compression="zip")
    return (
        raw_frame
        .assign(Age=lambda frame: 2021 - frame["Year"])
        .drop(columns=["Year", "Car_Name"])
    )


train_data = _read_and_prepare("../files/input/train_data.csv.zip")
test_data = _read_and_prepare("../files/input/test_data.csv.zip")

# Preview the first rows of the prepared training split.
train_data.head()

In [None]:
# Feature matrices: every column except the regression target.
x_train = train_data.drop(columns="Present_Price")
x_test = test_data.drop(columns="Present_Price")

# Target vectors: the "Present_Price" column of each split.
y_train = train_data.loc[:, "Present_Price"]
y_test = test_data.loc[:, "Present_Price"]

In [None]:
# Columns that are one-hot encoded.
categorical_features = ['Fuel_Type', 'Selling_type', 'Transmission']
# Every other feature column is routed to the scaler.
numeric_features = [col for col in x_train.columns if col not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that was absent when the encoder
        # was fitted (e.g. a rare label that lands only in a cross-validation
        # validation fold, or one that appears only in the test split) is
        # encoded as all zeros instead of raising an error during transform.
        ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ('scaler', MinMaxScaler(), numeric_features),
    ],
)

# Full model: preprocess -> keep the k best features ranked by the univariate
# f_regression score -> ordinary least-squares regression. `k` is left at its
# default here and is expected to be set by the hyper-parameter search.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("selectk", SelectKBest(score_func=f_regression)),
    ("regressor", LinearRegression())
])

In [None]:
# Search space, keyed as "<pipeline step>__<parameter>".
param_grid = dict(
    selectk__k=range(1, 15),
    regressor__fit_intercept=[True, False],
    regressor__positive=[True, False],
)

# Exhaustive search over the grid with 10-fold cross-validation, ranking
# candidates by negated mean absolute error and running fits in parallel.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    n_jobs=-1,
)

grid_search.fit(x_train, y_train)

In [None]:
# Create the output folder if it does not exist yet.
os.makedirs(name="../files/models", exist_ok=True)

# Store the whole search object as a gzip-compressed pickle.
with gzip.open("../files/models/model.pkl.gz", mode="wb") as model_file:
    pickle.dump(grid_search, file=model_file)