In [8]:
import pandas as pd

def load_data():
    """Read the zipped train and test CSV files.

    Returns:
        tuple: ``(train_data, test_data)`` as pandas DataFrames, in that order.
    """
    archive_paths = (
        '../files/input/train_data.csv.zip',
        '../files/input/test_data.csv.zip',
    )
    # Same reader settings for both splits; the train archive is read first.
    train_frame, test_frame = (
        pd.read_csv(archive, compression='zip') for archive in archive_paths
    )
    return train_frame, test_frame

# Load both splits; the cleaning statement further down rebinds these same names.
train_data, test_data = load_data()

In [9]:
def clean_data(df, reference_year=2021):
    """Derive the car's age and drop the columns not used for modelling.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: DataFrame containing at least the "Year" and "Car_Name" columns.
        reference_year: Year that "Year" is subtracted from to obtain "Age".
            Defaults to 2021.

    Returns:
        A new DataFrame with an "Age" column (``reference_year - Year``)
        and without the "Year" and "Car_Name" columns.

    Raises:
        KeyError: If "Year" or "Car_Name" is missing from ``df``.
    """
    # assign() builds a new frame, so the caller's DataFrame does not
    # silently gain an "Age" column as a side effect of cleaning.
    with_age = df.assign(Age=reference_year - df["Year"])
    return with_age.drop(columns=["Year", "Car_Name"])

# Rebinds both names to the cleaned frames. clean_data reads and then drops
# "Year", so running this statement again without reloading raises KeyError.
train_data, test_data = clean_data(train_data), clean_data(test_data)

In [10]:
# Split each frame into predictors (every column except the target) and the
# "Present_Price" target series.
x_train, y_train = train_data.drop(columns="Present_Price"), train_data["Present_Price"]
x_test, y_test = test_data.drop(columns="Present_Price"), test_data["Present_Price"]

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler


def create_pipeline():
    """Assemble the unfitted preprocessing, feature-selection and regression pipeline.

    Reads the column names of the module-level ``x_train`` to decide which
    columns are scaled: every column that is not one of the four categorical
    columns is sent through ``MinMaxScaler``.

    Returns:
        Pipeline: steps ``transformer`` -> ``selectkbest`` -> ``regressor``.
    """
    categorical_features = ["Fuel_Type", "Selling_type", "Transmission", "Owner"]
    numerical_features = [
        name for name in x_train.columns if name not in categorical_features
    ]

    preprocessing = ColumnTransformer(
        transformers=[
            ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ("num", MinMaxScaler(), numerical_features),
        ],
        remainder='passthrough',
    )

    steps = [
        ("transformer", preprocessing),
        # k is not set here, so SelectKBest starts from its library default.
        ("selectkbest", SelectKBest(score_func=f_regression)),
        ("regressor", LinearRegression()),
    ]
    return Pipeline(steps=steps)

# Build the unfitted pipeline that is handed to the grid search.
pipeline = create_pipeline()

In [15]:
from sklearn.model_selection import GridSearchCV

def make_grid_search(pipeline):
    """Tune the number of selected features by cross-validation and fit the search.

    The search is fitted on the module-level ``x_train`` / ``y_train``.

    Args:
        pipeline: Estimator with a ``selectkbest`` step whose ``k`` is tuned.

    Returns:
        GridSearchCV: the fitted search object.
    """
    # NOTE(review): the larger k values may exceed the number of columns the
    # transformer produces — confirm every candidate is valid for this data.
    candidate_k = list(range(2, 25))

    search = GridSearchCV(
        estimator=pipeline,
        param_grid={"selectkbest__k": candidate_k},
        cv=10,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
    )
    search.fit(x_train, y_train)
    return search

# Fit the grid search; `estimator` is the fitted GridSearchCV object that the
# following statements pickle and score.
estimator = make_grid_search(pipeline)



In [16]:
import os
import gzip
import pickle

def save_model(estimator):
    """Pickle ``estimator`` into a gzip-compressed file.

    The object is written to ``../files/models/model.pkl.gz`` (relative to the
    current working directory). The directory is created when missing and an
    existing file is overwritten.

    Args:
        estimator: Any picklable object.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory in between.
    os.makedirs("../files/models", exist_ok=True)
    with gzip.open("../files/models/model.pkl.gz", "wb") as f:
        pickle.dump(estimator, f)

# Persist the whole fitted grid-search object, not only its best_estimator_.
save_model(estimator)

In [21]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def metrics():
    """Score the best estimator found by the search on the train and test splits.

    Uses the module-level ``estimator``, ``x_train``/``y_train`` and
    ``x_test``/``y_test``.

    Returns:
        list[dict]: two records, train first and test second, each holding
        "type", "dataset", "r2", "mse" and "mad" (the value stored under
        "mad" is computed with ``mean_absolute_error``).
    """
    best_model = estimator.best_estimator_
    splits = (
        ("train", x_train, y_train),
        ("test", x_test, y_test),
    )

    records = []
    for split_name, features, target in splits:
        predicted = best_model.predict(features)
        records.append(
            {
                "type": "metrics",
                "dataset": split_name,
                "r2": r2_score(target, predicted),
                "mse": mean_squared_error(target, predicted),
                "mad": mean_absolute_error(target, predicted),
            }
        )
    return records

# NOTE: this rebinds the name `metrics` from the function to the list it
# returns, so metrics() cannot be called again until the def is re-executed.
metrics = metrics()

In [22]:
import json

def save_metrics(metrics):
    """Write metric records to ``../files/output/metrics.json``, one JSON object per line.

    The output directory is created when missing and an existing file is
    overwritten.

    Args:
        metrics: Iterable of JSON-serializable records (e.g. dicts).
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory in between.
    os.makedirs("../files/output", exist_ok=True)
    with open("../files/output/metrics.json", "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in metrics)
# Write the train and test metric records to ../files/output/metrics.json.
save_metrics(metrics)