In [43]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import median_absolute_error, mean_squared_error, r2_score
import os
import pickle
import gzip
import json


In [44]:
# Step 1.
# Preprocess the data.

def preprocess_data(train_path, test_path, reference_year=2021):
    """Load the train and test CSV files and derive the ``Age`` feature.

    For each file, ``Age`` is computed as ``reference_year - Year`` and the
    ``Year`` and ``Car_Name`` columns are then removed.

    Args:
        train_path: Location of the training CSV, passed to ``pd.read_csv``.
        test_path: Location of the test CSV, passed to ``pd.read_csv``.
        reference_year: Year that vehicle age is measured against.
            Defaults to 2021, the value this notebook originally hard-coded.

    Returns:
        Tuple ``(train_df, test_df)`` of preprocessed DataFrames.

    Raises:
        KeyError: If ``Year`` or ``Car_Name`` is missing from either file.
    """
    def _load_and_prepare(path):
        raw = pd.read_csv(path)
        # assign/drop return new frames, so nothing is mutated in place and
        # re-running the cell always starts from the file contents.
        return (
            raw
            .assign(Age=reference_year - raw['Year'])
            .drop(columns=['Year', 'Car_Name'])
        )

    return _load_and_prepare(train_path), _load_and_prepare(test_path)

# Input file locations, given relative to the notebook's working directory.
train_path = '../files/input/train_data.csv.zip'
test_path = '../files/input/test_data.csv.zip'


# Load both splits and apply the shared preprocessing.
train_df, test_df = preprocess_data(train_path, test_path)

In [45]:
# Step 2.
# Split the datasets into x_train, y_train, x_test, y_test.

def split_data(train_df, test_df):
    """Separate the ``Present_Price`` target from the remaining columns.

    Args:
        train_df: Training DataFrame containing a ``Present_Price`` column.
        test_df: Test DataFrame containing a ``Present_Price`` column.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``: each ``x_*`` is the
        input frame without ``Present_Price`` and each ``y_*`` is that
        column as a Series.
    """
    target_column = 'Present_Price'

    def _features_and_target(frame):
        return frame.drop(columns=target_column), frame[target_column]

    x_train, y_train = _features_and_target(train_df)
    x_test, y_test = _features_and_target(test_df)
    return x_train, y_train, x_test, y_test

# Separate features from the Present_Price target for both splits.
x_train, y_train, x_test, y_test = split_data(train_df, test_df)

In [46]:
# Step 3.
# Build a pipeline for the regression model.
def build_pipeline():
    """Assemble the unfitted modelling pipeline.

    The numeric columns are min-max scaled and the categorical columns are
    one-hot encoded, with categories unseen during fit ignored at transform
    time. The combined features then pass through ``SelectKBest`` scored
    with ``f_regression`` and feed a ``LinearRegression``.

    Returns:
        A ``Pipeline`` with the steps ``'preprocessor'``,
        ``'feature_selection'`` and ``'regressor'``, in that order.
    """
    scaled_columns = ['Selling_Price', 'Driven_kms', 'Age', 'Owner']
    encoded_columns = ['Fuel_Type', 'Selling_type', 'Transmission']

    scaling_branch = Pipeline(steps=[('scaler', MinMaxScaler())])
    encoding_branch = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
    )

    column_prep = ColumnTransformer(transformers=[
        ('num', scaling_branch, scaled_columns),
        ('cat', encoding_branch, encoded_columns),
    ])

    # Step names are part of the interface: the grid search addresses the
    # selector as 'feature_selection__k'.
    stages = [
        ('preprocessor', column_prep),
        ('feature_selection', SelectKBest(score_func=f_regression)),
        ('regressor', LinearRegression()),
    ]
    return Pipeline(steps=stages)

# Build the unfitted preprocessing + feature-selection + regression pipeline.
pipeline = build_pipeline()

In [47]:
# Step 4.
# Optimize the pipeline's hyperparameters using cross-validation.
def optimize_hyperparameters(pipeline, x_train, y_train):
    """Tune the number of selected features with a cross-validated grid search.

    The search tries ``k`` in ``[5, 10, 'all']`` for the pipeline step named
    ``'feature_selection'``, using 10-fold cross-validation scored by
    negative mean absolute error.

    Args:
        pipeline: Estimator exposing a ``feature_selection__k`` parameter.
        x_train: Training features.
        y_train: Training target.

    Returns:
        The ``GridSearchCV`` object after fitting on the training data.
    """
    k_candidates = [5, 10, 'all']
    search = GridSearchCV(
        estimator=pipeline,
        param_grid={'feature_selection__k': k_candidates},
        scoring='neg_mean_absolute_error',
        cv=10,
    )
    search.fit(x_train, y_train)
    return search

# Run the grid search on the training data; best_model is the fitted
# GridSearchCV object returned by optimize_hyperparameters.
best_model = optimize_hyperparameters(pipeline, x_train, y_train)

In [48]:
# Step 5.
# Save the model (gzip-compressed) as "files/models/model.pkl.gz".
def save_model(model, model_path="../files/models/model.pkl.gz"):
    """Pickle ``model`` into a gzip-compressed file.

    The parent directory of ``model_path`` is created when it does not
    exist. The directory is derived from the path, so the two cannot drift
    apart.

    Args:
        model: Any picklable object (here, the fitted search object).
        model_path: Destination file. Defaults to the location this
            notebook originally hard-coded.
    """
    parent_dir = os.path.dirname(model_path)
    # os.makedirs('') raises, so skip it for a bare file name.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with gzip.open(model_path, "wb") as file:
        pickle.dump(model, file)

# Persist the fitted search object to disk.
save_model(best_model)

In [49]:
def calculate_metrics(model, x_train, y_train, x_test, y_test):
    """Score ``model`` on the training and test splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_train: Training features.
        y_train: Training target.
        x_test: Test features.
        y_test: Test target.

    Returns:
        A two-element list of dicts, training split first. Each dict has
        the keys ``'type'`` (always ``'metrics'``), ``'dataset'``
        (``'train'`` or ``'test'``), ``'r2'``, ``'mse'`` and ``'mad'``.
    """
    def _score(split_name, features, target):
        predicted = model.predict(features)
        return {
            'type': 'metrics',
            'dataset': split_name,
            'r2': r2_score(target, predicted),
            'mse': mean_squared_error(target, predicted),
            # 'mad' holds the value of median_absolute_error.
            'mad': median_absolute_error(target, predicted),
        }

    return [
        _score('train', x_train, y_train),
        _score('test', x_test, y_test),
    ]

# Evaluate on both splits; metrics is a list with one dict per split.
metrics = calculate_metrics(best_model, x_train, y_train, x_test, y_test)

In [50]:
def save_metrics(metrics, metrics_path="../files/output/metrics.json"):
    """Write ``metrics`` to disk, one JSON object per line.

    The parent directory of ``metrics_path`` is created when it does not
    exist.

    Args:
        metrics: Iterable of JSON-serialisable records.
        metrics_path: Destination file. Defaults to the location this
            notebook originally hard-coded.
    """
    output_dir = os.path.dirname(metrics_path)
    # os.makedirs('') raises, so skip it for a bare file name.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # ensure_ascii=False can emit non-ASCII characters, so pin the file
    # encoding instead of relying on the platform default.
    with open(metrics_path, 'w', encoding='utf-8') as f:
        for metric in metrics:
            f.write(json.dumps(metric, ensure_ascii=False))
            f.write('\n')

# Write the metrics to disk, one JSON object per line.
save_metrics(metrics)