In [150]:
# Reference metrics for the train and test splits. No cell visible in this
# notebook reads TESTING; presumably these are the scores the fitted model is
# compared against -- TODO confirm with the grading script.
TESTING = [
    dict(type="metrics", dataset="train", r2=0.889, mse=5.950, mad=1.600),
    dict(type="metrics", dataset="test", r2=0.728, mse=32.910, mad=2.430),
]


In [151]:
# Load the datasets

import pandas as pd

# Read both splits straight from their zipped CSV files; the compression is
# passed explicitly rather than inferred from the file name.
train = pd.read_csv(
    '../files/input/train_data.csv.zip',
    compression='zip',
)
test = pd.read_csv(
    '../files/input/test_data.csv.zip',
    compression='zip',
)

In [152]:
# Step 1: data preprocessing

def preprocesar_data(df, reference_year=2021):
    """Derive the car's age and drop the columns it replaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``Year`` and ``Car_Name`` columns.
    reference_year : int, default 2021
        Year from which ``Year`` is subtracted to obtain ``Age``.

    Returns
    -------
    pandas.DataFrame
        A new frame with an ``Age`` column added and the ``Year`` and
        ``Car_Name`` columns removed. The frame passed in is not modified.

    Raises
    ------
    KeyError
        If ``Year`` or ``Car_Name`` is missing from ``df``.
    """
    # assign() builds a new frame; writing df['Age'] directly would add the
    # column to the caller's object as a side effect.
    with_age = df.assign(Age=reference_year - df['Year'])
    return with_age.drop(columns=['Year', 'Car_Name'])

# Rebind both splits to their preprocessed versions. Re-running this cell on the
# already-processed frames raises KeyError, because 'Year' has been dropped.
train = preprocesar_data(train)
test = preprocesar_data(test)

In [153]:
# Identify outliers with the IQR rule (the caller decides whether to drop them)
def calcular_outliers_iqr(df, column, factor=1.43):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 0.25 and 0.75 quantiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of the column whose values are tested.
    factor : float, default 1.43
        Multiplier applied to the IQR when placing the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index labels kept) strictly below the
        lower fence or strictly above the upper fence. If ``column`` is not of
        a numeric dtype, an empty ``DataFrame`` with no columns is returned.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    values = df[column]
    if not pd.api.types.is_numeric_dtype(values):
        return pd.DataFrame()

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = factor * (q3 - q1)
    # Comparisons against NaN are False, so missing values are never flagged.
    return df[(values < q1 - margin) | (values > q3 + margin)]

# For each split, flag outlier rows using that split's own Selling_Price
# quartiles and build a filtered frame; `train` and `test` keep every row.
outliers_train = calcular_outliers_iqr(train, 'Selling_Price')
train_without_outliers = train.drop(index=outliers_train.index)

outliers_test = calcular_outliers_iqr(test, 'Selling_Price')
test_without_outliers = test.drop(index=outliers_test.index)


In [154]:
# Step 2: split every frame into predictors (x_*) and target (y_*).
# The target is the Present_Price column; all remaining columns are predictors.

# Full splits (outlier rows included)
x_train, y_train = train.drop(columns='Present_Price'), train['Present_Price']
x_test, y_test = test.drop(columns='Present_Price'), test['Present_Price']

# Splits with the Selling_Price outlier rows removed
x_train_without_outliers, y_train_without_outliers = (
    train_without_outliers.drop(columns='Present_Price'),
    train_without_outliers['Present_Price'],
)
x_test_without_outliers, y_test_without_outliers = (
    test_without_outliers.drop(columns='Present_Price'),
    test_without_outliers['Present_Price'],
)

In [155]:
# Paso 3: Crear pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error

# Columns sent to the one-hot encoder
categorical_features = [
    'Fuel_Type',
    'Selling_type',
    'Transmission',
]
# Columns sent to the min-max scaler
numerical_features = [
    'Selling_Price',
    'Driven_kms',
    'Age',
    'Owner',
]

def negative_mean_squared_error(y_true, y_pred):
    """Return the mean squared error of ``y_pred`` against ``y_true``, negated.

    The sign is flipped so that larger values mean a better fit, which matches
    the sign convention of the ``'neg_mean_squared_error'`` scoring string.
    """
    mse = mean_squared_error(y_true, y_pred)
    return -mse

# Column-wise preprocessing: one-hot encode the categorical columns and rescale
# the numeric ones. handle_unknown='ignore' keeps transform() from raising on a
# category that was not present when the encoder was fitted.
transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', MinMaxScaler(), numerical_features),
])

# preprocess -> keep the k highest-scoring features (f_regression) -> linear fit
pipeline = Pipeline([
    ('preprocessor', transformer),
    ('selector', SelectKBest(f_regression, k=2)),
    ('model', LinearRegression()),
])

# Baseline fit, using only the outlier-filtered training rows.
pipeline.fit(x_train_without_outliers, y_train_without_outliers)
# NOTE(review): both scores below are computed on the full splits (outlier rows
# included), whereas the grid-search cell scores the filtered splits -- confirm
# that this difference is intended.
print('Train score:', negative_mean_squared_error(y_train, pipeline.predict(x_train)))
print('Test score:', negative_mean_squared_error(y_test, pipeline.predict(x_test)))

Train score: -12.19587478926426
Test score: -38.31682870634173


In [156]:
# Paso 4: Optimizar hiperparámetros

from sklearn.model_selection import GridSearchCV

# Search space handed to GridSearchCV. Only selector__k actually varies; the
# other entries are single-value lists that pin those settings for every
# candidate.
grid_params = {
    'preprocessor__num__feature_range': [(0, 1)],
    # 'all' disables the selection and keeps every transformed column
    'selector__k': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 'all'],
    'model__fit_intercept': [True],
    'model__positive': [True],
}

# 10-fold cross-validated search over grid_params, scored by negated MSE and
# run on all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=grid_params,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
# The search only ever sees the outlier-filtered training rows.
grid_search.fit(x_train_without_outliers, y_train_without_outliers)

print('Best params:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)
# Scores of the selected model on the outlier-filtered splits
print('Train score:', negative_mean_squared_error(y_train_without_outliers, grid_search.predict(x_train_without_outliers)))
print('Test score:', negative_mean_squared_error(y_test_without_outliers, grid_search.predict(x_test_without_outliers)))

Fitting 10 folds for each of 11 candidates, totalling 110 fits
Best params: {'model__fit_intercept': True, 'model__positive': True, 'preprocessor__num__feature_range': (0, 1), 'selector__k': 'all'}
Best score: -7.176214410084455
Train score: -4.974796256392159
Test score: -9.228835990654138


In [157]:
# Paso 5: Guardar el modelo

import os
import gzip
import pickle

# Persist the whole fitted GridSearchCV object (not only its best estimator)
# as a gzip-compressed pickle, creating the target directory if needed.
os.makedirs('../files/models', exist_ok=True)
with gzip.open('../files/models/model.pkl.gz', mode='wb') as model_file:
    pickle.dump(grid_search, model_file)

In [158]:
# Paso 6: Calcular métricas

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Predictions on the full splits (outlier rows included)
y_train_pred = grid_search.predict(x_train)
y_test_pred = grid_search.predict(x_test)

# One record per split, with the same keys in the same order.
# NOTE(review): the 'mad' entry holds the mean absolute error -- confirm that
# this is what consumers of the metrics file expect under that key.
metrics_train, metrics_test = (
    {
        'type': 'metrics',
        'dataset': split_name,
        'r2': r2_score(observed, predicted),
        'mse': mean_squared_error(observed, predicted),
        'mad': mean_absolute_error(observed, predicted),
    }
    for split_name, observed, predicted in (
        ('train', y_train, y_train_pred),
        ('test', y_test, y_test_pred),
    )
)

metrics = [metrics_train, metrics_test]

In [159]:
# Guardar las métricas en un archivo JSON

import json

# Write one JSON object per line (train first, then test). The file is therefore
# line-delimited JSON rather than a single JSON document, despite its extension.
os.makedirs('../files/output', exist_ok=True)
with open('../files/output/metrics.json', 'w') as metrics_file:
    metrics_file.writelines(json.dumps(entry) + '\n' for entry in metrics)