Paso 0. Carga de los archivos de referencia (grading)

In [1]:
import pickle

# Read the pickled object stored at files/grading/x_train.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
ruta_archivo = "../files/grading/x_train.pkl"
with open(ruta_archivo, mode="rb") as pkl_handle:
    contenido1 = pickle.load(pkl_handle)

In [2]:
# Read the pickled object stored at files/grading/y_train.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
ruta_archivo = "../files/grading/y_train.pkl"
with open(ruta_archivo, mode="rb") as pkl_handle:
    contenido2 = pickle.load(pkl_handle)

In [3]:
# Read the pickled object stored at files/grading/x_test.pkl.
# This cell used to be a verbatim copy of the y_train load (same path, same
# target variable), so x_test.pkl was never read and `contenido3` never
# existed. It now loads the missing file into its own variable.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
ruta_archivo = "../files/grading/x_test.pkl"

with open(ruta_archivo, "rb") as file:
    contenido3 = pickle.load(file)


In [4]:
# Read the pickled object stored at files/grading/y_test.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
ruta_archivo = "../files/grading/y_test.pkl"
with open(ruta_archivo, mode="rb") as pkl_handle:
    contenido4 = pickle.load(pkl_handle)

Paso 1. Preprocesamiento de datos. 

In [5]:
import pandas as pd

# Read both zipped CSV splits with the same parser settings.
read_options = {"index_col": False, "compression": "zip"}
data_train = pd.read_csv('../files/input/train_data.csv.zip', **read_options)
data_test = pd.read_csv("../files/input/test_data.csv.zip", **read_options)
print(data_train)

# Add an `Age` column computed as 2021 minus `Year`, then drop the `Year`
# and `Car_Name` columns from both splits.
dropped_columns = ['Year', 'Car_Name']
data_train = data_train.assign(Age=2021 - data_train['Year']).drop(columns=dropped_columns)
data_test = data_test.assign(Age=2021 - data_test['Year']).drop(columns=dropped_columns)

                      Car_Name  Year  Selling_Price  Present_Price  \
0                         jazz  2016           7.40          8.500   
1                          i10  2013           4.00          4.600   
2           TVS Apache RTR 180  2011           0.50          0.826   
3                          eon  2016           3.15          4.430   
4    Royal Enfield Thunder 350  2013           1.25          1.500   
..                         ...   ...            ...            ...   
206                        i10  2011           2.55          4.430   
207                    etios g  2015           3.95          6.800   
208           Bajaj Pulsar 150  2006           0.10          0.750   
209                        i20  2014           6.00          7.600   
210                       jazz  2016           6.00          8.400   

     Driven_kms Fuel_Type Selling_type Transmission  Owner  
0         15059    Petrol       Dealer    Automatic      0  
1         30000    Petrol       Deale

Paso 2. División del dataset (variables explicativas y variable objetivo)

In [6]:
# Separate each split into predictors (every column except the target)
# and the target column itself.
target_column = "Present_Price"
x_train, y_train = data_train.drop(columns=[target_column]), data_train[target_column]
x_test, y_test = data_test.drop(columns=[target_column]), data_test[target_column]

Paso 3. Creación del pipeline

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression

# Columns that are one-hot encoded; every other predictor column is scaled.
categorical_features = ['Fuel_Type', 'Selling_type', 'Transmission']

# Keep the remaining columns in DataFrame order. A set difference has no
# guaranteed order (string hashing is randomised per interpreter session),
# which made the column order of the transformed matrix non-reproducible
# between kernel restarts. The list is taken from x_train because that is
# the frame the pipeline is fitted on.
numerical_features = [
    column for column in x_train.columns if column not in categorical_features
]

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and rescale the remaining columns with MinMaxScaler.
preprocesamiento = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', MinMaxScaler(), numerical_features),
    ]
)

# Preprocess -> keep the k best features by f_regression score -> linear regression.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocesamiento),
    ('feature_selection', SelectKBest(f_regression)),
    ('regresion', LinearRegression()),
])

Paso 4. Optimización de hiperparámetros con validación cruzada

In [8]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the number of features kept by SelectKBest: 1 through 11.
param_grid = {"feature_selection__k": list(range(1, 12))}

# 10-fold cross-validated grid search scored by negative mean absolute error,
# using all available cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)

model.fit(x_train, y_train)

Paso 5. Guardar el modelo

In [9]:
import os
import gzip

# Persist the fitted search object as a gzip-compressed pickle.
dir_path = '../files/models'

# exist_ok=True creates the directory when it is missing and does nothing
# otherwise, so the dump no longer has to be duplicated in an if/else and
# there is no gap between checking for the directory and creating it.
os.makedirs(dir_path, exist_ok=True)

with gzip.open('../files/models/model.pkl.gz', 'wb') as f:
    pickle.dump(model, f)

Paso 6. Cálculo de métricas

In [10]:
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
import json

def _metrics_record(dataset_name, y_true, y_pred):
    """Build the metrics dictionary for one data split.

    The record holds the R^2 score, the mean squared error and, under the
    key 'mad', the median absolute error, each converted to a plain float.
    """
    return {
        "type": "metrics",
        'dataset': dataset_name,
        'r2': float(r2_score(y_true, y_pred)),
        'mse': float(mean_squared_error(y_true, y_pred)),
        'mad': float(median_absolute_error(y_true, y_pred)),
    }


# Predictions of the fitted search object on both splits.
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

train_metrics = _metrics_record('train', y_train, y_train_pred)
test_metrics = _metrics_record('test', y_test, y_test_pred)

output_path = '../files/output/metrics.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write one JSON object per line: train first, then test.
with open(output_path, 'w', encoding='utf-8') as f:
    for record in (train_metrics, test_metrics):
        json.dump(record, f, ensure_ascii=False)
        f.write('\n')
