In [34]:
import gzip
import json
import pickle
from pathlib import Path

import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,median_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler


In [35]:
# Input datasets (zip-compressed CSV files), relative to the notebook.
TRAIN_DATA_PATH = '../files/input/train_data.csv.zip'
TEST_DATA_PATH = '../files/input/test_data.csv.zip'

# Output artifacts: the gzip-compressed pickled model and the metrics file.
MODEL_PATH = '../files/models/model.pkl.gz'
METRICS_PATH = '../files/output/metrics.json'

In [36]:
# Read both zip-compressed CSV files. index_col=False tells pandas not to
# use the first column as the index.
train_data = pd.read_csv(TRAIN_DATA_PATH, compression='zip', index_col=False)
test_data = pd.read_csv(TEST_DATA_PATH, compression='zip', index_col=False)

In [37]:
# Reference year used to turn the 'Year' column into an 'Age' column.
current_year = 2021

# Columns removed once 'Age' has been derived.
columns_to_drop = ['Year', 'Car_Name']

# Age = current_year - Year, appended as a new column; then drop the
# columns listed above. Both datasets get the exact same treatment.
train_data = (
    train_data
    .assign(Age=current_year - train_data['Year'])
    .drop(columns=columns_to_drop)
)
test_data = (
    test_data
    .assign(Age=current_year - test_data['Year'])
    .drop(columns=columns_to_drop)
)

In [38]:
# Step 2.
# Split the datasets into x_train, y_train, x_test, y_test.

# Work on a copy so train_data / test_data keep the target column;
# pop() removes "Present_Price" from the copy and returns it as the target.
x_train = train_data.copy()
y_train = x_train.pop("Present_Price")

x_test = test_data.copy()
y_test = x_test.pop("Present_Price")

In [39]:
# Step 3.
# Build a pipeline for the regression model. The pipeline must contain
# the following layers:
# - Transform the categorical variables using one-hot encoding.
# - Scale the numerical variables to the interval [0, 1].
# - Select the K best inputs.
# - Fit a linear regression model.


categorical_features = ['Fuel_Type', 'Selling_type', 'Transmission']

# Every non-categorical column is treated as numerical. The list is built by
# iterating over the DataFrame columns (instead of a set difference) so the
# feature order is deterministic across kernel restarts.
numerical_features = [
    column for column in x_train.columns
    if column not in categorical_features
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), numerical_features),
        # handle_unknown="ignore" encodes a category that was not present
        # when the encoder was fitted (e.g. absent from a CV training fold)
        # as all zeros instead of raising at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="passthrough",
)

# k='all' is only the initial value of k at construction time.
k_best = SelectKBest(f_regression, k='all')

model = LinearRegression()

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("k_best", k_best),
        ("model", model),
    ]
)


In [40]:
# Display the assembled pipeline (preprocessor -> k_best -> model).
pipeline

In [41]:
# Step 4.
# Optimize the pipeline hyperparameters using cross-validation.
# Use 10 splits for the cross-validation. Use the mean absolute error
# to measure model performance.

# Candidate values: k from 1 to 15 for the feature selector, and both
# settings of fit_intercept for the linear model (15 x 2 = 30 candidates).
# NOTE(review): k values above the number of columns produced by the
# preprocessor cannot select more features than exist — confirm the
# upper bound against the transformed feature count.
param_grid = {
    "k_best__k": list(range(1, 16)),
    "model__fit_intercept": [True, False],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(x_train, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits




In [42]:
# Step 5.
# Save the model (gzip-compressed) as "files/models/model.pkl.gz".
# Remember that the model can be saved compressed using the gzip library.

# gzip.open does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# The whole fitted GridSearchCV object is pickled.
with gzip.open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(grid_search, model_file)

In [43]:
# Step 6.
# Compute the r2, mean squared error, and mean absolute error metrics
# for the training and test sets. Save them to the file
# files/output/metrics.json. Each row of the file is a dictionary with
# the metrics of one model. This dictionary has a field indicating
# whether it is the training or the test set. For example:
#
# {'type': 'metrics', 'dataset': 'train', 'r2': 0.8, 'mse': 0.7, 'mad': 0.9}
# {'type': 'metrics', 'dataset': 'test', 'r2': 0.7, 'mse': 0.6, 'mad': 0.8}

# Predictions from the fitted grid search for both datasets.
y_train_pred = grid_search.predict(x_train)
y_test_pred = grid_search.predict(x_test)

# Container for the per-dataset metric dictionaries filled in later.
metrics = dict()

In [44]:
# Display the predictions computed for the training set.
y_train_pred

array([ 1.16437441e+01,  6.77715110e+00,  7.83286919e-01,  3.91120534e+00,
        1.43508652e+00,  6.86395397e+00, -1.18144039e+00,  6.29226617e+00,
        1.20218853e+01,  4.00389797e-02,  1.02463129e+01,  6.79230587e+00,
        6.32972418e+00,  8.71011995e+00,  5.46919091e+00,  1.67659760e+00,
        1.00077099e+01,  9.55240388e+00,  1.06385029e+01,  1.05373022e+01,
        6.29075816e+00,  7.22962195e+00,  1.38224891e+01,  1.21372046e+01,
        1.85468081e+01,  5.56867082e+00,  9.89936200e+00, -8.04073248e-02,
        2.50617201e+00,  1.63035177e+00,  8.35989644e-01,  1.87476419e+00,
        6.49751875e+00,  2.93833547e+01,  3.29426830e+01,  7.83671418e+00,
        7.02954865e+00, -3.26047681e-01,  2.60327148e+00,  9.65179474e+00,
        1.47733394e+01,  1.69613524e+01,  2.19550800e+00,  7.03431171e+00,
        7.42046038e+00,  6.21484879e+00,  1.07026364e+01,  8.87992048e+00,
        1.01290958e+01,  1.13008229e+00,  5.42564330e+00, -1.31528088e+00,
        4.23158367e+00,  

In [45]:
def _build_metrics(dataset, y_true, y_pred):
    """Return the metrics dictionary for one dataset.

    Parameters
    ----------
    dataset : str
        Label stored under the 'dataset' key ('train' or 'test').
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    dict
        Dictionary with the keys 'type', 'dataset', 'r2', 'mse' and 'mad'.
    """
    return {
        'type': 'metrics',
        'dataset': dataset,
        'r2': r2_score(y_true, y_pred),
        'mse': mean_squared_error(y_true, y_pred),
        # NOTE(review): 'mad' is computed with median_absolute_error, while
        # the task description asks for the mean absolute error — confirm
        # which one the consumer of metrics.json expects.
        'mad': median_absolute_error(y_true, y_pred),
    }


metrics['train'] = _build_metrics('train', y_train, y_train_pred)
metrics['test'] = _build_metrics('test', y_test, y_test_pred)

# open() does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
Path(METRICS_PATH).parent.mkdir(parents=True, exist_ok=True)

# One JSON document per line: first the train metrics, then the test ones.
with open(METRICS_PATH, 'w') as f:
    for dataset in ('train', 'test'):
        f.write(json.dumps(metrics[dataset]) + '\n')

