### Carga de los datasets

In [1]:
import pandas as pd

# Both splits ship as zipped CSVs and are read with the same options.
read_options = dict(compression='zip', index_col=False)

df_train = pd.read_csv('../files/input/train_data.csv.zip', **read_options)
df_test = pd.read_csv('../files/input/test_data.csv.zip', **read_options)

In [2]:
# Sanity check: (rows, columns) of the train and test splits.
df_train.shape, df_test.shape

((211, 9), (90, 9))

In [3]:
# Count missing values per column of the training split.
df_train.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64

In [4]:
# Inspect column dtypes to tell categorical (object) columns from numeric ones.
df_train.dtypes

Car_Name          object
Year               int64
Selling_Price    float64
Present_Price    float64
Driven_kms         int64
Fuel_Type         object
Selling_type      object
Transmission      object
Owner              int64
dtype: object

## Paso 1

- Cree la columna 'Age' a partir de la columna 'Year'.
- Asuma que el año actual es 2021.
- Elimine las columnas 'Year' y 'Car_Name'.

In [5]:
def process_data(df, current_year=2021):
    """Return a copy of ``df`` with an ``Age`` column replacing ``Year``.

    ``Age`` is computed as ``current_year - Year``. The ``Year`` and
    ``Car_Name`` columns are dropped from the result. The input frame is
    left untouched, so the caller's raw data is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``Year`` and ``Car_Name`` columns.
    current_year : int, default 2021
        Reference year used to compute the age.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Age`` appended as the last column.

    Raises
    ------
    KeyError
        If ``Year`` or ``Car_Name`` is missing from ``df``.
    """
    # assign/drop both return new frames, so nothing is mutated in place.
    return (
        df
        .assign(Age=current_year - df['Year'])
        .drop(columns=['Year', 'Car_Name'])
    )

In [6]:
# Replace the raw frames with their processed versions.
# NOTE: this cell is not re-runnable on its own. After the first run the
# frames no longer contain 'Year', so a second run raises KeyError;
# re-run the loading cell first.
df_train = process_data(df_train)
df_test = process_data(df_test)

## Paso 2

Divida los datasets en x_train, y_train, x_test, y_test.

In [7]:
# 'Present_Price' is the target; every remaining column is a feature.
X_train = df_train.drop(columns='Present_Price')
y_train = df_train['Present_Price']

X_test = df_test.drop(columns='Present_Price')
y_test = df_test['Present_Price']

## Paso 3

Cree un pipeline para el modelo de regresión. Este pipeline debe contener las siguientes capas:
- Transforma las variables categóricas usando el método one-hot-encoding.
- Escala las variables numéricas al intervalo [0, 1].
- Selecciona las K mejores entradas.
- Ajusta un modelo de regresión lineal.

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.feature_selection import f_regression,SelectKBest

columnas_categoricas = ['Fuel_Type', 'Selling_type', 'Transmission']
# Preserve the DataFrame's column order. A set difference would yield an
# order that changes between kernel sessions (string hashing is randomised),
# making the transformed feature order non-reproducible.
columnas_numericas = [
    columna for columna in X_train.columns
    if columna not in columnas_categoricas
]

transformer = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that is absent from a CV
        # training fold (or from the training split) is encoded as all
        # zeros instead of raising at transform/predict time.
        ('ohe', OneHotEncoder(handle_unknown='ignore'), columnas_categoricas),
        # Scale numeric features to the [0, 1] interval.
        ('minmax', MinMaxScaler(), columnas_numericas),
    ],
    remainder='passthrough',
)

# Encode/scale -> keep the K best features by f_regression -> linear model.
# K is left at its default here and tuned later by the grid search.
pipeline = Pipeline(
    [
        ('transformer', transformer),
        ('feature_selector', SelectKBest(f_regression)),
        ('linearregressor', LinearRegression()),
    ]
)
pipeline

## Paso 4

Optimice los hiperparámetros del pipeline usando validación cruzada.

- Use 10 splits para la validación cruzada. Use el error medio absoluto para medir el desempeño del modelo.

In [9]:
from sklearn.model_selection import GridSearchCV

# Hyperparameters to explore; keys follow the '<step name>__<parameter>'
# convention of the pipeline defined above.
# NOTE(review): k runs up to 14 -- confirm the transformer really produces
# that many columns, since SelectKBest cannot keep more features than exist.
param_grid = dict(
    feature_selector__k=range(1, 15),
    linearregressor__fit_intercept=[True, False],
    linearregressor__positive=[True, False],
)

# 10-fold cross-validation scored by negative MAE (higher is better);
# the best configuration is refit on the full training data.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=True,
)

In [10]:
# Run the cross-validated search on the training split only; because
# refit=True, the best configuration is then refit on all of X_train.
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 56 candidates, totalling 560 fits


In [11]:
# Pipeline refit with the best hyperparameter combination found.
grid_search.best_estimator_

In [12]:
# Hyperparameter values of the best-scoring candidate.
grid_search.best_params_

{'feature_selector__k': 11,
 'linearregressor__fit_intercept': True,
 'linearregressor__positive': True}

## Paso 5

Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".

In [13]:
import pickle
import gzip
import os

models_dir = '../files/models'
model_path = '../files/models/model.pkl.gz'

# Make sure the target folder exists before writing.
os.makedirs(models_dir, exist_ok=True)

# Persist the whole fitted GridSearchCV object as a gzip-compressed pickle.
with gzip.open(model_path, 'wb') as model_file:
    pickle.dump(grid_search, model_file)

## Paso 6

- Calcule las métricas r2, error cuadrático medio, y error absoluto medio para los conjuntos de entrenamiento y prueba.
- Guárdelas en el archivo files/output/metrics.json.
- Cada fila del archivo es un diccionario con las métricas de un modelo.
- Este diccionario tiene un campo para indicar si es el conjunto de entrenamiento o prueba. Por ejemplo:

{'type': 'metrics', 'dataset': 'train', 'r2': 0.8, 'mse': 0.7, 'mad': 0.9}
{'type': 'metrics', 'dataset': 'test', 'r2': 0.7, 'mse': 0.6, 'mad': 0.8}

In [14]:
def cargar_modelo_predecir(data):
    """Load the persisted model from disk and return its predictions.

    The estimator is unpickled from the gzip file
    ``../files/models/model.pkl.gz`` on every call, and its ``predict``
    method is applied to ``data``.

    Parameters
    ----------
    data : feature matrix accepted by the stored estimator's ``predict``.

    Returns
    -------
    Whatever ``estimator.predict(data)`` returns.
    """
    import gzip
    import pickle

    model_path = "../files/models/model.pkl.gz"

    # SECURITY: unpickling runs arbitrary code; only load files this
    # notebook produced itself.
    with gzip.open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    predictions = model.predict(data)
    return predictions

# Predictions for both splits; each call re-reads the pickled model from disk.
y_train_pred = cargar_modelo_predecir(X_train)
y_test_pred = cargar_modelo_predecir(X_test)

In [15]:
def escribir_metricas(dict_metricas, ruta_salida='../files/output/metrics.json'):
    """Append one metrics record to the metrics file as a JSON line.

    If the file already contains two or more lines it is deleted first, so
    a new train/test pair replaces the records of a previous run instead of
    piling up after them.

    Parameters
    ----------
    dict_metricas : dict
        JSON-serialisable record to write.
    ruta_salida : str, default '../files/output/metrics.json'
        Destination file; its parent directory is created when missing.

    Raises
    ------
    TypeError
        If ``dict_metricas`` contains values ``json.dumps`` cannot serialise.
    """
    # Local imports keep the function usable regardless of which notebook
    # cells have already been executed.
    import json
    import os

    output_dir = os.path.dirname(ruta_salida)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(ruta_salida):
        with open(ruta_salida, mode='r') as file:
            line_count = sum(1 for _ in file)
        # Delete only after the handle is closed: removing a file that is
        # still open fails on Windows.
        if line_count >= 2:
            os.remove(ruta_salida)

    with open(ruta_salida, mode='a') as file:
        # json.dumps emits real JSON (null/true/false, escaped quotes);
        # str(dict) with quote replacement does not.
        file.write(json.dumps(dict_metricas) + "\n")

In [16]:
from sklearn.metrics import (
    r2_score,mean_squared_error,mean_absolute_error,median_absolute_error
)
def eval_metrics(dataset, y_true, y_pred):
    """Compute regression metrics for one split, persist and return them.

    Parameters
    ----------
    dataset : str
        Label stored in the record's ``"dataset"`` field (e.g. ``'train'``).
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    dict
        Record with keys ``type``, ``dataset``, ``r2``, ``mse`` and ``mad``.
        The same record is appended to the metrics file through
        ``escribir_metricas``.
    """
    metrics = {
        "type": "metrics",
        "dataset": dataset,
        "r2": float(r2_score(y_true, y_pred)),
        "mse": float(mean_squared_error(y_true, y_pred)),
        # NOTE(review): "mad" is the *median* absolute error, while the task
        # text asks for the mean absolute error -- confirm which is expected.
        "mad": float(median_absolute_error(y_true, y_pred)),
    }

    escribir_metricas(metrics)
    # Return the record so callers that assign the result get the metrics
    # instead of None.
    return metrics

# Evaluate both splits; each call also appends one line to the metrics file.
metrics_train = eval_metrics('train', y_train, y_train_pred)
metrics_test = eval_metrics('test', y_test, y_test_pred)