# Paso 1: Limpieza de los datos

## Cargar dataset

In [106]:
import pandas as pd

# Read the train and test splits directly from the zipped CSV files.
train_data = pd.read_csv("../files/input/train_data.csv.zip")
test_data = pd.read_csv("../files/input/test_data.csv.zip")

## Procesamiento de la data

### Crear columna 'Age'

In [107]:
# Age is computed against the fixed reference year 2021 (not the current date).
reference_year = 2021
train_data["Age"] = reference_year - train_data["Year"]
test_data["Age"] = reference_year - test_data["Year"]

### Remover columnas

In [108]:
# 'Year' is superseded by 'Age', and 'Car_Name' is not among the columns fed
# to the model. Rebinding the frames (instead of inplace=True) together with
# errors="ignore" keeps this cell re-runnable: a second execution would
# otherwise raise KeyError because the columns are already gone.
columns_to_drop = ['Year', 'Car_Name']
train_data = train_data.drop(columns=columns_to_drop, errors="ignore")
test_data = test_data.drop(columns=columns_to_drop, errors="ignore")

# Paso 2: División del dataset

In [109]:
# Separate the regression target from the feature columns in both splits.
target_column = "Present_Price"

x_train, y_train = train_data.drop(columns=target_column), train_data[target_column]
x_test, y_test = test_data.drop(columns=target_column), test_data[target_column]

# Paso 3: Creación del pipeline

- Transforma las variables categóricas usando el método one-hot-encoding.
- Escala la matriz de entrada al intervalo [0, 1].
- Selecciona las K columnas más relevantes de la matriz de entrada.
- Ajusta un modelo de regresión lineal.

### Transformadores

In [110]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Stand-alone transformer instances. Note that the ColumnTransformer defined
# later in this notebook constructs its own OneHotEncoder/MinMaxScaler objects
# rather than reusing these two.
oneHotEncoder, scaler = OneHotEncoder(), MinMaxScaler()

### ColumnTransformer

In [111]:
from sklearn.compose import ColumnTransformer

categorical_columns = ['Fuel_Type', 'Selling_type', 'Transmission']
numerical_columns = ['Selling_Price', 'Driven_kms', 'Owner', 'Age']

# One-hot encode the categorical columns and min-max scale the numeric ones.
# Columns that are not listed are dropped (ColumnTransformer's default
# remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that was absent when the encoder
        # was fitted (for example a rare level missing from a cross-validation
        # training fold, or a new level in the test data) is encoded as all
        # zeros instead of raising an error at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ("scaler", MinMaxScaler(), numerical_columns),
    ],
)

### Seleccionar K características

In [112]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Univariate feature selection scored with the F-statistic for regression;
# k is left at its default here and tuned through the hyperparameter grid.
selectKBest = SelectKBest(score_func=f_regression)

### Modelo de Regresión lineal

In [113]:
from sklearn.linear_model import LinearRegression

# Ordinary linear regression; fit_intercept and positive are tuned later
# through the hyperparameter grid.
lr = LinearRegression()

## Pipeline

In [114]:
from sklearn.pipeline import Pipeline

# Chain preprocessing, feature selection and the regressor. The step names
# are the prefixes used by the hyperparameter grid keys ("kSelect__k", "lr__...").
estimators = [("preprocessor", preprocessor), ("kSelect", selectKBest), ("lr", lr)]

pipeline = Pipeline(steps=estimators, verbose=False)

# Paso 4: Optimizar hiperparámetros

- Optimice los hiperparámetros del pipeline usando validación cruzada.
- Use 10 splits para la validación cruzada.
- Use el error medio absoluto para medir el desempeño del modelo.

## GridSearchCV

In [115]:
# Search space: number of selected features (1 through 14) and the two
# boolean options of the linear regression step.
param_grid = dict(
    kSelect__k=range(1, 15),
    lr__fit_intercept=[True, False],
    lr__positive=[True, False],
)

In [116]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid with 10-fold cross-validation, scored by
# negated mean absolute error and parallelised over all available cores.
gridSearchCV = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    n_jobs=-1,
)

# Kept as the last expression so the notebook displays the fitted search.
gridSearchCV.fit(x_train, y_train)

In [117]:
# Display the 'lr' (LinearRegression) step of the best pipeline found by the search.
gridSearchCV.best_estimator_.named_steps['lr']

In [118]:
# Mean cross-validated score of the best parameter combination. The scorer is
# the negated mean absolute error, so values closer to zero are better.
gridSearchCV.best_score_

np.float64(-1.765768769272933)

In [119]:
# Hyperparameter combination that achieved the best cross-validated score.
gridSearchCV.best_params_

{'kSelect__k': 11, 'lr__fit_intercept': True, 'lr__positive': True}

In [120]:
# GridSearchCV.score evaluates the search's configured scorer (negated mean
# absolute error) on the given data.
train_score = gridSearchCV.score(x_train, y_train)
test_score = gridSearchCV.score(x_test, y_test)

# The first label previously read "conjunto de prueba" (test set) although it
# reports the training score; it now names the training set.
print(f'Score en el conjunto de entrenamiento: {train_score:.4f}')
print(f'Score en el conjunto de testing: {test_score:.4f}')

Score en el conjunto de prueba: -1.6215
Score en el conjunto de testing: -2.4736


## Paso 4.5: Evaluar métricas

In [121]:
# Reference thresholds, indexed as [train, test] by the validity check that
# follows. Presumably supplied by the assignment - TODO confirm their origin.
SCORES = [-1.590, -2.429]

In [122]:
# NOTE(review): the check passes when the score is *below* the threshold, i.e.
# when the negated MAE is more negative. Presumably this mirrors the
# assignment's grader - confirm before changing the direction.
print("Válido TRAIN: ", SCORES[0] > train_score)
print("Válido TEST: ", SCORES[1] > test_score)

Válido TRAIN:  True
Válido TEST:  True


# Paso 5: Salvar el modelo

Salve el modelo como "files/models/model.pkl.gz".

In [123]:
# Alias for the fitted grid search; this is the object that is pickled and
# used to compute the metrics in the following cells.
model = gridSearchCV

In [124]:
import gzip
import os
import pickle

# Save the fitted model as a gzip-compressed pickle.
model_filename = '../files/models/model.pkl.gz'

# The target directory may not exist on a fresh checkout, in which case
# gzip.open would fail with FileNotFoundError; create it first.
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

with gzip.open(model_filename, 'wb') as f:
    pickle.dump(model, f)

# Paso 6: Cálculo de métricas

In [125]:
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
import json

def calculate_metrics(model, X, y, dataset_name):
    """Score ``model`` on one dataset and return the results as a record.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X
        Feature matrix passed to ``model.predict``.
    y
        True target values compared against the predictions.
    dataset_name
        Label stored under the ``'dataset'`` key (e.g. ``'train'`` or ``'test'``).

    Returns
    -------
    dict
        ``{'type': 'metrics', 'dataset': ..., 'r2': ..., 'mse': ..., 'mad': ...}``
        where ``r2`` is the R^2 score, ``mse`` the mean squared error and
        ``mad`` the median absolute error, each converted to a plain ``float``.
    """
    predictions = model.predict(X)

    # Plain floats keep the record JSON-serializable.
    return {
        'type': 'metrics',
        'dataset': dataset_name,
        'r2': float(r2_score(y, predictions)),
        'mse': float(mean_squared_error(y, predictions)),
        'mad': float(median_absolute_error(y, predictions)),
    }


In [126]:
# Compute the metrics record for the training split and for the test split.
train_metrics, test_metrics = (
    calculate_metrics(model, features, targets, split_name)
    for features, targets, split_name in (
        (x_train, y_train, 'train'),
        (x_test, y_test, 'test'),
    )
)

metrics = [train_metrics, test_metrics]

In [127]:
import json

# Step 6: save the metrics to a JSON-lines file.
def save_metrics(metrics, filename='../files/output/metrics.json', append=False):
    """Write each metrics record as one JSON object per line.

    Parameters
    ----------
    metrics : iterable of dict
        JSON-serializable records, written in iteration order.
    filename : str or path-like
        Destination file. Missing parent directories are created.
    append : bool, default False
        When False the file is overwritten, so re-running the notebook does
        not accumulate duplicate records. Pass True to add the records to the
        end of an existing file instead.
    """
    import os  # local import keeps this cell self-contained

    directory = os.path.dirname(filename)
    if directory:
        # The output directory may not exist on a fresh checkout.
        os.makedirs(directory, exist_ok=True)

    mode = 'a' if append else 'w'
    with open(filename, mode, encoding='utf-8') as f:
        for metric in metrics:
            json.dump(metric, f)
            f.write('\n')

# Save the computed metrics to the default output file.
save_metrics(metrics)