In [1094]:
import pandas as pd
import numpy as np
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression,  f_classif
from sklearn.linear_model import LinearRegression,  LogisticRegression,  Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, median_absolute_error
from sklearn.preprocessing import StandardScaler
import gzip
import pickle
import json
import warnings

warnings.filterwarnings("ignore")

In [1095]:


test_data = pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip",
)

train_data = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip",
)

In [1096]:
def preprocess_data(dataset_path):
    dataset_path["Age"] = 2021 - dataset_path["Year"]
    dataset_path.drop(["Year", "Car_Name"], axis=1, inplace=True)
    return dataset_path


train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

In [1097]:
# Check data types
print("Train data types:")
print(train_data.dtypes)

print("Test data types:")
print(test_data.dtypes)

Train data types:
Selling_Price    float64
Present_Price    float64
Driven_kms         int64
Fuel_Type         object
Selling_type      object
Transmission      object
Owner              int64
Age                int64
dtype: object
Test data types:
Selling_Price    float64
Present_Price    float64
Driven_kms         int64
Fuel_Type         object
Selling_type      object
Transmission      object
Owner              int64
Age                int64
dtype: object


In [1098]:

# Paso 2: Dividir en conjuntos de entrenamiento y prueba
x_train = train_data.drop(columns=['Present_Price'])
y_train = train_data['Present_Price']

x_test = test_data.drop(columns=['Present_Price'])
y_test = test_data['Present_Price']

# Print column names to verify
print(x_train.columns)
print(y_train.name)

Index(['Selling_Price', 'Driven_kms', 'Fuel_Type', 'Selling_type',
       'Transmission', 'Owner', 'Age'],
      dtype='object')
Present_Price


In [1099]:


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop="if_binary", dtype=int), ['Fuel_Type', 'Selling_type', 'Transmission']),
        ('num', MinMaxScaler(), ['Selling_Price', 'Driven_kms', 'Owner', 'Age'])
    ],
    remainder=MinMaxScaler(),
)

pipeline = Pipeline([
    ('OneHotEncoder', preprocessor),
    ('SelectKBest', SelectKBest(score_func=f_classif)),
    ('LinearRegression', LinearRegression())
])


In [1100]:
from sklearn.metrics import make_scorer
np.random.seed(32)
# Paso 4: Optimización de hiperparámetros
param_grid = {
    'SelectKBest__k': range(1,12),
    #'LinearRegression__fit_intercept': [True, False],
    #'LinearRegression__positive': [False,True],
    #'LinearRegression__n_jobs': [-1, -2, 1] ,
    #'LinearRegression__copy_X':[False,True],
    #'LinearRegression__alpha': [0.1, 1.0, 10.0,0.05 ],
  
}

model = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring= 'neg_mean_absolute_error',  #'neg_mean_squared_error',
    n_jobs=-1
)

model.fit(x_train,y_train)

In [1101]:
# Mejor modelo y mejores hiperparámetros
print("Mejores parámetros:", model.best_params_)
print("Mejor puntuación:", model.best_score_)

Mejores parámetros: {'SelectKBest__k': 9}
Mejor puntuación: -1.7810227729017025


In [1102]:
print(x_train.columns)

Index(['Selling_Price', 'Driven_kms', 'Fuel_Type', 'Selling_type',
       'Transmission', 'Owner', 'Age'],
      dtype='object')


In [1103]:

"GridSearchCV" in str(type(model))

True

In [1104]:

str(type(model))

"<class 'sklearn.model_selection._search.GridSearchCV'>"

In [1105]:
print(model.estimator)

Pipeline(steps=[('OneHotEncoder',
                 ColumnTransformer(remainder=MinMaxScaler(),
                                   transformers=[('cat',
                                                  OneHotEncoder(drop='if_binary',
                                                                dtype=<class 'int'>),
                                                  ['Fuel_Type', 'Selling_type',
                                                   'Transmission']),
                                                 ('num', MinMaxScaler(),
                                                  ['Selling_Price',
                                                   'Driven_kms', 'Owner',
                                                   'Age'])])),
                ('SelectKBest', SelectKBest()),
                ('LinearRegression', LinearRegression())])


In [1106]:

str(type(model))

"<class 'sklearn.model_selection._search.GridSearchCV'>"

In [1107]:
"GridSearchCV" in str(type(model))

True

In [1108]:
print(model.estimator)

Pipeline(steps=[('OneHotEncoder',
                 ColumnTransformer(remainder=MinMaxScaler(),
                                   transformers=[('cat',
                                                  OneHotEncoder(drop='if_binary',
                                                                dtype=<class 'int'>),
                                                  ['Fuel_Type', 'Selling_type',
                                                   'Transmission']),
                                                 ('num', MinMaxScaler(),
                                                  ['Selling_Price',
                                                   'Driven_kms', 'Owner',
                                                   'Age'])])),
                ('SelectKBest', SelectKBest()),
                ('LinearRegression', LinearRegression())])


In [1109]:

print(model.score(x_train, y_train))# < -1.590
print(model.score(x_test, y_test))#< -2.429

model.score(x_train, y_train) < -1.590
model.score(x_test, y_test)< -2.429

-1.6214879816158234
-2.4736199144163633


True

In [1110]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.

import gzip
import shutil
import pickle
import os

models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)
# Ruta de los archivos
source_file = "../files/models/model.pkl"  # Archivo original
compressed_file = "../files/models/model.pkl.gz"  # Archivo comprimido

# Guardar el modelo en un archivo .pkl
with open(source_file, "wb") as file:
    pickle.dump(model, file)

# Comprimir el archivo .pkl
with gzip.open(compressed_file, 'wb') as file:
        pickle.dump(model, file)

print(f"Archivo comprimido guardado en: {compressed_file}")

Archivo comprimido guardado en: ../files/models/model.pkl.gz


In [1111]:

# Paso 5: Cálculo de métricas
#metrics = {}
def calculate_metrics(model,X_train, y_train, X_test, y_test):
    # Hacer predicciones para entrenamiento y prueba
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
 
    # Calcular métricas para el conjunto de entrenamiento
    metrics_train = {
        'type': 'metrics',
        'dataset': 'train',
        'r2': float(r2_score(y_train, y_train_pred)),
        'mse': float(mean_squared_error(y_train, y_train_pred)),
        'mad': float(median_absolute_error(y_train, y_train_pred))
    }

    # Calcular métricas para el conjunto de prueba
    metrics_test = {
        'type': 'metrics',
        'dataset': 'test',
        'r2': float(r2_score(y_test, y_test_pred)),
        'mse': float(mean_squared_error(y_test, y_test_pred)),
        'mad': float(median_absolute_error(y_test, y_test_pred))
    }
    # Crear carpeta si no existe
    output_dir = '../files/output'
    os.makedirs(output_dir, exist_ok=True)

    # Guardar las métricas en un archivo JSON
    output_path = os.path.join(output_dir, 'metrics.json')
    with open(output_path, 'w') as f:
        f.write(json.dumps(metrics_train) + '\n')  # Línea 1: métricas de entrenamiento
        f.write(json.dumps(metrics_test) + '\n')  # Línea 2: métricas de prueba




calculate_metrics(model, x_train,  y_train, x_test, y_test)




