In [89]:
import pandas as pd
import numpy as np
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import gzip
import pickle
import json

In [90]:



test_data = pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip",
)

train_data = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip",
)

In [91]:
test_data

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
1,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
2,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
3,ciaz,2015,6.75,8.12,18796,Petrol,Dealer,Manual,0
4,s cross,2015,6.50,8.61,33429,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
85,city,2015,9.70,13.60,21780,Petrol,Dealer,Manual,0
86,city,2014,6.25,13.60,40126,Petrol,Dealer,Manual,0
87,city,2006,2.10,7.60,50456,Petrol,Dealer,Manual,0
88,jazz,2016,6.40,8.40,12000,Petrol,Dealer,Manual,0


In [92]:
def preprocess_data(dataset_path):
    #df = pd.read_csv(dataset_path)
    dataset_path['Age'] = 2021 - dataset_path['Year']
    dataset_path.drop(columns=['Year', 'Car_Name'], inplace=True)
    return dataset_path


train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

In [93]:
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [94]:
train_data.dtypes

Selling_Price    float64
Present_Price    float64
Driven_kms         int64
Fuel_Type         object
Selling_type      object
Transmission      object
Owner              int64
Age                int64
dtype: object

In [95]:

# Paso 2: Dividir en conjuntos de entrenamiento y prueba
x_train = train_data.drop(columns=['Selling_Price'])
y_train = train_data['Selling_Price']

x_test = test_data.drop(columns=['Selling_Price'])
y_test = test_data['Selling_Price']

In [96]:
x_train

Unnamed: 0,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,8.500,15059,Petrol,Dealer,Automatic,0,5
1,4.600,30000,Petrol,Dealer,Manual,0,8
2,0.826,6000,Petrol,Individual,Manual,0,10
3,4.430,15000,Petrol,Dealer,Manual,0,5
4,1.500,15000,Petrol,Individual,Manual,0,8
...,...,...,...,...,...,...,...
206,4.430,57000,Petrol,Dealer,Manual,0,10
207,6.800,36000,Petrol,Dealer,Manual,0,6
208,0.750,92233,Petrol,Individual,Manual,0,15
209,7.600,77632,Diesel,Dealer,Manual,0,7


In [97]:
categorical_features = ['Fuel_Type','Selling_type','Transmission' ]
numerical_features = x_train.select_dtypes(exclude=['object']).columns

categorical_transformer = OneHotEncoder(  drop='first', sparse_output=False )
numerical_transformer = MinMaxScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        #('num', numerical_transformer, numerical_features)
    ],
     remainder='passthrough'
)
pipeline = Pipeline([
    ('OneHotEncoder', preprocessor),
    ('SelectKBest', SelectKBest(score_func=f_regression)),
    ('MinMaxScaler', MinMaxScaler( copy=True, clip=False, feature_range=(0, 1))),
    ('LinearRegression', LinearRegression())
])


In [98]:
# Paso 4: Optimización de hiperparámetros
param_grid = {
     'SelectKBest__k': [1, 2, 3, 5],
    'LinearRegression__fit_intercept': [True, False],
    'LinearRegression__positive': [True, False],
    'LinearRegression__copy_X': [True, False] 
}

model = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring='r2',
    n_jobs=-1
)

model.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [99]:
# Mejor modelo y mejores hiperparámetros
print("Mejores parámetros:", model.best_params_)
print("Mejor puntuación:", model.best_score_)

Mejores parámetros: {'LinearRegression__copy_X': True, 'LinearRegression__fit_intercept': True, 'LinearRegression__positive': False, 'SelectKBest__k': 3}
Mejor puntuación: 0.7780728739676521


In [100]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.

import gzip
import shutil
import pickle
import os

models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)
# Ruta de los archivos
source_file = "../files/models/model.pkl"  # Archivo original
compressed_file = "../files/models/model.pkl.gz"  # Archivo comprimido

# Guardar el modelo en un archivo .pkl
with open(source_file, "wb") as file:
    pickle.dump(model, file)

# Comprimir el archivo .pkl
with open(source_file, 'rb') as f_in:
    with gzip.open(compressed_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

print(f"Archivo comprimido guardado en: {compressed_file}")

Archivo comprimido guardado en: ../files/models/model.pkl.gz


In [101]:

# Paso 5: Cálculo de métricas
metrics = []
def calculate_metrics(model, x, y, dataset_type):
    y_pred = model.predict(x)
    metrics.append({
        'type': 'metrics',
        'dataset': dataset_type,
        'r2': r2_score(y, y_pred),
        'mse': mean_squared_error(y, y_pred),
        'mad': mean_absolute_error(y, y_pred)
    })

calculate_metrics(model.best_estimator_, x_train, y_train, 'train')
calculate_metrics(model.best_estimator_, x_test, y_test, 'test')

# Crear carpeta si no existe
output_dir = '../files/output'
os.makedirs(output_dir, exist_ok=True)

# Guardar las métricas en un archivo JSON
output_path = os.path.join(output_dir, 'metrics.json')
with open(output_path, 'w') as f:  # Usar 'w' para comenzar con un archivo limpio
    json.dump(metrics, f, indent=4)

print("Pipeline completado y métricas calculadas.")
print(metrics)




Pipeline completado y métricas calculadas.
[{'type': 'metrics', 'dataset': 'train', 'r2': 0.8068523203739558, 'mse': 4.464782144217011, 'mad': 1.2908075493694824}, {'type': 'metrics', 'dataset': 'test', 'r2': 0.782748356099224, 'mse': 6.9335856813515715, 'mad': 1.4321370720360018}]
