In [32]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
import gzip
import pickle
import json
import os
import zipfile
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [33]:
# Load the train/test CSV files and apply the same clean-up to both.
REFERENCE_YEAR = 2021  # year against which the vehicle age is computed


def _load_vehicle_data(csv_path):
    """Read one vehicle CSV and return the prepared DataFrame.

    The returned frame has an ``Age`` column (REFERENCE_YEAR - Year) appended,
    the ``Car_Name`` and ``Year`` columns removed, and every row containing a
    missing value discarded.
    """
    raw = pd.read_csv(csv_path)
    with_age = raw.assign(Age=REFERENCE_YEAR - raw['Year'])
    return with_age.drop(columns=['Car_Name', 'Year']).dropna()


train_df = _load_vehicle_data('../files/input/train_data.csv/train_data.csv')
test_df = _load_vehicle_data('../files/input/test_data.csv/test_data.csv')
train_df.head()

Unnamed: 0,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,7.4,8.5,15059,Petrol,Dealer,Automatic,0,5
1,4.0,4.6,30000,Petrol,Dealer,Manual,0,8
2,0.5,0.826,6000,Petrol,Individual,Manual,0,10
3,3.15,4.43,15000,Petrol,Dealer,Manual,0,5
4,1.25,1.5,15000,Petrol,Individual,Manual,0,8


In [34]:
# Print column dtypes and non-null counts of the cleaned training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Selling_Price  211 non-null    float64
 1   Present_Price  211 non-null    float64
 2   Driven_kms     211 non-null    int64  
 3   Fuel_Type      211 non-null    object 
 4   Selling_type   211 non-null    object 
 5   Transmission   211 non-null    object 
 6   Owner          211 non-null    int64  
 7   Age            211 non-null    int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 13.3+ KB


In [35]:
# Separate the predictors (X) from the regression target (y) for both splits.
target_col = 'Present_Price'
x_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
x_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

In [36]:
# Categorical predictors are one-hot encoded; every remaining predictor is
# treated as numeric and passed through MinMaxScaler.
categorical_cols = ['Fuel_Type', 'Selling_type', 'Transmission']
# Keep the DataFrame's own column order. A set difference would produce an
# arbitrary order that can change between kernel sessions, making the
# transformed feature layout non-reproducible.
numeric_cols = [col for col in x_train.columns if col not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that was not present when the
        # encoder was fitted (e.g. a rare level that only shows up in a
        # validation fold or in the test set) is encoded as all zeros instead
        # of raising an error at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", MinMaxScaler(), numeric_cols),
    ],
    remainder="passthrough",
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        # The step name is what grid-search parameter keys such as
        # `feature_selection__k` refer to; k='all' keeps every feature unless
        # that parameter is overridden.
        ('feature_selection', SelectKBest(f_regression, k='all')),
        ("model", LinearRegression()),
    ]
)

In [44]:
# Hyper-parameter candidates for the search; each key has the form
# "<pipeline step name>__<parameter name>".
param_grid = dict(
    feature_selection__k=[5, 10, 15],
    model__fit_intercept=[True, False],
)

# Cross-validated grid search over the pipeline, scored by negative mean
# absolute error and refitted on the full training data afterwards.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(x_train, y_train)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


ValueError: Invalid parameter 'feature_selection' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat', OneHotEncoder(),
                                                  ['Fuel_Type', 'Selling_type',
                                                   'Transmission']),
                                                 ('num', MinMaxScaler(),
                                                  ['Driven_kms', 'Age', 'Owner',
                                                   'Selling_Price'])])),
                ('k_best',
                 SelectKBest(k='all',
                             score_func=<function f_regression at 0x000001C01DF051C0>)),
                ('model', LinearRegression())]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].

In [38]:
# NOTE: despite its name, best_model is bound to the whole GridSearchCV object
# (not to the selected pipeline); this is the object that later cells use for
# prediction and persist to disk.
best_model = grid_search
print(best_model)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OneHotEncoder(),
                                                                         ['Fuel_Type',
                                                                          'Selling_type',
                                                                          'Transmission']),
                                                                        ('num',
                                                                         MinMaxScaler(),
                                                                         ['Driven_kms',
                                                                          'Age',
                                                                

In [39]:
# Save the fitted search object as a gzip-compressed pickle.
model_path = '../files/models/model.pkl.gz'
# gzip.open does not create missing parent folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with gzip.open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

In [41]:
def _regression_metrics(dataset, y_true, y_pred):
    """Build the metrics record for one dataset split.

    Parameters
    ----------
    dataset : str
        Label stored under the ``'dataset'`` key (e.g. ``'train'``).
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Values predicted by the model for the same rows.

    Returns
    -------
    dict
        Record with the keys ``type``, ``dataset``, ``r2`` (coefficient of
        determination), ``mse`` (mean squared error) and ``mad`` (computed
        with ``mean_absolute_error``).
    """
    return {
        'type': 'metrics',
        'dataset': dataset,
        'r2': r2_score(y_true, y_pred),
        'mse': mean_squared_error(y_true, y_pred),
        'mad': mean_absolute_error(y_true, y_pred),
    }


# Predict on both the training and the test split.
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

train_metrics = _regression_metrics('train', y_train, y_train_pred)
test_metrics = _regression_metrics('test', y_test, y_test_pred)

# Write one JSON object per line (train first, then test).
metrics_path = "../files/output/metrics.json"
# open() does not create missing parent folders, so make sure the output
# directory exists before writing.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, "w") as f:
    json.dump(train_metrics, f)
    f.write('\n')
    json.dump(test_metrics, f)