In [79]:
import pandas as pd

train = pd.read_csv("../files/input/train_data.csv.zip", index_col = False, compression = "zip")
test = pd.read_csv("../files/input/test_data.csv.zip", index_col = False, compression = "zip")

In [80]:
train = train.copy()
test = test.copy()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       211 non-null    object 
 1   Year           211 non-null    int64  
 2   Selling_Price  211 non-null    float64
 3   Present_Price  211 non-null    float64
 4   Driven_kms     211 non-null    int64  
 5   Fuel_Type      211 non-null    object 
 6   Selling_type   211 non-null    object 
 7   Transmission   211 non-null    object 
 8   Owner          211 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 15.0+ KB


In [81]:
# En este dataset se desea pronosticar el precio de vhiculos usados. El dataset
# original contiene las siguientes columnas:
#
# - Car_Name: Nombre del vehiculo.
# - Year: Año de fabricación.
# - Selling_Price: Precio de venta.
# - Present_Price: Precio actual.
# - Driven_Kms: Kilometraje recorrido.
# - Fuel_type: Tipo de combustible.
# - Selling_Type: Tipo de vendedor.
# - Transmission: Tipo de transmisión.
# - Owner: Número de propietarios.

In [82]:
# Paso 1.
# Preprocese los datos.
# - Cree la columna 'Age' a partir de la columna 'Year'.
# Año de referencia
reference_year = 2021
train["Age"] = reference_year - train["Year"]
test["Age"] = reference_year - test["Year"]

# - Elimine las columnas 'Year' y 'Car_Name'.
train = train.drop(columns = ["Year", "Car_Name"])
test = test.drop(columns = ["Year", "Car_Name"])

In [83]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.

x_train = train.drop(columns=['Selling_Price'])
y_train = train['Selling_Price']

x_test = test.drop(columns=['Selling_Price'])
y_test = test['Selling_Price']

In [84]:
import pandas as pd
x_train = pd.DataFrame(x_train, columns=['Fuel_Type', 'Selling_type', 'Transmission', 'Present_Price', 'Driven_kms', 'Owner', 'Age'])
x_test = pd.DataFrame(x_test, columns=['Fuel_Type', 'Selling_type', 'Transmission', 'Present_Price', 'Driven_kms', 'Owner', 'Age'])



In [85]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score, mean_absolute_error
# importar minmaxscaler
from sklearn.preprocessing import MinMaxScaler

# Columnas categóricas y numéricas
categorical_columns = ['Fuel_Type', 'Selling_type', 'Transmission']
numeric_columns = ['Present_Price', 'Driven_kms', 'Owner', 'Age']

# Preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)
    ],
    remainder='drop'
)

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(f_regression)),
    ('regression', LinearRegression())
])

# Configuración de parámetros para GridSearchCV
param_grid = {
    'feature_selection__k': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

# Validación cruzada estratificada
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Métrica de evaluación (usaremos balanced_accuracy_score como ejemplo)
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=10,
    scoring=scorer,  # Métrica para regresión
    verbose=3
)

# Entrenamiento del modelo con GridSearchCV
grid_search.fit(x_train, y_train)

# Resultados del mejor modelo
best_pipeline = grid_search.best_estimator_
best_params = grid_search.best_params_
y_pred = best_pipeline.predict(x_test)

# Evaluación final
mae = mean_absolute_error(y_test, y_pred)
best_accuracy = best_pipeline.score(x_test, y_test)

print("Mejores parámetros:", best_params)
print("Accuracy del mejor modelo:", best_accuracy)
print("Error Absoluto Medio (MAE):", mae)



Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV 1/10] END ..........feature_selection__k=1;, score=-0.899 total time=   0.0s
[CV 2/10] END ..........feature_selection__k=1;, score=-2.023 total time=   0.0s
[CV 3/10] END ..........feature_selection__k=1;, score=-0.829 total time=   0.0s
[CV 4/10] END ..........feature_selection__k=1;, score=-1.686 total time=   0.0s
[CV 5/10] END ..........feature_selection__k=1;, score=-1.097 total time=   0.0s
[CV 6/10] END ..........feature_selection__k=1;, score=-1.229 total time=   0.0s
[CV 7/10] END ..........feature_selection__k=1;, score=-1.591 total time=   0.0s
[CV 8/10] END ..........feature_selection__k=1;, score=-0.781 total time=   0.0s
[CV 9/10] END ..........feature_selection__k=1;, score=-2.322 total time=   0.0s
[CV 10/10] END .........feature_selection__k=1;, score=-0.834 total time=   0.0s
[CV 1/10] END ..........feature_selection__k=2;, score=-0.870 total time=   0.0s
[CV 2/10] END ..........feature_selection__k=2

In [86]:
'''# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Escala las variables numéricas al intervalo [0, 1].
# - Selecciona las K mejores entradas.
# - Ajusta un modelo de regresion lineal.

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_selector

# Columnas categóricas
categorical_columns = ['Fuel_Type', 'Selling_type', 'Transmission']
# Columnas numéricas
numeric_columns = ['Present_Price', 'Driven_kms', 'Owner', 'Age']

# Transformador de columnas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ], 
    remainder='drop'
)

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_regression, k=10)),
    ('regression', LinearRegression())
])'''



"# Paso 3.\n# Cree un pipeline para el modelo de clasificación. Este pipeline debe\n# contener las siguientes capas:\n# - Transforma las variables categoricas usando el método\n#   one-hot-encoding.\n# - Escala las variables numéricas al intervalo [0, 1].\n# - Selecciona las K mejores entradas.\n# - Ajusta un modelo de regresion lineal.\n\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.preprocessing import StandardScaler, OneHotEncoder\nfrom sklearn.feature_selection import SelectKBest, f_regression\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.compose import make_column_selector\n\n# Columnas categóricas\ncategorical_columns = ['Fuel_Type', 'Selling_type', 'Transmission']\n# Columnas numéricas\nnumeric_columns = ['Present_Price', 'Driven_kms', 'Owner', 'Age']\n\n# Transformador de columnas\npreprocessor = ColumnTransformer(\n    transformers=[\n        ('num', StandardScaler(), numeric_columns),\n        ('cat', On

In [87]:
'''# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use el error medio absoluto
# para medir el desempeño modelo.

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

# Validación de columnas
print("Columnas de x_train:", x_train.columns)

# Parámetros a optimizar
param_grid = {
    'feature_selection__k': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

# Optimización de hiperparámetros
grid_search = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=10, 
    scoring='neg_mean_absolute_error',  # Métrica basada en el error absoluto
    n_jobs=-1, 
    verbose=3, 
    refit=True
)
grid_search.fit(x_train, y_train)

# Mejor modelo y evaluación en el conjunto de prueba
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(x_test)

# Métricas
mae = mean_absolute_error(y_test, y_pred)
print("Mejor MAE (Error Absoluto Medio):", mae)
print("Mejores parámetros:", grid_search.best_params_)'''


'# Paso 4.\n# Optimice los hiperparametros del pipeline usando validación cruzada.\n# Use 10 splits para la validación cruzada. Use el error medio absoluto\n# para medir el desempeño modelo.\n\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import mean_absolute_error\n\n# Validación de columnas\nprint("Columnas de x_train:", x_train.columns)\n\n# Parámetros a optimizar\nparam_grid = {\n    \'feature_selection__k\': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n}\n\n# Optimización de hiperparámetros\ngrid_search = GridSearchCV(\n    pipeline, \n    param_grid, \n    cv=10, \n    scoring=\'neg_mean_absolute_error\',  # Métrica basada en el error absoluto\n    n_jobs=-1, \n    verbose=3, \n    refit=True\n)\ngrid_search.fit(x_train, y_train)\n\n# Mejor modelo y evaluación en el conjunto de prueba\nbest_pipeline = grid_search.best_estimator_\ny_pred = best_pipeline.predict(x_test)\n\n# Métricas\nmae = mean_absolute_error(y_test, y_pred)\nprint("Mejor MAE (Error Absoluto Medio):"

In [88]:
import gzip
import pickle
import os
grid_search.fit(x_train, y_train)
models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)
output_path = "../files/models/model.pkl.gz"
with gzip.open(output_path, 'wb') as file:
    pickle.dump(grid_search, file)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV 1/10] END ..........feature_selection__k=1;, score=-0.899 total time=   0.0s
[CV 2/10] END ..........feature_selection__k=1;, score=-2.023 total time=   0.0s
[CV 3/10] END ..........feature_selection__k=1;, score=-0.829 total time=   0.0s
[CV 4/10] END ..........feature_selection__k=1;, score=-1.686 total time=   0.0s
[CV 5/10] END ..........feature_selection__k=1;, score=-1.097 total time=   0.0s
[CV 6/10] END ..........feature_selection__k=1;, score=-1.229 total time=   0.0s
[CV 7/10] END ..........feature_selection__k=1;, score=-1.591 total time=   0.0s
[CV 8/10] END ..........feature_selection__k=1;, score=-0.781 total time=   0.0s
[CV 9/10] END ..........feature_selection__k=1;, score=-2.322 total time=   0.0s
[CV 10/10] END .........feature_selection__k=1;, score=-0.834 total time=   0.0s
[CV 1/10] END ..........feature_selection__k=2;, score=-0.870 total time=   0.0s
[CV 2/10] END ..........feature_selection__k=2

In [89]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Realizar las predicciones sobre los conjuntos de entrenamiento y prueba

y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Calcular métricas para el conjunto de entrenamiento
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)


# Calcular métricas para el conjunto de prueba
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

# Resultados
print("Métricas para el conjunto de entrenamiento:")
print(f"R^2: {r2_train:.4f}")
print(f"Error Cuadrático Medio (MSE): {mse_train:.4f}")
print(f"Error Absoluto Medio (MAE): {mae_train:.4f}")

print("\nMétricas para el conjunto de prueba:")
print(f"R^2: {r2_test:.4f}")
print(f"Error Cuadrático Medio (MSE): {mse_test:.4f}")
print(f"Error Absoluto Medio (MAE): {mae_test:.4f}")

Métricas para el conjunto de entrenamiento:
R^2: 0.8977
Error Cuadrático Medio (MSE): 2.3640
Error Absoluto Medio (MAE): 1.0504

Métricas para el conjunto de prueba:
R^2: 0.7898
Error Cuadrático Medio (MSE): 6.7090
Error Absoluto Medio (MAE): 1.4542


In [90]:
import json

metrics = [
    {
        "type": "metrics",
        'dataset': 'train',
        'r2': r2_train,
        'mse': mse_train,
        'mad': mae_train,
    },
    {
        "type": "metrics",
        'dataset': 'test',
        'r2': r2_test,
        'mse': mse_test,
        'mad': mae_test,
    }
]

output_dir = "../files/output"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir,"metrics.json")

with open(output_path,"w") as file:
        file.write(json.dumps(metrics) + '\n')

In [91]:
# Verificar los componentes del pipeline
print("Pasos del pipeline en el modelo cargado:")
if hasattr(model, 'estimator'):
    for step_name, step_component in model.estimator.steps:
        print(f"{step_name}: {step_component}")
else:
    print("El modelo no tiene un atributo 'estimator'.")



Pasos del pipeline en el modelo cargado:
preprocessor: ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['Present_Price', 'Driven_kms', 'Owner',
                                  'Age']),
                                ('cat', OneHotEncoder(),
                                 ['Fuel_Type', 'Selling_type',
                                  'Transmission'])])
feature_selection: SelectKBest(score_func=<function f_regression at 0x000002AC44160160>)
regression: LinearRegression()
