In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    precision_score, recall_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay
)

In [2]:
import pandas as pd

train_data = pd.read_csv('../files/input/train_data.csv.zip',compression='zip')
test_data = pd.read_csv('../files/input/test_data.csv.zip',compression='zip')

print(train_data.head())



                    Car_Name  Year  Selling_Price  Present_Price  Driven_kms  \
0                       jazz  2016           7.40          8.500       15059   
1                        i10  2013           4.00          4.600       30000   
2         TVS Apache RTR 180  2011           0.50          0.826        6000   
3                        eon  2016           3.15          4.430       15000   
4  Royal Enfield Thunder 350  2013           1.25          1.500       15000   

  Fuel_Type Selling_type Transmission  Owner  
0    Petrol       Dealer    Automatic      0  
1    Petrol       Dealer       Manual      0  
2    Petrol   Individual       Manual      0  
3    Petrol       Dealer       Manual      0  
4    Petrol   Individual       Manual      0  


In [3]:
# Paso 1.
# Preprocese los datos.
# - Cree la columna 'Age' a partir de la columna 'Year'.
#   Asuma que el año actual es 2021.
# - Elimine las columnas 'Year' y 'Car_Name'.

train_data['Age'] = 2021 - train_data['Year']
test_data['Age'] = 2021 - test_data['Year']

train_data = train_data.drop(columns=['Year', 'Car_Name'])
test_data = test_data.drop(columns=['Year', 'Car_Name'])


In [4]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.

x_train = train_data.drop(columns=['Present_Price'])
y_train = train_data['Present_Price']

x_test = test_data.drop(columns=['Present_Price'])
y_test = test_data['Present_Price']

In [5]:
x_train.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Selling_Price  211 non-null    float64
 1   Driven_kms     211 non-null    int64  
 2   Fuel_Type      211 non-null    object 
 3   Selling_type   211 non-null    object 
 4   Transmission   211 non-null    object 
 5   Owner          211 non-null    int64  
 6   Age            211 non-null    int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 11.7+ KB


In [6]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Escala las variables numéricas al intervalo [0, 1].
# - Selecciona las K mejores entradas.
# - Ajusta un modelo de regresion lineal.

# Identificar variables categóricas y numéricas

categorical_features = ["Fuel_Type", "Selling_type", "Transmission"]
numerical_features = [c for c in x_train.columns if c not in categorical_features]



In [7]:
categorical_features
numerical_features

['Selling_Price', 'Driven_kms', 'Owner', 'Age']

In [8]:
preprocessor = ColumnTransformer(
    transformers = [
        ("encoder", OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ("scaler", MinMaxScaler(), numerical_features),
    ],
)


In [9]:
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression


pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("select_kbest", SelectKBest(f_regression)),
    ("regression", LinearRegression())
])

In [10]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use el error medio absoluto
# para medir el desempeño modelo.

from sklearn.model_selection import StratifiedKFold


param_grid = {
    "select_kbest__k": [5, 10, 15, 20, 'all'],
    "regression__fit_intercept": [True, False],
    "regression__positive": [True, False]

}

grid_search = GridSearchCV(
    pipeline, param_grid, cv = 10, scoring = "neg_mean_absolute_error", verbose = 1, n_jobs = -1
)

In [11]:
grid_search.fit(x_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits




0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'regression__fit_intercept': [True, False], 'regression__positive': [True, False], 'select_kbest__k': [5, 10, ...]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,10
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('encoder', ...), ('scaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function f_r...00230ED227E20>
,k,15

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,True


In [12]:
# Mejor modelo
print(f'Mejores Hiperparámetros: {grid_search.best_params_}')
best_model = grid_search.best_estimator_

Mejores Hiperparámetros: {'regression__fit_intercept': True, 'regression__positive': True, 'select_kbest__k': 15}


In [13]:
import gzip
import pickle
import os
import json



In [14]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.

os.makedirs("../files/models", exist_ok=True)
with gzip.open('../files/models/model.pkl.gz', 'wb') as f:
    pickle.dump(grid_search, f)

In [15]:
# Predicciones
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

In [16]:
# Paso 6.
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
# {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}

# Métricas
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error


os.makedirs("../files/output", exist_ok=True)

metrics_list = [
    {
        "type":"metrics",
        "dataset": "train",
        "r2":float(r2_score(y_train, y_train_pred)),
        "mse": float(mean_squared_error(y_train, y_train_pred)),
        "mad": float(median_absolute_error(y_train, y_train_pred))
    },
    {
        "type":"metrics",
        "dataset": "test",
        "r2":float(r2_score(y_test, y_test_pred)),
        "mse": float(mean_squared_error(y_test, y_test_pred)),
        "mad": float(median_absolute_error(y_test, y_test_pred))
    }
]

print(pd.DataFrame(metrics_list))

with open("../files/output/metrics.json", "w") as f:
    for m in metrics_list:
        json.dump(m, f)
        f.write("\n")


      type dataset        r2        mse       mad
0  metrics   train  0.891696   5.874646  1.092912
1  metrics    test  0.732572  32.566673  1.503354
