In [1]:
#Importar librerias
import pandas as pd
import numpy as np
import os
import pickle
import json
import gzip
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [21]:
#Funciones
def load_data():
    # Define las columnas categóricas
    #global categorical_features
    #categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE'] 
    train = pd.read_csv(
        "../files/input/train_data.csv.zip",
        index_col=False,
        compression="zip",
    )
    test = pd.read_csv(
        "../files/input/test_data.csv.zip",
        index_col=False,
        compression="zip",
    )
    return train, test

def clear_data(df):
    #Agregar columna de edad
    df["Age"] = 2021 - df["Year"]
    #Eliminar columnas 
    df = df.drop(["Year","Car_Name"],axis=1)
    return df

def make_train_test_split(df):
    #Division en etiquetas 
    y_df =  df['Present_Price']
    #Division en caracteristicas de entrada
    x_df = df.drop('Present_Price', axis=1)
    return x_df, y_df


def cross_validation(pipeline, param_grid, x_train, y_train):
    #Evaluacion de hiperparametros
    model = GridSearchCV(
        estimator = pipeline,
        param_grid = param_grid,
        cv = 10,
        scoring="neg_mean_absolute_error", 
    )
    #Aplicacion de GridSearchCV
    model.fit(x_train, y_train)
    return model

def save_grid_search_model(model):
    #Guardar mejor modelo
    if not os.path.exists("../files/models"):
        os.makedirs("../files/models")
    with gzip.open("../files/models/model.pkl.gz", "wb") as file:
        pickle.dump(model, file)

def eval_metrics(type_dataset, y_true, y_pred):
    #Calculo de Metricas
    r2 = r2_score(y_true=y_true, y_pred=y_pred, multioutput='uniform_average')
    mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
    mad = mean_absolute_error(y_true=y_true, y_pred=y_pred, multioutput="raw_values")
    #Formar diccionario de metricas 
    dic_metrics = { "type": "metrics",
                   'dataset': type_dataset, 
                   'r2': float(r2), 
                   'mse': float(mse), 
                   'mad': float(mad), 
                   }
    print(dic_metrics)
    #Guardar metricas como archivo json
    if not os.path.exists("../files/output"):
        os.makedirs("../files/output")
    with open("../files/output/metrics.json", "a") as f:
        json.dump(dic_metrics, f)
        f.write("\n")



#---------------------------FUCTION SET------------------------------------------
def dataset_manipulation():
    #Carga de datos
    train, test = load_data()
    #Limpieza de datos
    train = clear_data(train)
    test = clear_data(test)
    #Division en etiquetas y caracteristicas de entrada
    x_train, y_train = make_train_test_split(train)
    x_test, y_test = make_train_test_split(test)
    return x_train, y_train, x_test, y_test

def eval_model(model, x_train, y_train, x_test, y_test):
    if os.path.exists("../files/output/metrics.json"):
        os.remove("../files/output/metrics.json")
    # Calculo de métricas
    eval_metrics("train", y_train, y_pred=model.best_estimator_.predict(x_train))
    eval_metrics("test", y_test, y_pred=model.best_estimator_.predict(x_test))
    

In [15]:
#------------------------------MODEL------------------------------------------
def train_model(x_train, y_train):  
    #----------------------PIPELINE------------------------------
    # Crea el preprocesador
    preprocessor = ColumnTransformer(
        transformers=[
            #Ej. df[Sex]-->"1","2" por tanto la codificacion de esa columna sera un array de bit
            # "1"-->[1,0] "2"-->[0,1]
            ('one', OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object)),
            #(data-media/sdt) con ello cada dato tiene media = 0 y sdt=1
            ("scaler", MinMaxScaler(), make_column_selector(dtype_include=np.number)),
        ],
    )
    #Contruccion pipeline
    pipeline = make_pipeline(
    preprocessor,
    SelectKBest(k=15),
    LinearRegression(),
    )
    #-------------------------PARAMETROS GRID-----------------------------
    #Definicion de hiperparametros a evualuar 
    paramters_grid = {
        'selectkbest__k': range(2, 7),
    }
    #----------------------TRAIN CROSS-----------------------
    model = cross_validation(pipeline=pipeline,
                             param_grid=paramters_grid,
                             x_train=x_train,
                             y_train=y_train
                             )
    return model


Ejecución del flujo principal para el dataset predefinido

In [4]:
#Carga y manipulacion de datos 
x_train, y_train, x_test, y_test = dataset_manipulation()

In [5]:
#Informacion dataset
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Selling_Price  211 non-null    float64
 1   Driven_kms     211 non-null    int64  
 2   Fuel_Type      211 non-null    object 
 3   Selling_type   211 non-null    object 
 4   Transmission   211 non-null    object 
 5   Owner          211 non-null    int64  
 6   Age            211 non-null    int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 11.7+ KB


In [16]:
#Definicion y entrenamiento de modelo
model = train_model(x_train, y_train)  
#Informacion del mejor modelo y ademas definirlo
print(model.best_score_)
print(model.best_params_)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


-1.945183641987981
{'selectkbest__k': 6}


  f = msb / msw


In [7]:
#Salvar mejor model
save_grid_search_model(model)

In [22]:
#Evaluacion del Modelo con diversas metricas
eval_model(model, x_train, y_train, x_test, y_test)

{'type': 'metrics', 'dataset': 'train', 'r2': 0.813081518903666, 'mse': 10.138890078791942, 'mad': 1.8739578273009199}
{'type': 'metrics', 'dataset': 'test', 'r2': 0.730306417857275, 'mse': 32.84252947776587, 'mad': 2.4813145495028794}


  'mad': float(mad),
  'mad': float(mad),
