In [19]:
# Importamos librerías necesarias
import os
import pandas as pd
import gzip
import json
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [11]:
def load_and_process_data():
    # Cargamos los datasets
    train_path = '../files/input/train_data.csv.zip'
    test_path = '../files/input/test_data.csv.zip'
    
    df_train = pd.read_csv(train_path, index_col=False, compression='zip')
    df_test = pd.read_csv(test_path, index_col=False, compression='zip')
    
    # Creamos la columna Age a partir de year
    df_train['Age'] = 2021 - df_train['Year']
    df_test['Age'] = 2021 - df_test['Year']
    
    # Eliminamos las columnas 
    df_train.drop(columns=['Car_Name', 'Year'], inplace=True)
    df_test.drop(columns=['Car_Name', 'Year'], inplace=True)
    
    # Eliminamos filas con valores nulos
    df_train.dropna(inplace=True)
    df_test.dropna(inplace=True)
    
    return df_train, df_test
    
    
def split_features_target(df_train, df_test):
    x_train = df_train.drop(columns=['Present_Price'])
    y_train = df_train['Present_Price']
    x_test = df_test.drop(columns=['Present_Price'])
    y_test = df_test['Present_Price']
    
    return x_train, y_train, x_test, y_test
    
    

In [12]:
df_train, df_test = load_and_process_data()
x_train, y_train, x_test, y_test = split_features_target(df_train, df_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((211, 7), (211,), (90, 7), (90,))

In [14]:
print(x_train.columns)
x_train.info()

Index(['Selling_Price', 'Driven_kms', 'Fuel_Type', 'Selling_type',
       'Transmission', 'Owner', 'Age'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Selling_Price  211 non-null    float64
 1   Driven_kms     211 non-null    int64  
 2   Fuel_Type      211 non-null    object 
 3   Selling_type   211 non-null    object 
 4   Transmission   211 non-null    object 
 5   Owner          211 non-null    int64  
 6   Age            211 non-null    int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 11.7+ KB


In [70]:
from sklearn.preprocessing import PolynomialFeatures

def create_pipeline():
    categorical_cols = ['Fuel_Type', 'Selling_type', 'Transmission']
    numeric_cols = [c for c in x_train.columns if c not in categorical_cols]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ("num", MinMaxScaler(), numeric_cols),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            #("poly", PolynomialFeatures(include_bias=False)),
            ("select", SelectKBest(score_func=f_regression)),
            ("mlp", LinearRegression()),
        ]
    )
    
    return pipeline

In [77]:
def make_grid_search(pipeline, x_train, y_train):
    # Búsqueda de hiperparámetros
    param_grid = {
        #'poly__degree': [1, 2],
        'select__k': [5, 10, 15, 'all'],
    }

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='neg_mean_absolute_error',
        cv=10,
        n_jobs=9,
        return_train_score=True
    )
    grid_search.fit(x_train, y_train)
    
    return grid_search

In [18]:
def save_estimator(estimator):
    models_path = "../files/models"
    os.makedirs(models_path, exist_ok=True)

    with gzip.open("../files/models/model.pkl.gz", "wb") as file:
        pickle.dump(estimator, file)

In [58]:
def calc_metrics(model, x_train, y_train, x_test, y_test):
    
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    metrics = [
        {
            'type': 'metrics',
            'dataset': 'train',
            'r2': r2_score(y_train, y_train_pred),
            'mse': mean_squared_error(y_train, y_train_pred),
            'mad': mean_absolute_error(y_train, y_train_pred)
        },
        {
            'type': 'metrics',
            'dataset': 'test',
            'r2': r2_score(y_test, y_test_pred),
            'mse': mean_squared_error(y_test, y_test_pred),
            'mad': mean_absolute_error(y_test, y_test_pred)
        }
    ]
    
    return metrics

In [38]:
# Paso 7: Guardamos las metricas
def save_metrics(metrics):
    metrics_path = "../files/output"
    os.makedirs(metrics_path, exist_ok=True)
    
    with open("../files/output/metrics.json", "w") as file:
        for metric in metrics:
            file.write(json.dumps(metric, ensure_ascii=False))
            file.write('\n')

In [78]:
pipeline = create_pipeline()
model = make_grid_search(pipeline, x_train, y_train)
save_estimator(model)
metrics = calc_metrics(model, x_train, y_train, x_test, y_test)
save_metrics(metrics)

print(model.best_estimator_)
print(model.best_params_)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Fuel_Type', 'Selling_type',
                                                   'Transmission']),
                                                 ('num', MinMaxScaler(),
                                                  ['Selling_Price',
                                                   'Driven_kms', 'Owner',
                                                   'Age'])])),
                ('select',
                 SelectKBest(k=15,
                             score_func=<function f_regression at 0x000001EEAD519300>)),
                ('mlp', LinearRegression())])
{'select__k': 15}




In [79]:
model.score(x_train, y_train) 

-1.6214879816158296

In [23]:
# Traemos el modelo guardado
with gzip.open("../files/models/model.pkl.gz", "rb") as file:
    model = pickle.load(file)
metrics = calc_metrics(model, x_train, y_train, x_test, y_test)
save_metrics(metrics)

print(model.best_estimator_)
print(model.best_params_)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  ['SEX', 'EDUCATION',
                                                   'MARRIAGE']),
                                                 ('num', StandardScaler(),
                                                  ['LIMIT_BAL', 'AGE', 'PAY_0',
                                                   'PAY_2', 'PAY_3', 'PAY_4',
                                                   'PAY_5', 'PAY_6',
                                                   'BILL_AMT1', 'BILL_AMT2',
                                                   'BILL_AMT3', 'BILL_AMT4',
                                                   'BILL_AMT5', 'BILL_AMT6',
                                                   'PAY_AMT1', 'PAY_AMT2',
                                                   'PAY_AMT3', 'PAY_AMT4',
                                                   'PAY_AMT5', 'PAY_AMT6'])]