In [1]:
# Librerias
import pandas as pd 
import pickle
import numpy as np
import os
import time
import gzip
from sklearn.model_selection import GridSearchCV 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error


In [3]:
# Data loading: relative paths to the zipped CSV files for the train and test splits.
train_data_zip = '../files/input/train_data.csv.zip'
test_data_zip = '../files/input/test_data.csv.zip'

In [4]:
# Read both CSVs directly from their zip archives.
# index_col=False tells pandas not to use the first column as the index.
train_data = pd.read_csv(train_data_zip, index_col=False, compression='zip')
test_data = pd.read_csv(test_data_zip, index_col=False, compression='zip')

In [6]:
# Plain-text preview of the raw training frame before any cleaning.
print(train_data)


                      Car_Name  Year  Selling_Price  Present_Price  \
0                         jazz  2016           7.40          8.500   
1                          i10  2013           4.00          4.600   
2           TVS Apache RTR 180  2011           0.50          0.826   
3                          eon  2016           3.15          4.430   
4    Royal Enfield Thunder 350  2013           1.25          1.500   
..                         ...   ...            ...            ...   
206                        i10  2011           2.55          4.430   
207                    etios g  2015           3.95          6.800   
208           Bajaj Pulsar 150  2006           0.10          0.750   
209                        i20  2014           6.00          7.600   
210                       jazz  2016           6.00          8.400   

     Driven_kms Fuel_Type Selling_type Transmission  Owner  
0         15059    Petrol       Dealer    Automatic      0  
1         30000    Petrol       Deale

In [7]:
# Paso 1.
# Preprocese los datos.
# - Cree la columna 'Age' a partir de la columna 'Year'.
#   Asuma que el año actual es 2021.
# - Elimine las columnas 'Year' y 'Car_Name'.
#

def clean_data(data_df, current_year=2021):
    """Return a cleaned copy of ``data_df`` ready for modelling.

    Adds an 'Age' column computed as ``current_year - Year`` and removes the
    'Year' and 'Car_Name' columns. The input frame is left untouched.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing at least the 'Year' and 'Car_Name' columns.
    current_year : int, default 2021
        Reference year used to derive the vehicle age. The default keeps the
        original behavior of this step.

    Returns
    -------
    pandas.DataFrame
        New frame with 'Age' added and 'Year' / 'Car_Name' dropped.

    Raises
    ------
    KeyError
        If 'Year' or 'Car_Name' is missing from ``data_df``.
    """
    # assign() and drop() both return new frames, so no explicit copy or
    # inplace mutation is needed.
    return (
        data_df
        .assign(Age=current_year - data_df['Year'])
        .drop(columns=['Year', 'Car_Name'])
    )

In [8]:
# Apply the cleaning step to both splits. The names are rebound to the cleaned
# frames, so re-running this cell without reloading the data raises KeyError
# (the 'Year' column has already been dropped).
train_data = clean_data(train_data)
test_data = clean_data(test_data)

In [9]:
# Inspect the cleaned training frame.
train_data

Unnamed: 0,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,7.40,8.500,15059,Petrol,Dealer,Automatic,0,5
1,4.00,4.600,30000,Petrol,Dealer,Manual,0,8
2,0.50,0.826,6000,Petrol,Individual,Manual,0,10
3,3.15,4.430,15000,Petrol,Dealer,Manual,0,5
4,1.25,1.500,15000,Petrol,Individual,Manual,0,8
...,...,...,...,...,...,...,...,...
206,2.55,4.430,57000,Petrol,Dealer,Manual,0,10
207,3.95,6.800,36000,Petrol,Dealer,Manual,0,6
208,0.10,0.750,92233,Petrol,Individual,Manual,0,15
209,6.00,7.600,77632,Diesel,Dealer,Manual,0,7


In [10]:
# Inspect the cleaned test frame.
test_data

Unnamed: 0,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,4.75,9.54,43000,Diesel,Dealer,Manual,0,8
1,7.25,9.85,6900,Petrol,Dealer,Manual,0,4
2,2.85,4.15,5200,Petrol,Dealer,Manual,0,10
3,6.75,8.12,18796,Petrol,Dealer,Manual,0,6
4,6.50,8.61,33429,Diesel,Dealer,Manual,0,6
...,...,...,...,...,...,...,...,...
85,9.70,13.60,21780,Petrol,Dealer,Manual,0,6
86,6.25,13.60,40126,Petrol,Dealer,Manual,0,7
87,2.10,7.60,50456,Petrol,Dealer,Manual,0,15
88,6.40,8.40,12000,Petrol,Dealer,Manual,0,5


In [11]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.

def features_target(data, target_column):
    """Split ``data`` into a feature frame and a target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the predictors and the target.
    target_column : str
        Name of the column to use as the target.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is ``data`` without
        ``target_column`` and ``target`` is ``data[target_column]``.
    """
    # Tuple elements are evaluated left to right: drop first, then select.
    return data.drop(columns=target_column), data[target_column]

In [12]:
# Separate predictors from the 'Present_Price' target for both splits.
x_train, y_train = features_target(train_data, 'Present_Price')
x_test, y_test = features_target(test_data, 'Present_Price')

In [13]:
# Paso 3.
# Cree un pipeline para el modelo de regresión. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Escala las variables numéricas al intervalo [0, 1].
# - Selecciona las K mejores entradas.
# - Ajusta un modelo de regresion lineal.
#

def create_pipeline(df):
    """Build the unfitted regression pipeline for the columns of ``df``.

    The pipeline chains three steps:

    * 'preprocessor': min-max scaling for every column of ``df`` that is not
      one of the categorical columns, and one-hot encoding (unknown categories
      ignored) for 'Fuel_Type', 'Selling_type' and 'Transmission'.
    * 'feature_selection': ``SelectKBest`` scored with ``f_regression``; ``k``
      is not set here.
    * 'regressor': ordinary ``LinearRegression``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; only its column names are read.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The assembled, unfitted pipeline.
    """
    categorical_cols = ['Fuel_Type', 'Selling_type', 'Transmission']
    # Everything that is not categorical is treated as numeric, in frame order.
    numeric_cols = [name for name in df.columns if name not in categorical_cols]

    scaling = ('num', MinMaxScaler(), numeric_cols)
    encoding = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    column_prep = ColumnTransformer(transformers=[scaling, encoding])

    return Pipeline(
        steps=[
            ('preprocessor', column_prep),
            ('feature_selection', SelectKBest(score_func=f_regression)),
            ('regressor', LinearRegression()),
        ]
    )

In [14]:
# Build the unfitted pipeline from the training feature columns.
pipeline = create_pipeline(x_train)

In [15]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use el error medio absoluto
# para medir el desempeño del modelo.
#

def optimize_hyperparameters(pipeline, x_train, y_train):
    """Grid-search the number of selected features with 10-fold CV.

    Parameters
    ----------
    pipeline : estimator
        Pipeline exposing a 'feature_selection' step with a ``k`` parameter.
    x_train, y_train
        Training features and target passed to ``GridSearchCV.fit``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object, scored with negative mean absolute error.
    """
    # NOTE(review): the upper bound (k = 11) presumably matches the number of
    # columns produced by the preprocessor for this dataset -- confirm if the
    # input schema changes.
    k_grid = {'feature_selection__k': range(1, 12)}

    search = GridSearchCV(
        pipeline,
        k_grid,
        cv=10,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(x_train, y_train)
    return search

In [16]:

# Time the hyperparameter search. perf_counter() is a monotonic,
# high-resolution clock meant for measuring elapsed durations; time.time()
# can jump if the system clock is adjusted while the search runs.
start = time.perf_counter()
model = optimize_hyperparameters(pipeline, x_train, y_train)
end = time.perf_counter()
print(f'Tiempo de optimizacion de hiperparametros: {end - start:.2f} seconds')
print(model.best_params_)
# The search is configured with scoring='neg_mean_absolute_error', so the
# scores printed below are negated errors (closer to 0 is better).
print(f'Score entrenamiento: {model.score(x_train, y_train)}')
print(f'Score prueba: {model.score(x_test, y_test)}')

Fitting 10 folds for each of 11 candidates, totalling 110 fits
Tiempo de optimizacion de hiperparametros: 5.90 seconds
{'feature_selection__k': 11}
Score entrenamiento: -1.6214879816158287
Score prueba: -2.4736199144163766


In [17]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usando la libreria gzip.
#
#

def save_model(model, path='../files/models/model.pkl.gz'):
    """Pickle ``model`` into a gzip-compressed file.

    Parameters
    ----------
    model : object
        Any picklable object (here, the fitted search object).
    path : str, default '../files/models/model.pkl.gz'
        Destination file. Missing parent directories are created.

    Returns
    -------
    None
    """
    directory = os.path.dirname(path)
    if directory:
        # exist_ok=True replaces the exists()/makedirs() pair, which can fail
        # if the directory appears between the check and the creation.
        os.makedirs(directory, exist_ok=True)

    with gzip.open(path, 'wb') as file:
        pickle.dump(model, file)

In [18]:
# Persist the fitted search object as a gzip-compressed pickle.
save_model(model)

In [19]:
# Paso 6.
# Calcule las metricas r2, error cuadratico medio, y error absoluto medio
# para los conjuntos de entrenamiento y prueba. Guardelas en el archivo
# files/output/metrics.json. Cada fila del archivo es un diccionario con
# las metricas de un modelo. Este diccionario tiene un campo para indicar
# si es el conjunto de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'metrics', 'dataset': 'train', 'r2': 0.8, 'mse': 0.7, 'mad': 0.9}
# {'type': 'metrics', 'dataset': 'test', 'r2': 0.7, 'mse': 0.6, 'mad': 0.8}
#

def calculate_metrics(model, x_train, y_train, x_test, y_test):
    """Compute and print regression metrics for the train and test splits.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    x_train, y_train, x_test, y_test
        Features and true targets for each split.

    Returns
    -------
    tuple of dict
        ``(metrics_train, metrics_test)``; each dict has the keys 'type',
        'dataset', 'r2', 'mse' and 'mad', with metric values as plain floats.
    """
    def _summarize(dataset, y_true, y_pred):
        # NOTE(review): 'mad' is computed with median_absolute_error, while
        # the task description asks for the mean absolute error -- confirm
        # which one is intended before changing it.
        return {
            'type': 'metrics',
            'dataset': dataset,
            'r2': float(r2_score(y_true, y_pred)),
            'mse': float(mean_squared_error(y_true, y_pred)),
            'mad': float(median_absolute_error(y_true, y_pred)),
        }

    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    metrics_train = _summarize('train', y_train, train_pred)
    metrics_test = _summarize('test', y_test, test_pred)

    for report in (metrics_train, metrics_test):
        print(report)

    return metrics_train, metrics_test

In [20]:
# Evaluate the fitted model on both splits; the metric dicts are also printed.
metrics_train, metrics_test = calculate_metrics(model, x_train, y_train, x_test, y_test)

{'type': 'metrics', 'dataset': 'train', 'r2': 0.8916962358587399, 'mse': 5.8746462805982045, 'mad': 1.0929123440195507}
{'type': 'metrics', 'dataset': 'test', 'r2': 0.7325716754123306, 'mse': 32.56667275386626, 'mad': 1.5033540603205657}


In [21]:
# Guardar las metricas en la carpeta output
def save_metrics(metrics_train, metrics_test, path='../files/output/metrics.json'):
    """Write the train and test metric dicts as JSON lines.

    Parameters
    ----------
    metrics_train, metrics_test : dict
        Metric records; each becomes one line of the output file, train first.
    path : str, default '../files/output/metrics.json'
        Destination file. Missing parent directories are created.

    Returns
    -------
    None
    """
    directory = os.path.dirname(path)
    if directory:
        # exist_ok=True replaces the exists()/makedirs() pair, which can fail
        # if the directory appears between the check and the creation.
        os.makedirs(directory, exist_ok=True)

    records = [metrics_train, metrics_test]
    # orient='records' with lines=True emits one JSON object per line.
    pd.DataFrame(records).to_json(path, orient='records', lines=True)

In [22]:
# Write both metric records to the output folder.
save_metrics(metrics_train, metrics_test)