In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [2]:
# Cargar los datos

def load_data(filename):
    """Read one zipped CSV split from the input folder.

    Parameters
    ----------
    filename : str
        Prefix of the archive; the file read is
        ``../files/input/<filename>_data.csv.zip`` (relative to the
        current working directory).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV stored in the zip archive.
    """
    return pd.read_csv(
        f'../files/input/{filename}_data.csv.zip',
        compression='zip',
    )

In [3]:
# Load the training split and preview its first rows.
df_train = load_data('train')
df_train.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,jazz,2016,7.4,8.5,15059,Petrol,Dealer,Automatic,0
1,i10,2013,4.0,4.6,30000,Petrol,Dealer,Manual,0
2,TVS Apache RTR 180,2011,0.5,0.826,6000,Petrol,Individual,Manual,0
3,eon,2016,3.15,4.43,15000,Petrol,Dealer,Manual,0
4,Royal Enfield Thunder 350,2013,1.25,1.5,15000,Petrol,Individual,Manual,0


In [4]:
# Load the test split and preview its first rows.
df_test = load_data('test')
df_test.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
1,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
2,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
3,ciaz,2015,6.75,8.12,18796,Petrol,Dealer,Manual,0
4,s cross,2015,6.5,8.61,33429,Diesel,Dealer,Manual,0


In [5]:
# Paso 1.
# Preprocese los datos.
# - Cree la columna 'Age' a partir de la columna 'Year'.
#   Asuma que el año actual es 2021.
# - Elimine las columnas 'Year' y 'Car_Name'.

In [6]:
def clean_data(df, current_year=2021):
    """Derive the vehicle age and drop the columns not used for modelling.

    The input frame is left untouched; a new DataFrame is returned, so the
    raw data a previous cell displayed stays valid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the 'Year' and 'Car_Name' columns.
    current_year : int, default 2021
        Reference year used to compute ``Age = current_year - Year``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an 'Age' column appended and the 'Year' and
        'Car_Name' columns removed.

    Raises
    ------
    KeyError
        If 'Year' or 'Car_Name' is missing (e.g. the frame was already
        cleaned).
    """
    # assign() and drop() both return new objects, so the caller's frame
    # is never modified in place.
    return (
        df
        .assign(Age=current_year - df['Year'])
        .drop(columns=['Year', 'Car_Name'])
    )

In [7]:
# Rebind df_train to its cleaned version and preview it.
# Re-running this cell without reloading df_train raises KeyError,
# because 'Year' and 'Car_Name' are already gone from the cleaned frame.
df_train = clean_data(df_train)
df_train.head()

Unnamed: 0,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,7.4,8.5,15059,Petrol,Dealer,Automatic,0,5
1,4.0,4.6,30000,Petrol,Dealer,Manual,0,8
2,0.5,0.826,6000,Petrol,Individual,Manual,0,10
3,3.15,4.43,15000,Petrol,Dealer,Manual,0,5
4,1.25,1.5,15000,Petrol,Individual,Manual,0,8


In [8]:
# Rebind df_test to its cleaned version and preview it.
# Re-running this cell without reloading df_test raises KeyError,
# because 'Year' and 'Car_Name' are already gone from the cleaned frame.
df_test = clean_data(df_test)
df_test.head()

Unnamed: 0,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,4.75,9.54,43000,Diesel,Dealer,Manual,0,8
1,7.25,9.85,6900,Petrol,Dealer,Manual,0,4
2,2.85,4.15,5200,Petrol,Dealer,Manual,0,10
3,6.75,8.12,18796,Petrol,Dealer,Manual,0,6
4,6.5,8.61,33429,Diesel,Dealer,Manual,0,6


In [9]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.

In [10]:
def split_data(df):
    """Separate the predictors from the regression target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the 'Present_Price' column.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``df`` without 'Present_Price' and ``y``
        is the 'Present_Price' column.
    """
    target = 'Present_Price'
    features = df.drop(columns=[target])
    labels = df[target]
    return features, labels

In [11]:
# Split the training frame into predictors (x_train) and target (y_train),
# then display the predictors.
x_train, y_train = split_data(df_train)
x_train

Unnamed: 0,Selling_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,7.40,15059,Petrol,Dealer,Automatic,0,5
1,4.00,30000,Petrol,Dealer,Manual,0,8
2,0.50,6000,Petrol,Individual,Manual,0,10
3,3.15,15000,Petrol,Dealer,Manual,0,5
4,1.25,15000,Petrol,Individual,Manual,0,8
...,...,...,...,...,...,...,...
206,2.55,57000,Petrol,Dealer,Manual,0,10
207,3.95,36000,Petrol,Dealer,Manual,0,6
208,0.10,92233,Petrol,Individual,Manual,0,15
209,6.00,77632,Diesel,Dealer,Manual,0,7


In [12]:
# Split the test frame into predictors (x_test) and target (y_test),
# then display the predictors.
x_test, y_test = split_data(df_test)
x_test

Unnamed: 0,Selling_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,4.75,43000,Diesel,Dealer,Manual,0,8
1,7.25,6900,Petrol,Dealer,Manual,0,4
2,2.85,5200,Petrol,Dealer,Manual,0,10
3,6.75,18796,Petrol,Dealer,Manual,0,6
4,6.50,33429,Diesel,Dealer,Manual,0,6
...,...,...,...,...,...,...,...
85,9.70,21780,Petrol,Dealer,Manual,0,6
86,6.25,40126,Petrol,Dealer,Manual,0,7
87,2.10,50456,Petrol,Dealer,Manual,0,15
88,6.40,12000,Petrol,Dealer,Manual,0,5


In [13]:
# Paso 3.
# Cree un pipeline para el modelo de regresión. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categóricas usando el método
#   one-hot-encoding.
# - Escala las variables numéricas al intervalo [0, 1].
# - Selecciona las K mejores entradas.
# - Ajusta un modelo de regresión lineal.

In [14]:
def filter_features(df):
    """Partition the column names of ``df`` by dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified.

    Returns
    -------
    tuple of list
        ``(num, cat)``: ``num`` holds the names of the columns selected by
        ``select_dtypes(include=[np.number])``; ``cat`` holds every other
        column name, in the frame's column order.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()

    other_names = []
    for name in df.columns:
        if name not in numeric_names:
            other_names.append(name)

    return numeric_names, other_names

In [15]:
# Inspect the column dtypes; filter_features uses them to tell numeric
# columns from the rest.
x_train.dtypes

Selling_Price    float64
Driven_kms         int64
Fuel_Type         object
Selling_type      object
Transmission      object
Owner              int64
Age                int64
dtype: object

In [16]:
# Column names of the training predictors, split into numeric and non-numeric.
num_train, cat_train = filter_features(x_train)
num_train, cat_train

(['Selling_Price', 'Driven_kms', 'Owner', 'Age'],
 ['Fuel_Type', 'Selling_type', 'Transmission'])

In [17]:
def make_pipeline(num_ft, cat_ft):
    """Build the preprocessing + feature-selection + regression pipeline.

    Steps, in order:
      1. 'preprocessor': MinMax-scale the numeric columns and one-hot
         encode the categorical columns.
      2. 'feature_selection': ``SelectKBest`` scored with ``f_regression``
         (``k`` is left at its default and is expected to be tuned).
      3. 'regression': ordinary ``LinearRegression``.

    Parameters
    ----------
    num_ft : list of str
        Names of the columns to scale with ``MinMaxScaler``.
    cat_ft : list of str
        Names of the columns to encode with ``OneHotEncoder``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ('scaler', MinMaxScaler(), num_ft),
            # handle_unknown='ignore': a category that is absent from the
            # data the encoder was fitted on (a CV training fold, or the
            # training split when scoring the test split) is encoded as
            # all zeros instead of raising at transform time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_ft),
        ],
    )

    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('feature_selection', SelectKBest(f_regression)),
            ('regression', LinearRegression()),
        ]
    )

    return pipeline

In [18]:
# Build the (unfitted) pipeline for the training columns and display it.
pipeline_train = make_pipeline(num_train, cat_train)
pipeline_train

In [19]:
# Paso 4.
# Optimice los hiperparámetros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use el error medio absoluto
# para medir el desempeño del modelo.

In [20]:
def grid_search(pipeline, x, y):
    """Tune the number of selected features with cross-validated grid search.

    The only hyperparameter searched is ``feature_selection__k``, over
    ``range(1, 2 * (x.shape[1] + 1))``. Candidates are compared with the
    'neg_mean_absolute_error' scorer using ``cv=10`` and all available
    cores (``n_jobs=-1``).

    NOTE(review): the upper bound for ``k`` is derived from the raw column
    count, not from the number of columns after one-hot encoding; some
    candidates may exceed the real feature count — confirm this is intended.

    Parameters
    ----------
    pipeline : estimator
        Pipeline exposing a 'feature_selection' step with a ``k`` parameter.
    x : pandas.DataFrame
        Predictors; only ``x.shape[1]`` is used to size the grid.
    y : array-like
        Target values.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.
    """
    k_stop = 2 * (x.shape[1] + 1)

    search = GridSearchCV(
        pipeline,
        {'feature_selection__k': range(1, k_stop)},
        scoring='neg_mean_absolute_error',
        cv=10,
        n_jobs=-1,
    )
    search.fit(x, y)

    return search

In [21]:
# Fit the cross-validated grid search on the training split; `model` is the
# fitted GridSearchCV object returned by grid_search.
model = grid_search(pipeline_train, x_train, y_train)

In [23]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usando la librería gzip.

In [24]:
def save_model(model, path='../files/models/model.pkl.gz'):
    """Pickle ``model`` into a gzip-compressed file.

    Parameters
    ----------
    model : object
        Any picklable object (here, the fitted grid search).
    path : str, default '../files/models/model.pkl.gz'
        Destination file. Missing parent directories are created. An
        existing file at this path is overwritten.

    Returns
    -------
    None
    """
    import gzip
    import os
    import pickle

    # exist_ok=True avoids the check-then-create race of
    # `if not os.path.exists(...): os.makedirs(...)`.
    parent = os.path.dirname(path)
    if parent:  # os.makedirs('') raises, so skip for bare file names
        os.makedirs(parent, exist_ok=True)

    with gzip.open(path, 'wb') as f:
        pickle.dump(model, f)

In [25]:
# Persist the fitted search object as a gzip-compressed pickle.
save_model(model)

In [26]:
# Paso 6.
# Calcule las métricas r2, error cuadrático medio, y error absoluto medio
# para los conjuntos de entrenamiento y prueba. Guárdelas en el archivo
# files/output/metrics.json. Cada fila del archivo es un diccionario con
# las métricas de un modelo. Este diccionario tiene un campo para indicar
# si es el conjunto de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'metrics', 'dataset': 'train', 'r2': 0.8, 'mse': 0.7, 'mad': 0.9}
# {'type': 'metrics', 'dataset': 'test', 'r2': 0.7, 'mse': 0.6, 'mad': 0.8}

In [27]:
def calculate_metrics(model, x, y, dataset):
    """Score ``model`` on one dataset and package the results as a dict.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    x : array-like
        Predictors passed to ``model.predict``.
    y : array-like
        True target values.
    dataset : str
        Label stored under the 'dataset' key (e.g. "train" or "test").

    Returns
    -------
    dict
        ``{'type': 'metrics', 'dataset': dataset, 'r2': ..., 'mse': ...,
        'mad': ...}`` with the three scores converted to plain ``float``.
        'mad' holds the value of ``mean_absolute_error``.
    """
    predictions = model.predict(x)

    return {
        'type': 'metrics',
        'dataset': dataset,
        'r2': float(r2_score(y, predictions)),
        'mse': float(mean_squared_error(y, predictions)),
        'mad': float(mean_absolute_error(y, predictions)),
    }

In [28]:
def save_metrics(metrics, output_file=None):
    """Append metric records to a JSON-lines file (one JSON object per line).

    Records already present in the file are kept and the new ones are
    written after them.

    NOTE(review): because records are appended, running the notebook again
    adds duplicate train/test rows; delete the file first if a fresh
    result is wanted.

    Parameters
    ----------
    metrics : dict or iterable of dict
        One record, or several records, to append. A single dict is
        treated as one record.
    output_file : str, optional
        Destination file. Defaults to ``../files/output/metrics.json``.
        Missing parent directories are created.

    Returns
    -------
    None
    """
    import json
    import os

    if output_file is None:
        output_file = os.path.join("../files/output", "metrics.json")

    # list.extend() on a dict would add its keys, not the record itself.
    new_records = [metrics] if isinstance(metrics, dict) else list(metrics)

    output_dir = os.path.dirname(output_file)
    if output_dir:  # os.makedirs('') raises, so skip for bare file names
        os.makedirs(output_dir, exist_ok=True)

    existing_data = []
    if os.path.exists(output_file):
        with open(output_file, "r") as f:
            # Skip blank lines (e.g. a trailing newline) that json.loads
            # would reject.
            existing_data = [json.loads(line) for line in f if line.strip()]

    with open(output_file, "w") as f:
        for record in existing_data + new_records:
            json.dump(record, f)
            f.write("\n")

In [29]:
# Score the fitted model on both splits and write the two records to the
# metrics file (train first, then test).
metrics_train = calculate_metrics(model, x_train, y_train, "train")
metrics_test = calculate_metrics(model, x_test, y_test, "test")
metrics = [metrics_train, metrics_test]

save_metrics(metrics)