# ML con Factored - Sesión 2

El objetivo de esta sección es mostrar cómo se pueden desarrollar flujos de preprocesamiento en sklearn. Vamos a explorar las diferentes operaciones que se le pueden hacer a los diferentes tipos de variables y cómo agrupar todas las operaciones en un solo elemento de sklearn que tenga los métodos .fit() y .transform().

In [2]:
import numpy as np
import pandas as pd
from typing import List
from sklearn import set_config
set_config(display='diagram')

In [3]:
data = pd.read_csv(    "https://factored-workshops.s3.amazonaws.com/taxi-trip-duration.csv")
# Limitar rango de datos
tiempo_minimo = 60 
# 1 minuto
tiempo_maximo = 36000
# 10
horasdata = data[    (data["trip_duration"] > tiempo_minimo) &    (data["trip_duration"] < tiempo_maximo)]
data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_borough,dropoff_borough
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,Manhattan,Manhattan
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,Manhattan,Brooklyn
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,Manhattan,Brooklyn
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,Brooklyn,Brooklyn
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,Manhattan,Manhattan


In [4]:
y = data["trip_duration"]
input_df = data.drop(    ["id", "trip_duration", "dropoff_datetime", "store_and_fwd_flag"],    axis="columns")

In [5]:
from sklearn.model_selection import train_test_split
train_df, val_df, y_train, y_val = train_test_split(input_df, y, random_state=0)

In [6]:
from sklearn.preprocessing import StandardScaler

transformer = StandardScaler()
transformer.fit(
    train_df[["pickup_longitude", "pickup_latitude"]]
)
normed_array = transformer.transform(
    val_df[["pickup_longitude", "pickup_latitude"]]
)
print(normed_array)

[[ 0.02002579  0.1899489 ]
 [ 0.14776375  0.62411963]
 [-0.46953918 -1.21511026]
 ...
 [-0.24808783  0.48282216]
 [-0.15602706 -0.71726116]
 [ 0.020123    0.09933422]]


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class PrimerTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.mean = X.mean()
        self.std = X.std()
        return self

    def transform(self, X, y=None):
        return (X - self.mean) / self.std

In [8]:
transformer = PrimerTransformer()
transformer.fit(
    train_df[["pickup_longitude", "pickup_latitude"]]
)
segundo = transformer.transform(
    val_df[["pickup_longitude", "pickup_latitude"]]
)
print(segundo)


         pickup_longitude  pickup_latitude
571578           0.020026         0.189949
1280332          0.147764         0.624119
177838          -0.469539        -1.215110
1433776         -0.412961        -0.334954
757662          -0.478774        -1.093542
...                   ...              ...
279330          -0.240311        -0.046098
579302          -0.096144        -0.789564
494423          -0.248088         0.482822
1098264         -0.156027        -0.717261
893982           0.020123         0.099334

[364661 rows x 2 columns]


In [9]:
normed_array == segundo

Unnamed: 0,pickup_longitude,pickup_latitude
571578,False,False
1280332,False,False
177838,False,False
1433776,False,False
757662,False,False
...,...,...
279330,False,False
579302,False,False
494423,False,False
1098264,False,False


In [10]:

class TransformerFechas(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        columna_fecha = pd.to_datetime(X["pickup_datetime"])
        fecha_df = pd.DataFrame()
        fecha_df["weekday"]=columna_fecha.dt.weekday
        fecha_df["hour"]=columna_fecha.dt.hour
        # TODO: Crear columnas con dia de la semana y hora de recogida.
        return fecha_df

In [11]:
transformer_fechas = TransformerFechas()
fechas_df = transformer_fechas.fit_transform(train_df)
fechas_df.head()

Unnamed: 0,weekday,hour
769896,2,18
562980,5,17
364339,6,2
44595,4,19
815832,4,2


In [12]:
class TransformerDistancia(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_init = X[["pickup_latitude", "pickup_longitude"]].to_numpy()
        X_final = X[["dropoff_latitude", "dropoff_longitude"]].to_numpy()

        # Distancia de Haversine
        # TODO: Calcular la variable distancia usando la funcion
        # distancia de Haversine.
        distancia = self.distancia_haversine(X_init=X_init,X_final=X_final)
        distancia_df = pd.DataFrame()
        distancia_df["distancia"] = distancia
        return distancia_df
    
    def distancia_haversine(self, X_init, X_final):
        # Convertir de decimal a radianes
        X_init = np.radians(X_init)
        X_final = np.radians(X_final)

        # Formula Haversine
        dlat = X_final[:, 0] - X_init[:, 0] 
        dlon = X_final[:, 1] - X_init[:, 1]
        a = np.sin(dlat / 2) ** 2 + np.cos(X_init[:, 0]) * np.cos(X_final[:, 0]) * np.sin(dlon / 2) ** 2
        c = 2 * np.arcsin(np.sqrt(a))
        r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
        return c * r

In [13]:
transformer_dist = TransformerDistancia()
distancias_df = transformer_dist.fit_transform(train_df)
distancias_df.head()

Unnamed: 0,distancia
0,2.16173
1,0.813213
2,2.564499
3,1.532988
4,0.847019


In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


coord_cols = [
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude"
]

transformer_coord = ColumnTransformer(
    [
        ("transformer_dist", TransformerDistancia(), coord_cols),
    ],
    remainder="passthrough"
)
display(transformer_coord)

In [16]:
num_cols = ["passenger_count"] + coord_cols

num_pipeline = Pipeline(
    [
        ("transformer_coord", transformer_coord),
        ("scaler", StandardScaler())
    ]
)

X_num = num_pipeline.fit_transform(train_df[num_cols], y_train)
print(X_num)

[[-0.30501039  2.53913643]
 [-0.62654288 -0.50534352]
 [-0.20897624  2.53913643]
 ...
 [ 0.60805415 -0.50534352]
 [-0.55444107 -0.50534352]
 [-0.392485    0.25577647]]


In [17]:
display(num_pipeline)

## Checkpoint 3

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [19]:
coord_cols = [
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude"
]

transformer_coord = ColumnTransformer(
    [
        ("transformer_dist", TransformerDistancia(), coord_cols),
    ],
    remainder="passthrough"
)
display(transformer_coord)

In [20]:
num_cols = ["passenger_count"] + coord_cols

num_pipeline = Pipeline(
    [
        ("transformer_coord", transformer_coord),
        ("scaler", StandardScaler())
    ]
)

X_num = num_pipeline.fit_transform(train_df[num_cols], y_train)
print(X_num)

[[-0.30501039  2.53913643]
 [-0.62654288 -0.50534352]
 [-0.20897624  2.53913643]
 ...
 [ 0.60805415 -0.50534352]
 [-0.55444107 -0.50534352]
 [-0.392485    0.25577647]]


In [21]:
from sklearn.preprocessing import OrdinalEncoder

cat_cols = ["vendor_id", "pickup_borough", "pickup_datetime"]

transformer_fechas = ColumnTransformer(
    [("transformer_fechas",TransformerFechas(),["pickup_datetime"])
    ],
    remainder="passthrough"
)

cat_pipeline = Pipeline(
    [
       ("transformer_fechas",transformer_fechas), 
       ("ordinal_encoder", OrdinalEncoder()) #TODO: Punto 2 de Checkpoint 3
    ]
)

X_cat = cat_pipeline.fit_transform(train_df[cat_cols])

In [22]:
from sklearn.pipeline import FeatureUnion

full_pipeline = ColumnTransformer(
    [
        ("num_pipeline", num_pipeline, num_cols),
        ("cat_pipeline", cat_pipeline, cat_cols)
    ]
)

X_transformed = full_pipeline.fit_transform(train_df, y_train)
print(X_transformed.shape)

(1093983, 6)


In [23]:
display(full_pipeline)

In [24]:
import dill
dill.settings['recurse'] = True

with open("preprocesser.pkl", "wb") as f:
    dill.dump(full_pipeline, f)

In [25]:
with open("preprocesser.pkl", "rb") as f:
    loaded_pipeline = dill.load(f)
    
X_loaded = loaded_pipeline.transform(train_df)
print((X_loaded == X_transformed).all())

True


# parte 2: Entrenamiento de modelos

In [26]:
with open("preprocesser.pkl", "rb") as f:
    preprocessor = dill.load(f)

X_train = preprocessor.transform(train_df)
X_val = preprocessor.transform(val_df)

In [27]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score, mean_squared_log_error

def evaluar_predicciones(y_pred, y_true):
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_true)
    mape = mean_absolute_percentage_error(y_pred=y_pred, y_true=y_true)
    rmse = mean_squared_error(y_pred=y_pred, y_true=y_true, squared=False)
    print(f"MAE: {mae:.2f}")
    print(f"MAPE: {mape}")
    print(f"RMSE: {rmse}")

In [28]:
from sklearn.dummy import DummyRegressor

dummy_model = DummyRegressor(strategy="mean")
dummy_model.fit(X_train, y_train)
y_train_dummy = dummy_model.predict(X_train)
y_val_dummy = dummy_model.predict(X_val)

print("TRAIN")
evaluar_predicciones(y_pred=y_train_dummy, y_true=y_train)

print("VALIDATION")
evaluar_predicciones(y_pred=y_val_dummy, y_true=y_val)

TRAIN
MAE: 630.32
MAPE: 1.6123789519518277
RMSE: 4690.6078634083315
VALIDATION
MAE: 627.35
MAPE: 1.5768519368705345
RMSE: 6611.9047448133515


In [29]:
from sklearn.linear_model import LinearRegression

linear_model=LinearRegression()
linear_model.fit(X_train, y_train)
y_train_linear= linear_model.predict(X_train)
y_val_linear=linear_model.predict(X_val)


print("TRAIN")
evaluar_predicciones(y_pred=y_train_linear, y_true=y_train)

print("VALIDATION")
evaluar_predicciones(y_pred=y_val_linear, y_true=y_val)

TRAIN
MAE: 463.95
MAPE: 0.9715061206523689
RMSE: 4662.471053740636
VALIDATION
MAE: 461.73
MAPE: 0.9540775663732421
RMSE: 6593.7285986679935


In [30]:
import mlflow
mlflow.sklearn.autolog()

In [31]:
with mlflow.start_run(run_name="dummy") as run:
    dummy_model.fit(X_train, y_train)
    y_pred_val = dummy_model.predict(X_val)
    val_mae = mean_absolute_error(y_pred=y_pred_val, y_true=y_val)
    val_rmse = mean_squared_error(y_pred=y_pred_val, y_true=y_val, squared=False)
    val_mape = mean_absolute_percentage_error(y_pred=y_pred_val, y_true=y_val)
    val_r2 = r2_score(y_pred=y_pred_val, y_true=y_val)

    mlflow.log_metric("val_mae", val_mae)
    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("val_mape", val_mape)
    mlflow.log_metric("val_r2", val_r2)

mlflow ui

In [32]:
with mlflow.start_run(run_name="linear_regression") as run:
    linear_model=LinearRegression()
    linear_model.fit(X_train, y_train)
    y_pred_val=linear_model.predict(X_val)
    
    
    val_mae = mean_absolute_error(y_pred=y_pred_val, y_true=y_val)
    val_rmse = mean_squared_error(y_pred=y_pred_val, y_true=y_val, squared=False)
    val_mape = mean_absolute_percentage_error(y_pred=y_pred_val, y_true=y_val)
    val_r2 = r2_score(y_pred=y_pred_val, y_true=y_val)

    mlflow.log_metric("val_mae", val_mae)
    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("val_mape", val_mape)
    mlflow.log_metric("val_r2", val_r2)

In [33]:
from sklearn.ensemble import RandomForestRegressor

with mlflow.start_run(run_name="random_forest") as run:
    
    rf_model = RandomForestRegressor()
    rf_model.fit(X_train, y_train)
    
    y_pred_val = rf_model.predict(X_val)
    val_mae = mean_absolute_error(y_pred=y_pred_val, y_true=y_val)
    val_rmse = mean_squared_error(y_pred=y_pred_val, y_true=y_val, squared=False)
    val_mape = mean_absolute_percentage_error(y_pred=y_pred_val, y_true=y_val)
    val_r2 = r2_score(y_pred=y_pred_val, y_true=y_val)

    mlflow.log_metric("val_mae", val_mae)
    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("val_mape", val_mape)
    mlflow.log_metric("val_r2", val_r2)
    mlflow.log_artifact("preprocesser.pkl")

## Entrenando con XGBoost

In [35]:
def log_metrics_mlflow(y_pred_val, y_val):
    val_mae = mean_absolute_error(y_pred=y_pred_val, y_true=y_val)
    val_rmse = mean_squared_error(y_pred=y_pred_val, y_true=y_val, squared=False)
    val_mape = mean_absolute_percentage_error(y_pred=y_pred_val, y_true=y_val)
    val_r2 = r2_score(y_pred=y_pred_val, y_true=y_val)

    mlflow.log_metric("val_mae", val_mae)
    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("val_mape", val_mape)
    mlflow.log_metric("val_r2", val_r2)

In [37]:
from xgboost import XGBRegressor

mlflow.xgboost.autolog()

with mlflow.start_run(run_name="xgboost") as run:
    xgb_model = XGBRegressor()
    xgb_model.fit(X_train, y_train)
    y_pred_val = xgb_model.predict(X_val)
    log_metrics_mlflow(y_pred_val, y_val)
    mlflow.log_artifact("preprocesser.pkl")
    
    with open("xgb_model.pkl", "wb") as f:
        dill.dump(xgb_model, f)
    mlflow.log_artifact("xgb_model.pkl")