# Feature Engineering

## Cargamos datos tranformados

In [2]:
import pandas as pd

ts_data = pd.read_parquet('../data/transformed/ts_data_2024_01.parquet')
ts_data

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2024-01-01 00:00:00,4,25
1,2024-01-01 00:00:00,7,4
2,2024-01-01 00:00:00,9,1
3,2024-01-01 00:00:00,10,6
4,2024-01-01 00:00:00,12,4
...,...,...,...
193435,2024-01-31 23:00:00,58,0
193436,2024-01-31 23:00:00,105,0
193437,2024-01-31 23:00:00,109,0
193438,2024-01-31 23:00:00,199,0


## Elegimos una localización

Por ejemplo, elegimos la localización 138.

In [3]:
ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == 138, :].reset_index(drop=True)
ts_data_one_location.head(24)

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2024-01-01 00:00:00,138,59
1,2024-01-01 01:00:00,138,4
2,2024-01-01 02:00:00,138,0
3,2024-01-01 03:00:00,138,0
4,2024-01-01 04:00:00,138,0
5,2024-01-01 05:00:00,138,1
6,2024-01-01 06:00:00,138,1
7,2024-01-01 07:00:00,138,8
8,2024-01-01 08:00:00,138,64
9,2024-01-01 09:00:00,138,139


## Creamos los lags

In [4]:
# Vamos a predecir con 24 lags
n_lags = 24

# Creamos un dataframe con las horas de los días ordenadas
ts_data_one_location = ts_data_one_location.sort_values('pickup_hour').reset_index(drop=True)

# Creamos las columnas de lags
for lag in range(n_lags, 0, -1):
    ts_data_one_location[f'rides_previous_{lag}_hour'] = ts_data_one_location['rides'].shift(lag)

ts_data_one_location.head(10)

Unnamed: 0,pickup_hour,pickup_location_id,rides,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,2024-01-01 00:00:00,138,59,,,,,,,,...,,,,,,,,,,
1,2024-01-01 01:00:00,138,4,,,,,,,,...,,,,,,,,,,59.0
2,2024-01-01 02:00:00,138,0,,,,,,,,...,,,,,,,,,59.0,4.0
3,2024-01-01 03:00:00,138,0,,,,,,,,...,,,,,,,,59.0,4.0,0.0
4,2024-01-01 04:00:00,138,0,,,,,,,,...,,,,,,,59.0,4.0,0.0,0.0
5,2024-01-01 05:00:00,138,1,,,,,,,,...,,,,,,59.0,4.0,0.0,0.0,0.0
6,2024-01-01 06:00:00,138,1,,,,,,,,...,,,,,59.0,4.0,0.0,0.0,0.0,1.0
7,2024-01-01 07:00:00,138,8,,,,,,,,...,,,,59.0,4.0,0.0,0.0,0.0,1.0,1.0
8,2024-01-01 08:00:00,138,64,,,,,,,,...,,,59.0,4.0,0.0,0.0,0.0,1.0,1.0,8.0
9,2024-01-01 09:00:00,138,139,,,,,,,,...,,59.0,4.0,0.0,0.0,0.0,1.0,1.0,8.0,64.0


## Añadimos target

In [5]:
# La columna target será la demanda real de la hora actual
ts_data_one_location['target'] = ts_data_one_location['rides']
ts_data_one_location.head(10)

Unnamed: 0,pickup_hour,pickup_location_id,rides,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,target
0,2024-01-01 00:00:00,138,59,,,,,,,,...,,,,,,,,,,59
1,2024-01-01 01:00:00,138,4,,,,,,,,...,,,,,,,,,59.0,4
2,2024-01-01 02:00:00,138,0,,,,,,,,...,,,,,,,,59.0,4.0,0
3,2024-01-01 03:00:00,138,0,,,,,,,,...,,,,,,,59.0,4.0,0.0,0
4,2024-01-01 04:00:00,138,0,,,,,,,,...,,,,,,59.0,4.0,0.0,0.0,0
5,2024-01-01 05:00:00,138,1,,,,,,,,...,,,,,59.0,4.0,0.0,0.0,0.0,1
6,2024-01-01 06:00:00,138,1,,,,,,,,...,,,,59.0,4.0,0.0,0.0,0.0,1.0,1
7,2024-01-01 07:00:00,138,8,,,,,,,,...,,,59.0,4.0,0.0,0.0,0.0,1.0,1.0,8
8,2024-01-01 08:00:00,138,64,,,,,,,,...,,59.0,4.0,0.0,0.0,0.0,1.0,1.0,8.0,64
9,2024-01-01 09:00:00,138,139,,,,,,,,...,59.0,4.0,0.0,0.0,0.0,1.0,1.0,8.0,64.0,139


## Eliminamos valores nulos

In [6]:
# Eliminamos la columna 'rides' original y las filas con NaN (las primeras n_lags)
ts_data_one_location = ts_data_one_location.drop(columns=['rides'])
ts_data_one_location = ts_data_one_location.dropna().reset_index(drop=True)

ts_data_one_location.head(10)

Unnamed: 0,pickup_hour,pickup_location_id,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,target
0,2024-01-02 00:00:00,138,59.0,4.0,0.0,0.0,0.0,1.0,1.0,8.0,...,193.0,241.0,225.0,200.0,172.0,209.0,104.0,220.0,149.0,130
1,2024-01-02 01:00:00,138,4.0,0.0,0.0,0.0,1.0,1.0,8.0,64.0,...,241.0,225.0,200.0,172.0,209.0,104.0,220.0,149.0,130.0,27
2,2024-01-02 02:00:00,138,0.0,0.0,0.0,1.0,1.0,8.0,64.0,139.0,...,225.0,200.0,172.0,209.0,104.0,220.0,149.0,130.0,27.0,0
3,2024-01-02 03:00:00,138,0.0,0.0,1.0,1.0,8.0,64.0,139.0,199.0,...,200.0,172.0,209.0,104.0,220.0,149.0,130.0,27.0,0.0,1
4,2024-01-02 04:00:00,138,0.0,1.0,1.0,8.0,64.0,139.0,199.0,138.0,...,172.0,209.0,104.0,220.0,149.0,130.0,27.0,0.0,1.0,3
5,2024-01-02 05:00:00,138,1.0,1.0,8.0,64.0,139.0,199.0,138.0,146.0,...,209.0,104.0,220.0,149.0,130.0,27.0,0.0,1.0,3.0,2
6,2024-01-02 06:00:00,138,1.0,8.0,64.0,139.0,199.0,138.0,146.0,137.0,...,104.0,220.0,149.0,130.0,27.0,0.0,1.0,3.0,2.0,10
7,2024-01-02 07:00:00,138,8.0,64.0,139.0,199.0,138.0,146.0,137.0,147.0,...,220.0,149.0,130.0,27.0,0.0,1.0,3.0,2.0,10.0,33
8,2024-01-02 08:00:00,138,64.0,139.0,199.0,138.0,146.0,137.0,147.0,193.0,...,149.0,130.0,27.0,0.0,1.0,3.0,2.0,10.0,33.0,114
9,2024-01-02 09:00:00,138,139.0,199.0,138.0,146.0,137.0,147.0,193.0,241.0,...,130.0,27.0,0.0,1.0,3.0,2.0,10.0,33.0,114.0,172


# Seleccionamos las columnas de interés

In [7]:
# eliminamos la columna de la fecha y hora y la de la localización
ts_data_one_location = ts_data_one_location.drop(columns=['pickup_hour', 'pickup_location_id'])
ts_data_one_location.head(10)

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,target
0,59.0,4.0,0.0,0.0,0.0,1.0,1.0,8.0,64.0,139.0,...,193.0,241.0,225.0,200.0,172.0,209.0,104.0,220.0,149.0,130
1,4.0,0.0,0.0,0.0,1.0,1.0,8.0,64.0,139.0,199.0,...,241.0,225.0,200.0,172.0,209.0,104.0,220.0,149.0,130.0,27
2,0.0,0.0,0.0,1.0,1.0,8.0,64.0,139.0,199.0,138.0,...,225.0,200.0,172.0,209.0,104.0,220.0,149.0,130.0,27.0,0
3,0.0,0.0,1.0,1.0,8.0,64.0,139.0,199.0,138.0,146.0,...,200.0,172.0,209.0,104.0,220.0,149.0,130.0,27.0,0.0,1
4,0.0,1.0,1.0,8.0,64.0,139.0,199.0,138.0,146.0,137.0,...,172.0,209.0,104.0,220.0,149.0,130.0,27.0,0.0,1.0,3
5,1.0,1.0,8.0,64.0,139.0,199.0,138.0,146.0,137.0,147.0,...,209.0,104.0,220.0,149.0,130.0,27.0,0.0,1.0,3.0,2
6,1.0,8.0,64.0,139.0,199.0,138.0,146.0,137.0,147.0,193.0,...,104.0,220.0,149.0,130.0,27.0,0.0,1.0,3.0,2.0,10
7,8.0,64.0,139.0,199.0,138.0,146.0,137.0,147.0,193.0,241.0,...,220.0,149.0,130.0,27.0,0.0,1.0,3.0,2.0,10.0,33
8,64.0,139.0,199.0,138.0,146.0,137.0,147.0,193.0,241.0,225.0,...,149.0,130.0,27.0,0.0,1.0,3.0,2.0,10.0,33.0,114
9,139.0,199.0,138.0,146.0,137.0,147.0,193.0,241.0,225.0,200.0,...,130.0,27.0,0.0,1.0,3.0,2.0,10.0,33.0,114.0,172


## Dividimos en X e y

In [8]:
# Hacemos un split de X e y
X = ts_data_one_location.drop(columns=['target'])
y = ts_data_one_location['target']


## Paquetizamos en una función

In [9]:
# todo en una función
def create_lag_features(df: pd.DataFrame, n_lags: int = 24) -> pd.DataFrame:
    """
    Crea n_lags features con retrasos (lags) para la serie temporal de rides.
    La columna target será el valor actual a predecir.
    
    Args:
        df (pd.DataFrame): DataFrame con columnas ['pickup_hour', 'rides'] de una sola localización.
        n_lags (int): número de lags que queremos generar.

    Returns:
        pd.DataFrame: dataframe con columnas rides_previous_N_hour y target
    """
    
    # Nos aseguramos de que esté ordenado por fecha
    df = df.sort_values('pickup_hour').reset_index(drop=True)
    
    # Creamos las columnas de lags
    for lag in range(n_lags, 0, -1):
        df[f'rides_previous_{lag}_hour'] = df['rides'].shift(lag)
    
    # La columna target será la demanda real de la hora actual
    df['target'] = df['rides']
    
    # Eliminamos la columna 'rides' original y las filas con NaN (las primeras n_lags)
    df = df.drop(columns=['rides'])
    df = df.dropna().reset_index(drop=True)

    df = df.drop(columns=['pickup_hour', 'pickup_location_id'])

    features = df_features.drop(columns=['target'])
    target = df_features['target']
    
    return df, features, target



# ts_data_one_location ya contiene solo una zona específica
df_features, features, target = create_lag_features(ts_data_one_location, n_lags=24)
# Mostramos la tabla
df_features.head(20)

print(f'{features.shape=}')
print(f'{target.shape=}')


KeyError: 'pickup_hour'

In [None]:
df_features.to_parquet('../data/processed/features_target_2024_01.parquet')