# Feature Engineering

In [34]:
import pandas as pd 
import numpy as np 

In [35]:
def save_interim(df, name, format="parquet"):
    path = f"/Users/emiliodulay/Documents/1. UCLA/MATH 156/data/{name}.{format}"
    if format == "parquet":
        df.to_parquet(path, engine="fastparquet")
    elif format == "pickle":
        df.to_pickle(path)
    print(f"Saved interim data to {path}")


## Load Interim data

In [36]:
train_df = pd.read_parquet('/Users/emiliodulay/Documents/1. UCLA/MATH 156/data/train_interim.parquet')
test_df  = pd.read_parquet('/Users/emiliodulay/Documents/1. UCLA/MATH 156/data/test_interim.parquet')

In [37]:
target_col = 'trip_duration'
X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]


X_test = test_df

In [38]:
# remove outliers using IQR (NOT USED)
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_filtered = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df_filtered

## Baseline FE 
- haversine distance
- datetime patterns 
- categorical: vendor_id, store_and_fwd_flag

### Haversine distance
- haversine distance: shortest dist accounting for earth curvature 


In [39]:
def haversine_distance(lat1, lon1, lat2, lon2): 
    R = 6371 # earth rad km 
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = phi2 - phi1
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

In [40]:
for df in [X_train, X_test]:
    df['distance'] = haversine_distance(
        df['pickup_latitude'], df['pickup_longitude'],
        df['dropoff_latitude'], df['dropoff_longitude']
    )

### Datetime cyclic patterns
- separate into hour, day, rest day (holiday vs business day)

In [41]:
for df in [X_train, X_test]:
    df['pickup_hour'] = df['pickup_datetime'].dt.hour
    df['pickup_dayofweek'] = df['pickup_datetime'].dt.dayofweek
    # weekend: 1, weekday: 0
    df['is_rest_day'] = (df['pickup_dayofweek'] >= 5).astype(int)

### Categorical features

In [42]:
cat_cols = ['vendor_id', 'store_and_fwd_flag']

### RESULT: BASELINE FE

In [43]:
drop_cols = ['id', 'pickup_datetime', 'dropoff_datetime',
             'pickup_latitude', 'pickup_longitude',
             'dropoff_latitude', 'dropoff_longitude']

X_train_baseline = X_train.drop(columns=drop_cols)
X_test_baseline = X_train.drop(columns=drop_cols)

In [44]:
# save baseline 
save_interim(X_train_baseline, 'X_train_baseline.parquet')
save_interim(X_test_baseline, 'X_test_baseline.parquet')

Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/X_train_baseline.parquet.parquet
Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/X_test_baseline.parquet.parquet


In [45]:
save_interim(y_train.to_frame(), 'y_train.parquet')

Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/y_train.parquet.parquet


In [46]:
print("Training features shape:", X_train_baseline.shape)
print("Test features shape:", X_test_baseline.shape)
print("Feature columns:", X_train_baseline.columns.tolist())

Training features shape: (1458644, 7)
Test features shape: (1458644, 7)
Feature columns: ['vendor_id', 'passenger_count', 'store_and_fwd_flag', 'distance', 'pickup_hour', 'pickup_dayofweek', 'is_rest_day']


## Advanced FE
- indep of baseline FE

In [48]:
from sklearn.cluster import KMeans 

### Spatial clustering
- distance metrics: euclidean, manhatten, speed estimate 
- cluster pickup, dropoff, center coord using KMeans 
- per cluster: avg trip duration, count of trips, avg speed (dist/trip duration)

Captures spatial patterns that poly won't find. 

In [49]:
def compute_basic_distances(df):
    df['euclid_dist'] = np.sqrt(
        (df['pickup_latitude'] - df['dropoff_latitude'])**2 +
        (df['pickup_longitude'] - df['dropoff_longitude'])**2
    )
    
    df['manhattan_dist'] = (
        np.abs(df['pickup_latitude'] - df['dropoff_latitude']) +
        np.abs(df['pickup_longitude'] - df['dropoff_longitude'])
    )

    df['distance'] = haversine_distance(
        df['pickup_latitude'], df['pickup_longitude'],
        df['dropoff_latitude'], df['dropoff_longitude']
    )
    
    df['raw_speed'] = df['manhattan_dist'] / (df['trip_duration'] + 1)
    
    return df

### Direction features 
- use angle + sin/cos expansion, 4 bin direction

In [50]:
def compute_direction(df):
    dx = df['dropoff_longitude'] - df['pickup_longitude']
    dy = df['dropoff_latitude'] - df['pickup_latitude']
    
    df['direction_angle'] = np.arctan2(dy, dx)
    
    df['dir_sin'] = np.sin(df['direction_angle'])
    df['dir_cos'] = np.cos(df['direction_angle'])
    
    df['dir_bin4'] = pd.cut(df['direction_angle'],
                            bins=4,
                            labels=False)
    
    return df

### Time features 

In [51]:
def compute_time(df):
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

    df['hour'] = df['pickup_datetime'].dt.hour
    df['weekday'] = df['pickup_datetime'].dt.weekday
    df['week_hour'] = df['weekday'] * 24 + df['hour']
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    df['pickup_hour'] = df['pickup_datetime'].dt.hour
    df['pickup_dayofweek'] = df['pickup_datetime'].dt.dayofweek
    # weekend: 1, weekday: 0
    df['is_rest_day'] = (df['pickup_dayofweek'] >= 5).astype(int)
    
    return df

### Rotated Manhatten grid features 


In [52]:
def rotate_coordinates(df, angle_deg=45):
    angle = np.radians(angle_deg)
    cosA, sinA = np.cos(angle), np.sin(angle)

    # pickup
    df['pickup_rot_x'] = df['pickup_longitude'] * cosA - df['pickup_latitude'] * sinA
    df['pickup_rot_y'] = df['pickup_longitude'] * sinA + df['pickup_latitude'] * cosA

    # dropoff
    df['dropoff_rot_x'] = df['dropoff_longitude'] * cosA - df['dropoff_latitude'] * sinA
    df['dropoff_rot_y'] = df['dropoff_longitude'] * sinA + df['dropoff_latitude'] * cosA

    # rotated distances
    df['rot_ns_dist'] = np.abs(df['pickup_rot_y'] - df['dropoff_rot_y'])
    df['rot_ew_dist'] = np.abs(df['pickup_rot_x'] - df['dropoff_rot_x'])

    return df

### Low-k clustering for poly 
- k = 10, don't use large bins like 100/50 used by XGB

In [53]:
def add_clusters(df, cluster_n=10):
    # pickup clusters
    km_pu = KMeans(n_clusters=cluster_n, n_init='auto', random_state=42)
    df['pickup_cluster'] = km_pu.fit_predict(df[['pickup_latitude','pickup_longitude']])

    # dropoff clusters
    km_do = KMeans(n_clusters=cluster_n, n_init='auto', random_state=42)
    df['dropoff_cluster'] = km_do.fit_predict(df[['dropoff_latitude','dropoff_longitude']])

    # center cluster
    df['center_lat'] = (df['pickup_latitude'] + df['dropoff_latitude']) / 2
    df['center_lon'] = (df['pickup_longitude'] + df['dropoff_longitude']) / 2

    km_center = KMeans(n_clusters=cluster_n, n_init='auto', random_state=42)
    df['center_cluster'] = km_center.fit_predict(df[['center_lat','center_lon']])

    return df

### Agg using 3-4 groups
- mean speed pickup, dropoff, center cluster, speed by hour 

In [54]:
def aggregate_cluster_speeds(df):
    # pickup cluster
    df['pu_cluster_avg_speed'] = df.groupby('pickup_cluster')['raw_speed'].transform('mean')
    
    # dropoff cluster
    df['do_cluster_avg_speed'] = df.groupby('dropoff_cluster')['raw_speed'].transform('mean')
    
    # center cluster
    df['center_cluster_avg_speed'] = df.groupby('center_cluster')['raw_speed'].transform('mean')
    
    # mean speed by hour
    df['hour_avg_speed'] = df.groupby('hour')['raw_speed'].transform('mean')

    return df

### Advanced FE Pipeline

In [55]:
def poly_feature_engineer(df, cluster_n=10, rotate_angle=45):
    df = df.copy()

    print('===compute_basic_distances===')
    df = compute_basic_distances(df)
    print('===compute_direction===')
    df = compute_direction(df)
    print('===compute_time===')
    df = compute_time(df)
    print('===rotate_coordinates===')
    df = rotate_coordinates(df, angle_deg=rotate_angle)
    print('===add_clusters===')
    df = add_clusters(df, cluster_n=cluster_n)
    print('===aggregate_cluster_speeds===')
    df = aggregate_cluster_speeds(df)

    return df

### RESULT ADVANCED FE

In [56]:
X_train_advanced = poly_feature_engineer(train_df)

===compute_basic_distances===
===compute_direction===
===compute_time===
===rotate_coordinates===
===add_clusters===
===aggregate_cluster_speeds===


In [58]:
# save advanced parquet (y already saved) 
save_interim(X_train_advanced, 'X_train_advanced_smaller.parquet')

Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/X_train_advanced_smaller.parquet.parquet


In [59]:
print("Training features shape:", X_train_advanced.shape)
print("Test features shape:", X_train_advanced.shape)
print("Feature columns:", X_train_advanced.columns.tolist())

Training features shape: (1458644, 41)
Test features shape: (1458644, 41)
Feature columns: ['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag', 'trip_duration', 'euclid_dist', 'manhattan_dist', 'distance', 'raw_speed', 'direction_angle', 'dir_sin', 'dir_cos', 'dir_bin4', 'hour', 'weekday', 'week_hour', 'is_weekend', 'pickup_hour', 'pickup_dayofweek', 'is_rest_day', 'pickup_rot_x', 'pickup_rot_y', 'dropoff_rot_x', 'dropoff_rot_y', 'rot_ns_dist', 'rot_ew_dist', 'pickup_cluster', 'dropoff_cluster', 'center_lat', 'center_lon', 'center_cluster', 'pu_cluster_avg_speed', 'do_cluster_avg_speed', 'center_cluster_avg_speed', 'hour_avg_speed']


In [60]:
X_train_advanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 41 columns):
 #   Column                    Non-Null Count    Dtype         
---  ------                    --------------    -----         
 0   id                        1458644 non-null  object        
 1   vendor_id                 1458644 non-null  int64         
 2   pickup_datetime           1458644 non-null  datetime64[ns]
 3   dropoff_datetime          1458644 non-null  datetime64[ns]
 4   passenger_count           1458644 non-null  int64         
 5   pickup_longitude          1458644 non-null  float64       
 6   pickup_latitude           1458644 non-null  float64       
 7   dropoff_longitude         1458644 non-null  float64       
 8   dropoff_latitude          1458644 non-null  float64       
 9   store_and_fwd_flag        1458644 non-null  object        
 10  trip_duration             1458644 non-null  int64         
 11  euclid_dist               1458644 non-null  float6

## Create model combinations
Given that we have multiple features within each category, choose one feature of each and find the lowest RMSLE 

In [64]:
feature_combinations = {
     "combo_A": [
         'euclid_dist',
         'dir_sin', 'dir_cos',
         'hour', 'weekday',
         'center_cluster_avg_speed',
         'passenger_count',
         'vendor_id'
     ],

     "combo_B": [
         'rot_ns_dist', 'rot_ew_dist',
         'direction_angle',
         'week_hour',
         'pu_cluster_avg_speed', 'do_cluster_avg_speed',
         'passenger_count'
     ],

     "combo_C": [
         'manhattan_dist',
         'dir_bin4',
         'is_rest_day',
         'center_cluster',
         'vendor_id'
     ],

     "combo_D": [
         'distance',
         'dir_bin4',
         'hour_avg_speed',
         'center_cluster',
         'passenger_count'
     ],

     "combo_E": [
         'distance',
         'dir_sin', 'dir_cos',
         'week_hour',
         'center_cluster_avg_speed',
         'pu_cluster_avg_speed',
         'store_and_fwd_flag',
         'passenger_count'
     ], 
    
    "combo_F": [
        'rot_ns_dist', 
        'rot_ew_dist', 
        'pu_cluster_avg_speed', 
        'do_cluster_avg_speed', 
        'passenger_count'
    ]
}


In [65]:
for combo_name, features in feature_combinations.items():
    df_subset = X_train_advanced[features].copy()
    save_interim(df_subset, f"{combo_name}.parquet")
    print(f"Saved {combo_name} with {len(features)} features")

Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/combo_A.parquet.parquet
Saved combo_A with 8 features
Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/combo_B.parquet.parquet
Saved combo_B with 7 features
Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/combo_C.parquet.parquet
Saved combo_C with 5 features
Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/combo_D.parquet.parquet
Saved combo_D with 5 features
Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/combo_E.parquet.parquet
Saved combo_E with 8 features
Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/combo_F.parquet.parquet
Saved combo_F with 5 features
