In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the cleaned BIXI trip table; both timestamp columns are parsed
# into datetimes at load time so the `.dt` accessors work later on.
df = pd.read_csv(
    '../data/clean_bixi_2023.csv',
    parse_dates=['start_time', 'end_time'],
)

### Temporal Feature Engineering

In [4]:
# Calendar features derived from the trip start timestamp.
df['hour'] = df['start_time'].dt.hour
df['day_of_week'] = df['start_time'].dt.dayofweek  # 0 = Monday ... 6 = Sunday
df['day_name'] = df['start_time'].dt.day_name()
df['month'] = df['start_time'].dt.month
# Weekend flag (1 for Saturday/Sunday, else 0). A vectorized comparison
# replaces the per-element Python lambda; the result is the same int64 column.
df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')

### Spatial Feature Engineering

In [5]:
from math import radians, sin, cos, sqrt, atan2

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometers between two points given in degrees.

    Works on plain scalars as well as array-likes (NumPy arrays, pandas
    Series) of broadcastable shape, so an entire column of trips can be
    handled in a single call instead of one row at a time.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float or array-like
        Distance in kilometers, using a spherical Earth of radius 6371 km.
        NaN coordinates produce NaN distances.
    """
    R = 6371  # Earth radius in kilometers
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) invalid.
    # np.clip keeps NaN as NaN, so missing coordinates still propagate.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# Straight-line (haversine) length of each trip in km, evaluated row by row
# on the four coordinate columns in the order haversine() expects them.
df['trip_distance_km'] = df[['start_lat', 'start_lon', 'end_lat', 'end_lon']].apply(
    lambda trip: haversine(*trip),
    axis=1,
)

In [6]:
# Popularity of a start station = how many trips in `df` begin there.
# The lookup dict is built once and then mapped back onto every row.
start_counts = (
    df['start_station_name']
    .value_counts()
    .to_dict()
)
df['start_station_popularity'] = df['start_station_name'].map(start_counts)

In [7]:
# Mean trip duration (minutes) over all trips leaving each start station,
# attached to every row as a per-station feature.
# NOTE(review): this aggregates `trip_duration_min`, which is later used as
# the prediction target, over the full frame before the train/test split.
# Each row's own target (and the future test rows) therefore feed into this
# feature; consider computing it from the training split only.
avg_duration = (
    df.groupby('start_station_name')['trip_duration_min']
    .mean()
    .to_dict()
)
df['avg_trip_duration_from_station'] = df['start_station_name'].map(avg_duration)

In [8]:
# Remove the raw identifier/timestamp columns now that the engineered
# features derived from them exist. Reassigning instead of `inplace=True`
# follows the recommended pandas idiom; a missing column still raises
# KeyError so schema problems are not hidden.
df = df.drop(columns=[
    'start_station_name',
    'end_station_name',
    'start_time',
    'end_time',
    'day_name',
])

### Train-Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Column to predict, and every remaining column as a model input.
target = 'trip_duration_min'
# Drop the label from the column Index directly; this yields the same Index
# as `df.drop(columns=[target]).columns` without copying the whole frame.
features = df.columns.drop(target)

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=42,
)

In [12]:
# Persist the four splits for the next step, without the row index.
for _split, _csv_path in [
    (X_train, '../data/X_train.csv'),
    (X_test, '../data/X_test.csv'),
    (y_train, '../data/y_train.csv'),
    (y_test, '../data/y_test.csv'),
]:
    _split.to_csv(_csv_path, index=False)

In [13]:
# Record each feature name alongside the dtype it has in the training
# frame, and write the table next to the exported splits.
feature_metadata = pd.DataFrame(
    {
        'feature': features,
        'dtype': X_train.dtypes.to_numpy(),
    }
)
feature_metadata.to_csv('../data/feature_metadata.csv', index=False)