In [None]:
pip install deepctr-torch==0.2.8  # includes DCN v2 (“DCNMix”)
pip install pytorch-tabular optuna torchmetrics


In [None]:
import pandas as pd, numpy as np
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from sklearn.preprocessing import StandardScaler, LabelEncoder

df = pd.read_parquet("fhv_clean_2019_2022.parquet")

# -------- continuous -------------
dense_feats = ['trip_miles','trip_time','avg_speed',
               'tolls','bcf','sales_tax','congestion_surcharge',
               'airport_fee','wait_request_pickup','wait_request_scene']

# Fit the scaling statistics on the training period only, so that the mean and
# variance of the held-out period do not leak into the training features.
# The cutoff must stay identical to the one used for the train/validation split.
scaler_fit_rows = df['pickup_datetime'] < '2022-04-01'
scaler = StandardScaler().fit(df.loc[scaler_fit_rows, dense_feats])
df[dense_feats] = scaler.transform(df[dense_feats])

# -------- categorical ------------
sparse_feats = ['hvfhs_license_num','dispatching_base_num',
                'originating_base_num','PULocationID','DOLocationID',
                'PU_DO_pair','shared_request_flag','shared_match_flag',
                'access_a_ride_flag','wav_request_flag','wav_match_flag',
                'pickup_hour','pickup_dow','pickup_month','covid_phase']

# Encoders are fit on every row on purpose: the vocabulary has to cover all
# categories that can show up at validation time.
encoders = {}
for col in sparse_feats:
    encoder = LabelEncoder().fit(df[col])
    df[col] = encoder.transform(df[col])
    encoders[col] = encoder

# After label encoding the codes are 0..n_classes-1, so the number of classes
# is both the vocabulary size (max code + 1) and the column's cardinality.
vocab_sizes = {col: len(encoders[col].classes_) for col in sparse_feats}

feature_columns = (
    [DenseFeat(col, 1) for col in dense_feats] +
    [SparseFeat(col,
                vocabulary_size=vocab_sizes[col],
                # Rule-of-thumb width 1.6 * sqrt(cardinality), capped at 50.
                # int(...) guarantees a plain Python int even when round()
                # hands back a NumPy float; max(1, ...) guards a zero width.
                embedding_dim=int(min(50, max(1, round(1.6*np.sqrt(vocab_sizes[col]))))))
     for col in sparse_feats]
)


In [None]:
from deepctr_torch.callbacks import EarlyStopping
from deepctr_torch.models import DCNMix
import torch
from torch.utils.data import DataLoader, TensorDataset

fixlen_feature_names = get_feature_names(feature_columns)

# DeepCTR-Torch takes its input as {feature name: 1-D array}. Each column keeps
# its own dtype: casting the whole matrix to int64 would truncate the
# standardized dense features to integers.
X = {name: df[name].to_numpy() for name in fixlen_feature_names}

# Target is log1p(fare), so the loss and the metric below are in log-space.
# NOTE(review): log1p gives NaN/-inf for fares <= -1 — confirm the cleaned
# data contains none.
y = np.log1p(df['base_passenger_fare'].values.astype('float32'))

# Temporal hold-out: everything from 2022-04-01 onward is validation data.
train_mask = (df['pickup_datetime'] < '2022-04-01').to_numpy()
assert train_mask.any() and not train_mask.all(), \
    "temporal split left the train or validation side empty"

X_train = {name: values[train_mask] for name, values in X.items()}
X_val   = {name: values[~train_mask] for name, values in X.items()}
y_train, y_val = y[train_mask], y[~train_mask]

device = "cuda" if torch.cuda.is_available() else "cpu"

model = DCNMix(
    linear_feature_columns=feature_columns,   # required by the constructor
    dnn_feature_columns=feature_columns,
    cross_num=4,               # number of cross layers
    low_rank=32,               # "mixed" low-rank variant
    dnn_hidden_units=(256,128,64),
    dnn_dropout=0.2,
    task='regression',
    l2_reg_embedding=1e-6,
    device=device,             # the model moves each batch to this device
).to(device)

model.compile(
    "adam", "mse",
    metrics=["mse"],  # computed in log-space
)

# Callbacks must be callback objects, not strings. The monitored key is the
# metric name prefixed with "val_". patience=2 is a tunable choice.
early_stopping = EarlyStopping(
    monitor="val_mse", min_delta=0, patience=2, mode="min", verbose=1
)

model.fit(
    X_train, y_train,
    batch_size=65536,
    epochs=20,
    verbose=2,
    # Validate on the temporal hold-out rather than on the tail of the
    # training rows, which is what validation_split would use.
    validation_data=(X_val, y_val),
    shuffle=True,
    callbacks=[early_stopping],
)
