# WWW 2025: CLIP-Enhanced Multimodal CTR Solution

This notebook implements a two-stage advanced solution:

### **Stage 1: CLIP Embedding Generation (Task 1)**
* **Model:** `openai/clip-vit-base-patch32` (Pre-trained Vision Transformer).
* **Hardware:** Utilizes **Dual T4 GPUs** for parallel inference.
* **Process:** Reads raw images (`<item_id>.jpg`, falling back to `<item_id>.png`) from `/kaggle/input/microlens/item_images/item_images`, encodes them into rich semantic vectors with CLIP, and projects them to 128-d using PCA.

### **Stage 2: Transformer_DCN Training (Task 2)**
* **Model:** Team momo's **Transformer_DCN** architecture.
* **Input:** The **NEW** CLIP-generated item embeddings + each user's item-interaction sequence (`item_seq`).
* **Goal:** High-performance CTR prediction.

In [None]:
# 1. Setup Environment
!pip install fuxictr==2.3.7 pandas==2.2.3 scikit-learn==1.4.0 transformers==4.38.0 pillow -q

import os
import torch
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import Dataset, DataLoader
from sklearn.decomposition import PCA

# Report which accelerators this session can see before any heavy work starts.
cuda_ready = torch.cuda.is_available()
print(f"CUDA Available: {cuda_ready}")
if cuda_ready:
    gpu_total = torch.cuda.device_count()
    gpu_names = [torch.cuda.get_device_name(gpu_idx) for gpu_idx in range(gpu_total)]
    print(f"GPU Count: {gpu_total}")
    print(f"Devices: {gpu_names}")

## Stage 1: Generate CLIP Embeddings

In [None]:
# 2. Define Image Dataset & CLIP Extractor

class RawImageDataset(Dataset):
    """Map-style dataset that turns item ids into CLIP-preprocessed pixel tensors.

    Args:
        item_ids: Indexable collection of item identifiers; each id doubles as
            the image file stem.
        img_root: Directory that holds ``<item_id>.jpg`` / ``<item_id>.png``.
        processor: Callable invoked as ``processor(images=..., return_tensors="pt")``
            whose result exposes a ``"pixel_values"`` entry.
    """

    def __init__(self, item_ids, img_root, processor):
        self.item_ids = item_ids
        self.img_root = img_root
        self.processor = processor

    def __len__(self):
        """Number of items (one sample per item id)."""
        return len(self.item_ids)

    def __getitem__(self, idx):
        """Return the processed pixel tensor for the item at position ``idx``.

        A missing image is replaced by a black 224x224 RGB picture before
        preprocessing. Any exception raised while loading or preprocessing is
        printed and a ``torch.zeros(3, 224, 224)`` tensor is returned instead.
        """
        item_id = self.item_ids[idx]

        # Prefer the JPEG file; only look at the PNG name when no JPEG exists.
        jpg_path = os.path.join(self.img_root, f"{item_id}.jpg")
        png_path = os.path.join(self.img_root, f"{item_id}.png")
        img_path = jpg_path if os.path.exists(jpg_path) else png_path

        try:
            if not os.path.exists(img_path):
                # Neither extension is present: substitute a black placeholder.
                image = Image.new("RGB", (224, 224), (0, 0, 0))
            else:
                image = Image.open(img_path).convert("RGB")

            encoded = self.processor(images=image, return_tensors="pt")
            pixel_values = encoded["pixel_values"]
            # Drop the leading batch axis added by the processor.
            return pixel_values.squeeze(0)
        except Exception as err:
            print(f"Error loading {item_id}: {err}")
            return torch.zeros(3, 224, 224)

def generate_clip_embeddings():
    """Encode every item image with CLIP, reduce to 128-d with PCA, and save the result.

    Steps:
      1. Load ``item_info.parquet`` (searching ``/kaggle/input`` recursively when
         the default path is absent) and read its ``item_id`` column.
      2. Load the CLIP processor/model. When more than one GPU is visible the
         image encoder is wrapped in ``DataParallel`` so each batch is split
         across the GPUs.
      3. Run inference over all items in the row order of the item table.
      4. Fit a seeded PCA with 128 components on the raw image features.
      5. Store the reduced vectors in column ``item_emb_d128`` and write
         ``data/MicroLens_1M_x1/item_info_task1.parquet``.

    Returns:
        str | None: Path of the written parquet file, or ``None`` when the item
        table cannot be found or contains no items.
    """
    # Configuration
    IMG_ROOT = "/kaggle/input/microlens/item_images/item_images"
    ITEM_INFO_PATH = "/kaggle/input/microlens/MicroLens_1M_x1/item_info.parquet"
    BATCH_SIZE = 256  # Adjust for GPU memory
    MODEL_ID = "openai/clip-vit-base-patch32"
    PCA_DIM = 128
    PCA_SEED = 2025  # fixed so repeated runs produce the same embeddings

    # 1. Load Item IDs
    if not os.path.exists(ITEM_INFO_PATH):
        # Fallback search if path varies
        import glob
        found = glob.glob("/kaggle/input/**/item_info.parquet", recursive=True)
        if not found:
            print("Dataset not found!")
            return None
        ITEM_INFO_PATH = found[0]

    print(f"Loading metadata from {ITEM_INFO_PATH}...")
    df_items = pd.read_parquet(ITEM_INFO_PATH)
    item_ids = df_items['item_id'].tolist()
    if not item_ids:
        # Nothing to encode; concatenating zero batches below would raise.
        print("Item table is empty; no embeddings generated.")
        return None

    # 2. Setup Model (Dual GPU)
    print(f"Loading {MODEL_ID}...")
    processor = CLIPProcessor.from_pretrained(MODEL_ID)
    clip_model = CLIPModel.from_pretrained(MODEL_ID)

    class _ImageEncoder(torch.nn.Module):
        """Expose ``get_image_features`` as ``forward``.

        ``DataParallel`` only scatters inputs that go through ``forward``;
        calling ``model.module.get_image_features`` directly would bypass it
        and run everything on a single GPU.
        """

        def __init__(self, clip):
            super().__init__()
            self.clip = clip

        def forward(self, pixel_values):
            return self.clip.get_image_features(pixel_values=pixel_values)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = _ImageEncoder(clip_model).to(device)
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs for inference!")
        encoder = torch.nn.DataParallel(encoder)
    encoder.eval()

    # 3. Inference Loop
    dataset = RawImageDataset(item_ids, IMG_ROOT, processor)
    # shuffle=False keeps the embedding rows aligned with the rows of df_items.
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

    all_embeddings = []
    print("Extracting features...")
    with torch.no_grad():
        for batch_imgs in tqdm(dataloader):
            batch_imgs = batch_imgs.to(device)
            # Get image features from CLIP (split across GPUs when wrapped).
            features = encoder(batch_imgs)
            all_embeddings.append(features.cpu().numpy())

    raw_embeddings = np.concatenate(all_embeddings, axis=0)
    print(f"Raw Embedding Shape: {raw_embeddings.shape}")

    # 4. Project to 128-dim using PCA (Task 1 Requirement)
    print("Reducing dimensions to 128 (PCA)...")
    pca = PCA(n_components=PCA_DIM, random_state=PCA_SEED)
    emb_128 = pca.fit_transform(raw_embeddings)

    # 5. Save New Dataset
    os.makedirs("data/MicroLens_1M_x1", exist_ok=True)
    # One 128-d vector per row, in the same order as the item table.
    df_items['item_emb_d128'] = list(emb_128)

    output_path = "data/MicroLens_1M_x1/item_info_task1.parquet"
    df_items.to_parquet(output_path)
    print(f"✓ New Item Info saved to: {output_path}")
    return output_path

# Run the extraction.
# Only the presence of the image directory is checked here; exceptions raised
# inside generate_clip_embeddings() are not caught and will stop the cell.
FALLBACK_ITEM_INFO_PATH = "./data/MicroLens_1M_x1/item_info.parquet"

if os.path.exists("/kaggle/input/microlens/item_images"):
    new_item_info_path = generate_clip_embeddings()
    if new_item_info_path is None:
        # generate_clip_embeddings() returns None when it has nothing to write
        # (e.g. item_info.parquet could not be located).
        print("⚠️ CLIP generation produced no output. Falling back to existing item info.")
        new_item_info_path = FALLBACK_ITEM_INFO_PATH
else:
    print("⚠️ Image directory not found. Skipping CLIP generation for demo.")
    # Use existing if images missing (fallback)
    new_item_info_path = FALLBACK_ITEM_INFO_PATH

## Stage 2: Transformer_DCN Setup & Training

In [None]:
# 3. Create Model Code (FuxiCTR + Momo Architecture)
import os

# Every target directory must exist before the generated files are written.
for required_dir in (
    "src",
    "config",
    "config/Transformer_DCN_microlens_mmctr_tuner_config_01",
    "config/transformer_dcn_config",
):
    os.makedirs(required_dir, exist_ok=True)

# Maps relative output path -> file contents; the entries are written to disk
# at the end of this cell.
files = {
    "fuxictr_version.py": """import fuxictr\nassert fuxictr.__version__ == "2.3.7"\n""",
    "src/__init__.py": """from .mmctr_dataloader import MMCTRDataLoader\nfrom .DIN import DIN\nfrom .Transformer_DCN import Transformer_DCN\nfrom .Transformer_DCN_Quant import Transformer_DCN_Quant""",
}

# Source text for src/mmctr_dataloader.py (written to disk at the end of this cell).
# - ParquetDataset loads one parquet file into a single 2-D numpy array and records,
#   per column, which array column(s) it occupies; object-dtype columns are expanded
#   into one array column per sequence position.
# - MMCTRDataLoader wraps that dataset in a torch DataLoader using BatchCollator.
# - BatchCollator keeps the last `max_len` entries of item_seq, builds
#   mask = (item_seq > 0), looks up item features by positional row (`iloc`) for the
#   sequence items plus the target item, and returns (batch_dict, item_dict, mask).
# NOTE(review): the `iloc` lookup assumes an item id equals its row position in the
# item_info parquet file — confirm against the dataset.
# The string below is runtime data: do not edit it for documentation purposes.
files["src/mmctr_dataloader.py"] = """import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate
import pandas as pd
import torch

class ParquetDataset(Dataset):
    def __init__(self, data_path):
        self.column_index = dict()
        self.darray = self.load_data(data_path)
    def __getitem__(self, index):
        return self.darray[index, :]
    def __len__(self):
        return self.darray.shape[0]
    def load_data(self, data_path):
        df = pd.read_parquet(data_path)
        data_arrays = []
        idx = 0
        for col in df.columns:
            if df[col].dtype == "object":
                array = np.array(df[col].to_list())
                seq_len = array.shape[1]
                self.column_index[col] = [i + idx for i in range(seq_len)]
                idx += seq_len
            else:
                array = df[col].to_numpy()
                self.column_index[col] = idx
                idx += 1
            data_arrays.append(array)
        return np.column_stack(data_arrays)

class MMCTRDataLoader(DataLoader):
    def __init__(self, feature_map, data_path, item_info, batch_size=32, shuffle=False,
                 num_workers=1, max_len=100, **kwargs):
        if not data_path.endswith(".parquet"):
            data_path += ".parquet"
        self.dataset = ParquetDataset(data_path)
        column_index = self.dataset.column_index
        super().__init__(dataset=self.dataset, batch_size=batch_size,
                         shuffle=shuffle, num_workers=num_workers,
                         collate_fn=BatchCollator(feature_map, max_len, column_index, item_info))
        self.num_samples = len(self.dataset)
        self.num_blocks = 1
        self.num_batches = int(np.ceil(self.num_samples / self.batch_size))
    def __len__(self):
        return self.num_batches

class BatchCollator(object):
    def __init__(self, feature_map, max_len, column_index, item_info):
        self.feature_map = feature_map
        self.item_info = pd.read_parquet(item_info)
        self.max_len = max_len
        self.column_index = column_index
    def __call__(self, batch):
        batch_tensor = default_collate(batch)
        all_cols = set(list(self.feature_map.features.keys()) + self.feature_map.labels)
        batch_dict = dict()
        for col, idx in self.column_index.items():
            if col in all_cols:
                batch_dict[col] = batch_tensor[:, idx]
        batch_seqs = batch_dict["item_seq"][:, -self.max_len:]
        del batch_dict["item_seq"]
        mask = (batch_seqs > 0).float()
        item_index = batch_dict["item_id"].numpy().reshape(-1, 1)
        del batch_dict["item_id"]
        batch_items = np.hstack([batch_seqs.numpy(), item_index]).flatten()
        item_info = self.item_info.iloc[batch_items]
        item_dict = dict()
        for col in item_info.columns:
            if col in all_cols:
                item_dict[col] = torch.from_numpy(np.array(item_info[col].to_list()))
        return batch_dict, item_dict, mask"""

# Source text for src/Transformer_DCN.py (written to disk at the end of this cell).
# - Transformer_DCN embeds the non-item features and the item features, pairs every
#   step of the behaviour sequence with the target item embedding, runs the result
#   through a Transformer encoder, then applies CrossNetV2 and an MLP side by side on
#   the concatenated vector and scores their concatenation with a final MLP.
# - The Transformer helper keeps the last `first_k_cols` encoder outputs and, when
#   `concat_max_pool` is set, appends a linearly projected max-pooled vector.
# - train_step divides the loss by `accumulation_steps` and only clips gradients and
#   steps the optimizer every `accumulation_steps` batches.
# NOTE(review): the collator builds mask = (item_seq > 0), i.e. 1 marks real items,
# yet Transformer.forward passes it as `src_key_padding_mask`, where PyTorch treats
# True as "ignore this position" — the polarity looks inverted; confirm before
# relying on it.
# NOTE(review): adjust_mask writes into the caller's mask tensor in place.
# The string below is runtime data: do not edit it for documentation purposes.
files["src/Transformer_DCN.py"] = """import torch
from fuxictr.utils import not_in_whitelist
from torch import nn
from fuxictr.pytorch.models import BaseModel
from fuxictr.pytorch.layers import FeatureEmbedding, MLP_Block, CrossNetV2

class Transformer_DCN(BaseModel):
    def __init__(self, feature_map, model_id="Transformer_DCN", gpu=-1, hidden_activations="ReLU",
                 dcn_cross_layers=3, dcn_hidden_units=[1024, 512, 256], mlp_hidden_units=[64, 32],
                 num_heads=1, transformer_layers=2, transformer_dropout=0.2, dim_feedforward=256,
                 learning_rate=5e-4, embedding_dim=64, net_dropout=0.2, first_k_cols=16,
                 batch_norm=False, concat_max_pool=True, accumulation_steps=1,
                 embedding_regularizer=None, net_regularizer=None, **kwargs):
        super().__init__(feature_map, model_id=model_id, gpu=gpu, embedding_regularizer=embedding_regularizer,
                         net_regularizer=net_regularizer, **kwargs)
        self.feature_map = feature_map
        self.embedding_dim = embedding_dim
        self.item_info_dim = 0
        for feat, spec in self.feature_map.features.items():
            if spec.get("source") == "item":
                self.item_info_dim += spec.get("embedding_dim", embedding_dim)
        transformer_in_dim = self.item_info_dim * 2
        self.accumulation_steps = accumulation_steps
        self.embedding_layer = FeatureEmbedding(feature_map, embedding_dim)
        self.transformer_encoder = Transformer(transformer_in_dim, dim_feedforward=dim_feedforward,
            num_heads=num_heads, dropout=transformer_dropout, transformer_layers=transformer_layers,
            first_k_cols=first_k_cols, concat_max_pool=concat_max_pool)
        seq_out_dim = (first_k_cols + int(concat_max_pool)) * transformer_in_dim
        dcn_in_dim = feature_map.sum_emb_out_dim() + seq_out_dim
        self.crossnet = CrossNetV2(dcn_in_dim, dcn_cross_layers)
        self.parallel_dnn = MLP_Block(input_dim=dcn_in_dim, output_dim=None, hidden_units=dcn_hidden_units,
                                      hidden_activations=hidden_activations, output_activation=None,
                                      dropout_rates=net_dropout, batch_norm=batch_norm)
        dcn_out_dim = dcn_in_dim + dcn_hidden_units[-1]
        self.mlp = MLP_Block(input_dim=dcn_out_dim, output_dim=1, hidden_units=mlp_hidden_units,
                             hidden_activations=hidden_activations, output_activation=self.output_activation)
        self.compile(kwargs["optimizer"], kwargs["loss"], learning_rate)
        self.reset_parameters()
        self.model_to_device()
    def forward(self, inputs):
        batch_dict, item_dict, mask = self.get_inputs(inputs)
        emb_list = []
        if batch_dict:
            feature_emb = self.embedding_layer(batch_dict, flatten_emb=True)
            emb_list.append(feature_emb)
        feat_emb = torch.cat(emb_list, dim=-1)
        item_feat_emb = self.embedding_layer(item_dict, flatten_emb=True)
        batch_size = mask.shape[0]
        item_feat_emb = item_feat_emb.view(batch_size, -1, self.item_info_dim)
        target_emb = item_feat_emb[:, -1, :]
        sequence_emb = item_feat_emb[:, 0:-1, :]
        transformer_emb = self.transformer_encoder(target_emb, sequence_emb, mask=mask)
        dcn_in_emb = torch.cat([feat_emb, target_emb, transformer_emb], dim=-1)
        cross_out = self.crossnet(dcn_in_emb)
        dnn_out = self.parallel_dnn(dcn_in_emb)
        y_pred = self.mlp(torch.cat([cross_out, dnn_out], dim=-1))
        return {"y_pred": y_pred}
    def get_inputs(self, inputs, feature_source=None):
        batch_dict, item_dict, mask = inputs
        X_dict = dict()
        for feature, value in batch_dict.items():
            if feature in self.feature_map.labels: continue
            feature_spec = self.feature_map.features[feature]
            if feature_spec["type"] == "meta": continue
            if feature_source and not_in_whitelist(feature_spec["source"], feature_source): continue
            X_dict[feature] = value.to(self.device)
        for item, value in item_dict.items():
            item_dict[item] = value.to(self.device)
        return X_dict, item_dict, mask.to(self.device)
    def concat_embedding(self, field, feature_emb_dict):
        if type(field) == tuple:
            emb_list = [feature_emb_dict[f] for f in field]
            return torch.cat(emb_list, dim=-1)
        else:
            return feature_emb_dict[field]
    def get_labels(self, inputs):
        labels = self.feature_map.labels
        batch_dict = inputs[0]
        y = batch_dict[labels[0]].to(self.device)
        return y.float().view(-1, 1)
    def get_group_id(self, inputs):
        return inputs[0][self.feature_map.group_id]
    def train_step(self, batch_data):
        return_dict = self.forward(batch_data)
        y_true = self.get_labels(batch_data)
        loss = self.compute_loss(return_dict, y_true)
        loss = loss / self.accumulation_steps
        loss.backward()
        if (self._batch_index + 1) % self.accumulation_steps == 0:
            nn.utils.clip_grad_norm_(self.parameters(), self._max_gradient_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
        return loss

class Transformer(nn.Module):
    def __init__(self, transformer_in_dim, dim_feedforward=64, num_heads=1, dropout=0,
                 transformer_layers=1, first_k_cols=16, concat_max_pool=True):
        super(Transformer, self).__init__()
        self.concat_max_pool = concat_max_pool
        self.first_k_cols = first_k_cols
        encoder_layer = nn.TransformerEncoderLayer(d_model=transformer_in_dim, nhead=num_heads,
            dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=transformer_layers)
        if self.concat_max_pool:
            self.out_linear = nn.Linear(transformer_in_dim, transformer_in_dim)
    def forward(self, target_emb, sequence_emb, mask=None):
        seq_len = sequence_emb.size(1)
        concat_seq_emb = torch.cat([sequence_emb, target_emb.unsqueeze(1).expand(-1, seq_len, -1)], dim=-1)
        key_padding_mask = self.adjust_mask(mask).bool()
        tfmr_out = self.transformer_encoder(src=concat_seq_emb, src_key_padding_mask=key_padding_mask)
        tfmr_out = tfmr_out.masked_fill(key_padding_mask.unsqueeze(-1).repeat(1, 1, tfmr_out.shape[-1]), 0.)
        output_concat = []
        output_concat.append(tfmr_out[:, -self.first_k_cols:].flatten(start_dim=1))
        if self.concat_max_pool:
            tfmr_out = tfmr_out.masked_fill(key_padding_mask.unsqueeze(-1).repeat(1, 1, tfmr_out.shape[-1]), -1e9)
            pooled_out = self.out_linear(tfmr_out.max(dim=1).values)
            output_concat.append(pooled_out)
        return torch.cat(output_concat, dim=-1)
    def adjust_mask(self, mask):
        fully_masked = mask.all(dim=-1)
        mask[fully_masked, -1] = 0
        return mask"""

# Source text for run_expid.py, the training entry point (written to disk at the end
# of this cell).
# The script loads the experiment config, builds the dataset and feature map,
# instantiates the model class named by params['model'] from the `src` package,
# trains with MMCTRDataLoader, evaluates on the validation split, and appends one
# result line to "<config dir name>.csv".
# The "\\n" near the end is double-escaped on purpose so the generated file contains
# the escape sequence \n inside its own string literal rather than a raw line break.
# The string below is runtime data: do not edit it for documentation purposes.
files["run_expid.py"] = """import os
import sys
import logging
import fuxictr_version
from fuxictr import datasets
from datetime import datetime
from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list
from fuxictr.features import FeatureMap
from fuxictr.pytorch.dataloaders import RankDataLoader
from fuxictr.pytorch.torch_utils import seed_everything
from fuxictr.preprocess import FeatureProcessor, build_dataset
import src as model_zoo
from src.mmctr_dataloader import MMCTRDataLoader
import gc
import argparse
import os
from pathlib import Path
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default='./config/', help='The config directory.')
    parser.add_argument('--expid', type=str, default='DeepFM_test', help='The experiment id to run.')
    parser.add_argument('--gpu', type=int, default=-1, help='The gpu index, -1 for cpu')
    args = vars(parser.parse_args())
    experiment_id = args['expid']
    params = load_config(args['config'], experiment_id)
    params['gpu'] = args['gpu']
    set_logger(params)
    logging.info("Params: " + print_to_json(params))
    seed_everything(seed=params['seed'])
    data_dir = os.path.join(params['data_root'], params['dataset_id'])
    feature_map_json = os.path.join(data_dir, "feature_map.json")
    feature_encoder = FeatureProcessor(**params)
    params["train_data"], params["valid_data"], params["test_data"] = build_dataset(feature_encoder, **params)
    feature_map = FeatureMap(params['dataset_id'], data_dir)
    feature_map.load(feature_map_json, params)
    logging.info("Feature specs: " + print_to_json(feature_map.features))
    model_class = getattr(model_zoo, params['model'])
    model = model_class(feature_map, **params)
    model.count_parameters()
    params["data_loader"] = MMCTRDataLoader
    train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator()
    model.fit(train_gen, validation_data=valid_gen, **params)
    logging.info('****** Validation evaluation ******')
    valid_result = model.evaluate(valid_gen)
    result_filename = Path(args['config']).name.replace(".yaml", "") + '.csv'
    with open(result_filename, 'a+') as fw:
        fw.write(' {},[command] python {},[exp_id] {},[dataset_id] {},[train] {},[val] {}\\n'.format(datetime.now().strftime('%Y%m%d-%H%M%S'), ' '.join(sys.argv), experiment_id, params['dataset_id'], "N.A.", print_to_list(valid_result)))"""

# Source text for prediction.py, the inference entry point (written to disk at the
# end of this cell).
# The script rebuilds the model the same way run_expid.py does, loads the saved
# checkpoint, predicts on the test split, writes submission/prediction.csv with the
# columns ID and Task2, and zips that CSV under the experiment id.
# NOTE(review): the train/valid iterators created before load_weights are not used
# afterwards in this script.
# The string below is runtime data: do not edit it for documentation purposes.
files["prediction.py"] = """import os
import sys
import logging
import fuxictr_version
from fuxictr import datasets
from datetime import datetime
from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list
from fuxictr.features import FeatureMap
from fuxictr.pytorch.dataloaders import RankDataLoader
from fuxictr.pytorch.torch_utils import seed_everything
from fuxictr.preprocess import FeatureProcessor, build_dataset
import src as model_zoo
from src.mmctr_dataloader import MMCTRDataLoader
import gc
import argparse
import os
from pathlib import Path
import pandas as pd
import shutil
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default='./config/', help='The config directory.')
    parser.add_argument('--expid', type=str, default='DeepFM_test', help='The experiment id to run.')
    parser.add_argument('--gpu', type=int, default=-1, help='The gpu index, -1 for cpu')
    args = vars(parser.parse_args())
    experiment_id = args['expid']
    params = load_config(args['config'], experiment_id)
    params['gpu'] = args['gpu']
    set_logger(params)
    logging.info("Params: " + print_to_json(params))
    seed_everything(seed=params['seed'])
    data_dir = os.path.join(params['data_root'], params['dataset_id'])
    feature_map_json = os.path.join(data_dir, "feature_map.json")
    feature_encoder = FeatureProcessor(**params)
    params["train_data"], params["valid_data"], params["test_data"] = build_dataset(feature_encoder, **params)
    feature_map = FeatureMap(params['dataset_id'], data_dir)
    feature_map.load(feature_map_json, params)
    logging.info("Feature specs: " + print_to_json(feature_map.features))
    model_class = getattr(model_zoo, params['model'])
    model = model_class(feature_map, **params)
    model.count_parameters()
    params["data_loader"] = MMCTRDataLoader
    train_gen, valid_gen = RankDataLoader(feature_map, stage='train', **params).make_iterator()
    model.load_weights(model.checkpoint)
    logging.info('Test scoring...')
    test_gen = RankDataLoader(feature_map, stage='test', **params).make_iterator()
    test_pred = model.predict(test_gen)
    ans = pd.DataFrame({"ID": range(test_pred.shape[0]), "Task2": test_pred})
    logging.info("Writing results...")
    os.makedirs("submission", exist_ok=True)
    ans.to_csv("submission/prediction.csv", index=False)
    shutil.make_archive(f'submission/{experiment_id}', 'zip', 'submission/', 'prediction.csv')
    logging.info("All done.")"""

# Placeholder classes: src/__init__.py imports these names, so the modules must exist.
files.update({
    "src/DIN.py": """from fuxictr.pytorch.models import BaseModel\nclass DIN(BaseModel): pass""",
    "src/Transformer_DCN_Quant.py": """from fuxictr.pytorch.models import BaseModel\nclass Transformer_DCN_Quant(BaseModel): pass""",
})

# Materialise every collected source file on disk, overwriting older copies.
for target_path in files:
    with open(target_path, "w") as out_file:
        out_file.write(files[target_path])
print("✓ Model source files created")

In [None]:
# 4. Write Configuration (Optimized for Speed + New Embeddings)
import json
import os

# Create directories just in case
os.makedirs("config/Transformer_DCN_microlens_mmctr_tuner_config_01", exist_ok=True)
os.makedirs("data/MicroLens_1M_x1", exist_ok=True)

# DATASET CONFIG (Points to our new item_info_task1.parquet)
# Declares the feature schema for dataset id MicroLens_1M_x1 together with the
# train/valid/test parquet paths under ./data/.
# NOTE(review): item_info_task1.parquet is only produced when the CLIP stage actually
# ran; confirm the file exists before starting training.
# The YAML text below is runtime data and is written to disk verbatim.
dataset_config = """MicroLens_1M_x1:
    data_format: parquet
    data_root: ./data/
    feature_cols:
    - {active: true, dtype: int, name: user_id, type: meta}
    - {active: true, dtype: int, name: item_seq, type: meta}
    - {active: true, dtype: int, name: likes_level, type: categorical, vocab_size: 11}
    - {active: true, dtype: int, name: views_level, type: categorical, vocab_size: 11}
    - {active: true, dtype: int, name: item_id, source: item, type: categorical, vocab_size: 91718}
    - {active: true, dtype: int, max_len: 5, name: item_tags, source: item, type: sequence, vocab_size: 11740}
    - {active: true, dtype: float, embedding_dim: 128, name: item_emb_d128, source: item, type: embedding}
    item_info: ./data/MicroLens_1M_x1/item_info_task1.parquet
    label_col: {dtype: float, name: label}
    rebuild_dataset: False
    test_data: ./data/MicroLens_1M_x1/test.parquet
    train_data: ./data/MicroLens_1M_x1/train.parquet
    valid_data: ./data/MicroLens_1M_x1/valid.parquet"""

with open("config/Transformer_DCN_microlens_mmctr_tuner_config_01/dataset_config.yaml", "w") as f: 
    f.write(dataset_config)

# MODEL CONFIG (Reduced Epochs/Dims for Kaggle Time Limits)
# The top-level key is the experiment id that the training/prediction cell passes
# via --expid; dataset_id links this experiment to the dataset config written above.
# The YAML text below is runtime data and is written to disk verbatim.
model_config = """Transformer_DCN_MicroLens_1M_x1_001_820c435c:
    batch_norm: false
    batch_size: 1024
    concat_max_pool: true
    dataset_id: MicroLens_1M_x1
    dcn_cross_layers: 3
    dcn_hidden_units: [512, 256]
    debug_mode: false
    dim_feedforward: 128
    early_stop_patience: 2
    embedding_dim: 32
    embedding_regularizer: 0
    epochs: 5
    eval_steps: null
    feature_config: null
    feature_specs: null
    first_k_cols: 8
    group_id: user_id
    hidden_activations: relu
    learning_rate: 0.001
    loss: binary_crossentropy
    metrics: [logloss, AUC]
    mlp_hidden_units: [32]
    model: Transformer_DCN
    model_root: ./checkpoints/
    monitor: {AUC: 1}
    monitor_mode: max
    net_dropout: 0.1
    net_regularizer: 0
    num_heads: 1
    num_workers: 8
    optimizer: adam
    pickle_feature_encoder: true
    save_best_only: true
    seed: 20242025
    shuffle: true
    task: binary_classification
    transformer_dropout: 0.1
    transformer_layers: 1
    use_features: null
    verbose: 1"""

with open("config/Transformer_DCN_microlens_mmctr_tuner_config_01/model_config.yaml", "w") as f: 
    f.write(model_config)

# MANUAL FEATURE MAP (Fixes schema errors)
# Hand-written feature_map.json; one entry per feature column of the dataset config.
# Keyword order inside each dict(...) is the key order that ends up in the JSON file.
feature_map = {
    "dataset_id": "MicroLens_1M_x1",
    "num_fields": 7,
    "num_features": -1,
    "input_length": -1,
    "features": {
        "user_id": dict(active=True, dtype="int", name="user_id", type="meta"),
        "item_seq": dict(active=True, dtype="int", name="item_seq", type="meta"),
        "likes_level": dict(active=True, dtype="int", name="likes_level",
                            type="categorical", vocab_size=11),
        "views_level": dict(active=True, dtype="int", name="views_level",
                            type="categorical", vocab_size=11),
        "item_id": dict(active=True, dtype="int", name="item_id", source="item",
                        type="categorical", vocab_size=91718),
        "item_tags": dict(active=True, dtype="int", max_len=5, name="item_tags",
                          source="item", type="sequence", vocab_size=11740),
        "item_emb_d128": dict(active=True, dtype="float", embedding_dim=128,
                              name="item_emb_d128", source="item", type="embedding"),
    },
    "labels": ["label"],
}

# Serialise first, then write the finished text in one go.
feature_map_json = json.dumps(feature_map, indent=4)
with open("data/MicroLens_1M_x1/feature_map.json", "w") as map_file:
    map_file.write(feature_map_json)

# Link Data
import glob


def link_dataset_files(source_dir, target_dir):
    """Symlink every entry of ``source_dir`` into ``target_dir``.

    Names that already exist in ``target_dir`` are left untouched, so files
    generated locally (for example a freshly built item info parquet) are
    never replaced and the cell can be re-run safely.

    Args:
        source_dir: Directory whose entries should be exposed.
        target_dir: Directory that receives the symlinks; created if missing.

    Returns:
        list[str]: Names that were newly linked, in sorted order.
    """
    os.makedirs(target_dir, exist_ok=True)
    linked = []
    for name in sorted(os.listdir(source_dir)):
        src = os.path.join(source_dir, name)
        dst = os.path.join(target_dir, name)
        # lexists (not exists): a dangling symlink left by an earlier session
        # still occupies the name and would make os.symlink raise FileExistsError.
        if os.path.lexists(dst):
            continue
        os.symlink(src, dst)
        linked.append(name)
    return linked


# Try the flat location first; glob's recursive flag only matters with "**",
# so fall back to a real recursive search when the file lives in a sub-folder.
train_files = glob.glob("/kaggle/input/microlens/train.parquet")
if not train_files:
    train_files = sorted(glob.glob("/kaggle/input/**/train.parquet", recursive=True))

if train_files:
    dataset_dir = os.path.dirname(train_files[0])
    local_data_dir = "./data/MicroLens_1M_x1"
    link_dataset_files(dataset_dir, local_data_dir)
    print("✓ Data linked")
else:
    print("⚠️ train.parquet not found under /kaggle/input; no data was linked.")

In [10]:
# 5. Run Training & Prediction
print("Starting Training...")
!python run_expid.py --config config/Transformer_DCN_microlens_mmctr_tuner_config_01 --expid Transformer_DCN_MicroLens_1M_x1_001_820c435c --gpu 0

print("Starting Prediction...")
!python prediction.py --config config/Transformer_DCN_microlens_mmctr_tuner_config_01 --expid Transformer_DCN_MicroLens_1M_x1_001_820c435c --gpu 0

print("✓ Done! Submission file is ready.")

Starting Training...
2025-12-05 17:13:39,647 P526 INFO FuxiCTR version: 2.3.7
2025-12-05 17:13:39,647 P526 INFO Params: {
    "batch_norm": "False",
    "batch_size": "1024",
    "concat_max_pool": "True",
    "data_format": "parquet",
    "data_root": "./data/",
    "dataset_id": "MicroLens_1M_x1",
    "dcn_cross_layers": "3",
    "dcn_hidden_units": "[512, 256]",
    "debug_mode": "False",
    "dim_feedforward": "128",
    "early_stop_patience": "2",
    "embedding_dim": "32",
    "embedding_regularizer": "0",
    "epochs": "5",
    "eval_steps": "None",
    "feature_cols": "[{'active': True, 'dtype': 'int', 'name': 'user_id', 'type': 'meta'}, {'active': True, 'dtype': 'int', 'name': 'item_seq', 'type': 'meta'}, {'active': True, 'dtype': 'int', 'name': 'likes_level', 'type': 'categorical', 'vocab_size': 11}, {'active': True, 'dtype': 'int', 'name': 'views_level', 'type': 'categorical', 'vocab_size': 11}, {'active': True, 'dtype': 'int', 'name': 'item_id', 'source': 'item', 'type': 'c