# Compare Data Modules

This notebook compares the data samples produced by `TransactionDataModule` (tabular) and `TextDataModule` (transformer text).

In [13]:
import sys
from pathlib import Path

# Make the project's src/ directory importable. The parent directory is
# resolved relative to the current working directory of the kernel.
# The membership check keeps this cell idempotent: re-running it does not
# pile up duplicate entries on sys.path.
project_root = Path("..").resolve()
src_dir = str(project_root / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

import torch
import numpy as np
import matplotlib.pyplot as plt
from ml_ops_project.data import TransactionDataModule
from ml_ops_project.data_transformer import TextDataModule

# Seed the global torch / NumPy generators before any splitting or shuffling
# so the batches printed further down can be reproduced after a kernel restart.
# NOTE(review): this only helps if both DataModules draw from these global
# RNGs rather than from private generators — confirm against their source.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Small batches keep the printed tensors readable.
BATCH_SIZE = 4

# Initialize Modules
# Note: We assume that the data has already been preprocessed and exists in data/processed
data_path = project_root / "data" / "processed" / "transactiq_processed"
tabular_dm = TransactionDataModule(data_path=str(data_path), batch_size=BATCH_SIZE)
tabular_dm.setup()

text_path = project_root / "data"
text_dm = TextDataModule(data_path=str(text_path), batch_size=BATCH_SIZE)
# Override processed_path explicitly before calling setup(); presumably this is
# the location setup() reads the pre-tokenized text dataset from — TODO confirm.
text_dm.processed_path = text_path / "processed" / "transactiq_processed_text"
text_dm.setup()

Loading data from /home/otto/repos/machine-learning-operations-02476-project/data/processed/transactiq_processed...
Splitting dataset: Train=3600834, Val=450104, Test=450105


In [14]:
# Draw one training batch from the tabular module, then print its keys,
# the shape of every tensor, and a peek at the first sample.
print("--- Tabular Data Module ---")
tabular_loader = tabular_dm.train_dataloader()
tabular_batch = next(iter(tabular_loader))

print("Keys:", tabular_batch.keys())
for heading, key in (("Features Shape:", "features"), ("Labels Shape:", "labels")):
    print(heading, tabular_batch[key].shape)

# Tensors are passed to print() as separate arguments (not via f-strings) so
# that zero-dimensional tensors keep their `tensor(...)` representation.
first_features = tabular_batch["features"][0]
print("\nSample Feature Vector (First 5 elements):", first_features[:5])
print("Sample Label:", tabular_batch["labels"][0])

--- Tabular Data Module ---
Keys: dict_keys(['features', 'labels'])
Features Shape: torch.Size([4, 32])
Labels Shape: torch.Size([4])

Sample Feature Vector (First 5 elements): tensor([5.9916, 0.0000, 2.0000, 0.0000, 0.0000])
Sample Label: tensor(7)


In [15]:
# Draw one training batch from the text module, then print its keys,
# the shape of every tensor, and a peek at the first sample.
print("--- Text Data Module ---")
text_loader = text_dm.train_dataloader()
text_batch = next(iter(text_loader))

print("Keys:", text_batch.keys())
shape_rows = (
    ("Input IDs Shape:", "input_ids"),
    ("Attention Mask Shape:", "attention_mask"),
    ("Labels Shape:", "labels"),
)
for heading, key in shape_rows:
    print(heading, text_batch[key].shape)

# Tensors are passed to print() as separate arguments (not via f-strings) so
# that zero-dimensional tensors keep their `tensor(...)` representation.
first_input_ids = text_batch["input_ids"][0]
print("\nSample Input IDs (First 10):", first_input_ids[:10])
print("Sample Label:", text_batch["labels"][0])

--- Text Data Module ---
Keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs Shape: torch.Size([4, 64])
Attention Mask Shape: torch.Size([4, 64])
Labels Shape: torch.Size([4])

Sample Input IDs (First 10): tensor([  101,  4613,  7242,  3589, 19067,  2078, 12740, 17788, 17134,   102])
Sample Label: tensor(5)


In [16]:
from transformers import AutoTokenizer

# The tokenizer is needed only to map token ids back to readable text.
# NOTE(review): the checkpoint name presumably has to match the one used when
# the text dataset was tokenized — confirm against the preprocessing code.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Position inside each batch to inspect. The tabular and text batches were
# drawn from two separate loaders, so the same position does not have to
# refer to the same underlying transaction.
idx = 0

print("=== Single Sample Inspection ===")
print(f"Sample Index: {idx}")

print("\n--- Tabular Sample ---")
features, label = tabular_batch["features"][idx], tabular_batch["labels"][idx]
print(f"Raw Features: {features.tolist()}")
print(f"Label Index: {label.item()}")

print("\n--- Text Sample ---")
input_ids, attention_mask, text_label = (
    text_batch[key][idx] for key in ("input_ids", "attention_mask", "labels")
)

# skip_special_tokens=True is passed so the decoded string shows only the text.
decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)
print(f"Decoded Text: '{decoded_text}'")
print(f"Input IDs (first 10): {input_ids[:10].tolist()}")
print(f"Label Index: {text_label.item()}")

=== Single Sample Inspection ===
Sample Index: 0

--- Tabular Sample ---
Raw Features: [5.991589546203613, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Label Index: 7

--- Text Sample ---
Decoded Text: 'speech therapy branch txn402513'
Input IDs (first 10): [101, 4613, 7242, 3589, 19067, 2078, 12740, 17788, 17134, 102]
Label Index: 5


In [17]:
# We cannot guarantee the indices align perfectly because both DataModules perform 
# their own random splitting and shuffling.

# However, we can inspect the raw datasets directly (before splitting/shuffling) to find the same sample.
from ml_ops_project.data import MyDataset
from ml_ops_project.data_transformer import TextDataset

# Load the processed datasets directly, bypassing the DataModules' own
# splitting and shuffling.
raw_tabular_dataset = MyDataset(tabular_dm.data_path)
raw_tabular_dataset.load()

raw_text_dataset = TextDataset(text_dm.processed_path)
raw_text_dataset.load()

# Fixed (not random) index so the comparison is reproducible.
# NOTE(review): assumes both datasets keep rows in the same order and contain
# at least sample_idx + 1 items — confirm.
sample_idx = 42

print(f"=== Direct Dataset Comparison (Index {sample_idx}) ===")

# Tabular
tab_sample = raw_tabular_dataset[sample_idx]
print("\n--- Tabular ---")
print("Features:", tab_sample["features"][:10])  # First 10 features
print("Label (Int):", tab_sample["labels"])

# Text
text_sample = raw_text_dataset[sample_idx]
print("\n--- Text ---")
print("Decoded:", tokenizer.decode(text_sample["input_ids"], skip_special_tokens=True))
print("Label (Int):", text_sample["labels"])

# State the agreement explicitly instead of leaving it to visual inspection.
# Equal labels support, but do not prove, that both rows are the same sample.
labels_match = int(tab_sample["labels"]) == int(text_sample["labels"])
print("\nLabels match:", labels_match)


=== Direct Dataset Comparison (Index 42) ===

--- Tabular ---
Features: tensor([10.9943,  0.0000,  4.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
         0.0000,  0.0000])
Label (Int): tensor(6)

--- Text ---
Decoded: refund - usa
Label (Int): tensor(6)


In [19]:
# Display the complete text sample picked in the previous cell (its
# input_ids, attention_mask and labels entries) as this cell's output.
text_sample

{'input_ids': tensor([  101, 25416,  8630,  1011,  3915,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(6)}