In [2]:
import hackathon
from hackathon import HackathonDataset


In [3]:
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from collate import collate_fn

    # Validation split, stored under ./data (downloaded when missing).
    dataset = HackathonDataset(root="data", split="val", seed=42, download=True)

    # Smoke-test the submission writer: every id 0..18298 maps to its own
    # empty prediction list.
    test_predictions = dict((idx, []) for idx in range(18299))
    dataset.create_submission(test_predictions)

    dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=32)

    # Print only the first collated batch, then stop iterating.
    for batch in dataloader:
        print(batch)
        break


Submission saved to submissions/submission_20251025_152857.csv
{'X': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'Y': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'context': tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,

In [4]:
# Fetch the first sample once and reuse it for every inspection below,
# instead of indexing the dataset a second time.
sample = dataset[0]

# One line per field: its key and the Python type of its value.
for key, value in sample.items():
    print(f"{key}: {type(value)}")

# Shapes of the raw code tensors for this sample.
print(sample["X_codes"].shape)
print(sample["Y_codes"].shape)

id: <class 'int'>
X: <class 'torch.Tensor'>
Y: <class 'torch.Tensor'>
project_id: <class 'int'>
room_cluster: <class 'str'>
room_cluster_one_hot: <class 'torch.Tensor'>
calculus: <class 'list'>
X_codes: <class 'torch.Tensor'>
Y_codes: <class 'torch.Tensor'>
insurance_company: <class 'str'>
insurance_company_one_hot: <class 'torch.Tensor'>
recover_office_zip_code: <class 'int'>
damage_address_zip_code: <class 'int'>
office_distance: <class 'float'>
case_creation_year: <class 'int'>
case_creation_month: <class 'int'>
torch.Size([3])
torch.Size([0])


In [5]:
# Raw code tensors first, then the shapes of the encoded X / Y vectors.
print(
    sample["X_codes"],
    sample["Y_codes"],
    sample["X"].shape,
    sample["Y"].shape,
    sep="\n",
)

tensor([259, 256, 108])
tensor([])
torch.Size([388])
torch.Size([388])


In [6]:
# Pull one collated batch and list the type stored under each of its keys.
smth = next(iter(dataloader))
for key in smth:
    print(f"{key}: {type(smth[key])}")
print("Hello, Hackathon Dataset!")


X: <class 'torch.Tensor'>
Y: <class 'torch.Tensor'>
context: <class 'torch.Tensor'>
context_mask: <class 'torch.Tensor'>
Hello, Hackathon Dataset!


In [7]:
# Shapes of the collated tensors: context first, then inputs and targets.
for name in ("context", "X", "Y"):
    print(smth[name].shape)

torch.Size([32, 6, 399])
torch.Size([32, 399])
torch.Size([32, 388])


In [51]:
import torch
import torch.nn as nn
import torch.optim as optim
# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# === 1. Minimal Multi-Label Model ===
class SimpleMultiLabelModel(nn.Module):
    """Small fully connected network that emits one logit per label.

    Three ``Linear`` + ``Sigmoid`` hidden stages are followed by a final
    ``Linear`` layer. No activation is applied to the output, so the result
    is raw logits; apply ``torch.sigmoid`` to obtain probabilities.

    Args:
        input_dim: Size of the last dimension of ``batch["X"]``.
        output_dim: Number of logits produced per sample.
        hidden_dim: Width of each hidden layer.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=256):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, batch):
        """Compute logits for the features stored under ``batch["X"]``.

        Args:
            batch: Mapping holding an ``"X"`` tensor whose last dimension is
                ``input_dim``. Any dtype that can be cast to the layer
                weights' dtype is accepted.

        Returns:
            Tensor of logits whose last dimension is ``output_dim``.
        """
        features = batch["X"]
        # nn.Linear rejects inputs whose dtype differs from its weights
        # (e.g. integer feature tensors), so align the dtype here instead of
        # forcing every caller to remember a manual .float().
        weight_dtype = self.fc[0].weight.dtype
        if features.dtype != weight_dtype:
            features = features.to(weight_dtype)
        return self.fc(features)  # [batch_size, output_dim]

# === 2. Basic Training Loop ===
def train_model(model, dataloader, num_epochs=5, learning_rate=1e-3, device=torch.device('cpu')):
    """Train ``model`` with Adam and ``BCEWithLogitsLoss``.

    Args:
        model: Module called as ``model({'X': x})`` that returns logits with
            the same shape as the targets.
        dataloader: Iterable of batches; each batch is a mapping with ``"X"``
            (features) and ``"Y"`` (multi-hot targets) tensors. Both are cast
            to float and moved to ``device`` before use.
        num_epochs: Number of passes over ``dataloader``.
        learning_rate: Adam learning rate.
        device: Device on which the model and the batches are placed.

    Returns:
        list[float]: Sample-weighted mean loss of each epoch, in order.
        The model is trained in place and left in training mode.
    """
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    epoch_losses = []
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        samples_seen = 0
        for batch in dataloader:
            x = batch["X"].to(device).float()
            y = batch["Y"].to(device).float()

            optimizer.zero_grad()
            outputs = model({'X': x})
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            # criterion returns a per-batch mean; weight it by the batch size
            # so a smaller final batch does not skew the epoch average.
            batch_len = x.size(0)
            running_loss += loss.item() * batch_len
            samples_seen += batch_len

        # Normalise by the samples actually iterated rather than by
        # len(dataloader.dataset): the two differ with drop_last=True or a
        # sub-sampling sampler, and a plain iterable has no .dataset at all.
        avg_loss = running_loss / max(samples_seen, 1)
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")

    return epoch_losses



In [None]:
# === Example usage ===
# Load the training split here; the DataLoader that wraps it is built in the next cell.
from dataset.hackathon import HackathonDataset
from dataset.collate import collate_fn
from torch.utils.data import DataLoader

# Training split (downloaded when missing); change the split name to load another one.
dataset = HackathonDataset(download=True, split="train")


In [52]:
# Shuffled loader over `dataset`, collated into batches of `batch_size` samples.
batch_size = 1024
dataloader = DataLoader(
    dataset,
    shuffle=True,
    collate_fn=collate_fn,
    batch_size=batch_size,
)

In [53]:
# Size the network from a collated batch rather than from a raw sample:
# the collate function yields an "X" that is wider than dataset[0]["X"]
# (399 vs 388 in the shapes printed earlier in this notebook), so the raw
# sample width must not be used as the model's input dimension.
first_batch = next(iter(dataloader))
input_dim = first_batch["X"].shape[-1]
output_dim = first_batch["Y"].shape[-1]

# Initialize and train the model.
model = SimpleMultiLabelModel(input_dim, output_dim)
train_model(model, dataloader, num_epochs=1, device=device)


Epoch 1/1 - Loss: 0.0537


In [54]:
test_dataset = HackathonDataset(split="test", download=True)
# One batch spanning the whole split, sized from the dataset itself instead of
# a hard-coded sample count. shuffle=False keeps the samples in dataset order.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=len(test_dataset),
    collate_fn=collate_fn,
    shuffle=False,
)


In [55]:
# Sanity check: report how many rows each test batch holds.
for batch in test_dataloader:
    print(batch["X"].shape[0])

18299


In [56]:
# Inference: switch to eval mode and disable autograd bookkeeping.
model.eval()
predictions = {}
threshold = 0.25  # probability cut-off for emitting a label (can be tuned)

# The collated batches printed earlier in this notebook expose no sample ids,
# so a sample's id is its position in the un-shuffled test loader. `offset`
# carries that position across batches; numbering from 0 inside every batch
# would overwrite earlier predictions whenever the loader yields more than
# one batch.
offset = 0
with torch.no_grad():
    for batch in test_dataloader:
        x = batch["X"].to(device).float()
        logits = model({'X': x})
        probs = torch.sigmoid(logits)
        # Boolean mask [batch_size, num_labels]: True where a label is predicted.
        pred_mask = (probs > threshold).cpu()
        for row, mask in enumerate(pred_mask):
            # Indices of the predicted labels for this sample, as plain ints.
            predictions[offset + row] = torch.nonzero(mask).flatten().tolist()
        offset += len(pred_mask)

# Save using the dataset's built-in create_submission method
test_dataset.create_submission(predictions)

Submission saved to submissions/submission_20251025_163749.csv


In [57]:
# Sanity check: number of sample ids that received a prediction.
len(predictions)

18299

In [58]:
# Show which torch device was selected for this run.
print(device)

cuda
