In [1]:
from google.colab import drive
drive.mount('/content/drive')

BASE = "/content/drive/MyDrive/mlembeddedshared"

!pip install -q torch torchvision transformers lightgbm xgboost tqdm joblib pandas numpy scikit-learn

import os, gc
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import joblib
import xgboost as xgb
import lightgbm as lgb
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Mounted at /content/drive
Using device: cuda


In [2]:
# Load embeddings & target
X_train_full = np.load(os.path.join(BASE, "X_train_reduced.npy"))
y_train_full = np.load(os.path.join(BASE, "y_train.npy"))
X_test_full  = np.load(os.path.join(BASE, "X_test_reduced.npy"))
sample_ids_test = np.load(os.path.join(BASE, "sample_ids.npy"))

# Log-transform target
y_train_log = np.log1p(y_train_full)

# Normalize embeddings
scaler = StandardScaler()
X_train_full = scaler.fit_transform(X_train_full)
X_test_full  = scaler.transform(X_test_full)

# Train/validation split
X_tr, X_val, y_tr, y_val = train_test_split(X_train_full, y_train_log, test_size=0.1, random_state=42)
print("Train/Val shapes:", X_tr.shape, X_val.shape)


Train/Val shapes: (67500, 898) (7500, 898)


In [3]:
class PriceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = PriceDataset(X_tr, y_tr)
val_ds   = PriceDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=256, shuffle=False)


In [4]:
class MultiModalTransformer(nn.Module):
    def __init__(self, input_dim, embed_dim=512, n_heads=4, n_layers=2, ff_dim=1024, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=n_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            activation='relu'
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)  # add sequence dimension
        x = self.transformer(x)  # shape: (batch, seq=1, embed_dim)
        x = x.squeeze(1)         # remove seq dim
        return self.fc(x)

input_dim = X_train_full.shape[1]
model = MultiModalTransformer(input_dim).to(device)
print(model)




MultiModalTransformer(
  (embedding): Linear(in_features=898, out_features=512, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=256, out_features=1, bias=True)
 

In [5]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

epochs = 20
best_val_loss = float('inf')

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device).unsqueeze(1)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * X_batch.size(0)
    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device).unsqueeze(1)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            val_loss += loss.item() * X_batch.size(0)
    val_loss /= len(val_loader.dataset)
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.5f} | Val Loss: {val_loss:.5f}")

torch.save(model.state_dict(), os.path.join(BASE, "transformer_price_model.pth"))
print("âœ… Transformer model saved.")


Epoch 1/20 | Train Loss: 0.82085 | Val Loss: 0.59679
Epoch 2/20 | Train Loss: 0.55219 | Val Loss: 0.56940
Epoch 3/20 | Train Loss: 0.49940 | Val Loss: 0.56846
Epoch 4/20 | Train Loss: 0.45094 | Val Loss: 0.53056
Epoch 5/20 | Train Loss: 0.40278 | Val Loss: 0.52823
Epoch 6/20 | Train Loss: 0.35728 | Val Loss: 0.53070
Epoch 7/20 | Train Loss: 0.31549 | Val Loss: 0.54716
Epoch 8/20 | Train Loss: 0.28023 | Val Loss: 0.55042
Epoch 9/20 | Train Loss: 0.24549 | Val Loss: 0.54270
Epoch 10/20 | Train Loss: 0.19395 | Val Loss: 0.54508
Epoch 11/20 | Train Loss: 0.17579 | Val Loss: 0.54703
Epoch 12/20 | Train Loss: 0.16286 | Val Loss: 0.55662
Epoch 13/20 | Train Loss: 0.15184 | Val Loss: 0.56248
Epoch 14/20 | Train Loss: 0.13298 | Val Loss: 0.56131
Epoch 15/20 | Train Loss: 0.12650 | Val Loss: 0.56550
Epoch 16/20 | Train Loss: 0.12154 | Val Loss: 0.56299
Epoch 17/20 | Train Loss: 0.11734 | Val Loss: 0.56630
Epoch 18/20 | Train Loss: 0.10952 | Val Loss: 0.56682
Epoch 19/20 | Train Loss: 0.10725 | V

In [6]:
model.eval()
X_test_tensor = torch.tensor(X_test_full, dtype=torch.float32).to(device)
with torch.no_grad():
    y_pred_log = model(X_test_tensor).squeeze().cpu().numpy()
    y_pred = np.expm1(y_pred_log)  # back-transform

# Clip unrealistic prices
y_pred = np.clip(y_pred, 1, 5000)

submission = pd.DataFrame({"sample_id": sample_ids_test, "price": y_pred})
submission_file = os.path.join(BASE, "submission_transformer.csv")
submission.to_csv(submission_file, index=False)
print(f"ðŸŽ‰ Submission saved: {submission_file}")
submission.head()


ðŸŽ‰ Submission saved: /content/drive/MyDrive/mlembeddedshared/submission_transformer.csv


Unnamed: 0,sample_id,price
0,100179,8.116191
1,245611,22.695425
2,146263,19.028183
3,95658,15.121753
4,36806,35.398964


In [7]:
# Load previous LGB/XGB models
lgb_model = joblib.load(os.path.join(BASE, "lgb_model_full_lowlevel.joblib"))
xgb_model = xgb.Booster()
xgb_model.load_model(os.path.join(BASE, "xgb_model_full_lowlevel.json"))

dtest_xgb = xgb.DMatrix(X_test_full)
y_pred_lgb = lgb_model.predict(X_test_full)
y_pred_xgb = xgb_model.predict(dtest_xgb)

# Weighted ensemble: Transformer + LGB + XGB
y_pred_ensemble = 0.5*y_pred + 0.25*y_pred_lgb + 0.25*y_pred_xgb

submission = pd.DataFrame({"sample_id": sample_ids_test, "price": y_pred_ensemble})
submission_file = os.path.join(BASE, "submission_transformer_ensemble.csv")
submission.to_csv(submission_file, index=False)
print(f"ðŸŽ‰ Ensemble submission saved: {submission_file}")
submission.head()


ðŸŽ‰ Ensemble submission saved: /content/drive/MyDrive/mlembeddedshared/submission_transformer_ensemble.csv


Unnamed: 0,sample_id,price
0,100179,189.069535
1,245611,161.259154
2,146263,200.229997
3,95658,191.316404
4,36806,202.657497


In [11]:
# Make predictions on validation set
model.eval()
val_loader_smape = DataLoader(val_ds, batch_size=256, shuffle=False)

y_val_pred_log = []
with torch.no_grad():
    for X_batch, _ in val_loader_smape:
        X_batch = X_batch.to(device)
        y_pred_batch = model(X_batch).squeeze().cpu().numpy()
        y_val_pred_log.append(y_pred_batch)

y_val_pred_log = np.concatenate(y_val_pred_log)
y_val_pred = np.expm1(y_val_pred_log)  # back-transform

# Back-transform y_val for SMAPE
y_val_true = np.expm1(y_val)  # y_val was log1p transformed

def smape(y_true, y_pred, eps=1e-9):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0     # denominator
    denom = np.where(denom == 0, eps, denom)           # avoid divide by zero
    smape_vals = np.abs(y_pred - y_true) / denom       # numerator / denominator
    return 100.0 * np.mean(smape_vals)                 # mean across all samples


# Compute SMAPE
val_smape = smape(y_val_true, y_val_pred)
print(f"âœ… Validation SMAPE: {val_smape:.3f}%")

âœ… Validation SMAPE: 55.918%
