In [128]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [129]:
img_embeds_train = np.load("/content/drive/MyDrive/Amazon ML challenge/embeddings/clip_image_train_embeds.npy")
txt_embeds_train = np.load("/content/drive/MyDrive/Amazon ML challenge/embeddings/text_train_embeddings.npy")

print(img_embeds_train.shape)
print(txt_embeds_train.shape)

img_embeds_train = torch.from_numpy(img_embeds_train)
txt_embeds_train = torch.from_numpy(txt_embeds_train)

print(img_embeds_train.shape)
print(txt_embeds_train.shape)
print(img_embeds_train.dtype)
print(txt_embeds_train.dtype)
print(img_embeds_train.ndim)
print(txt_embeds_train.ndim)

(74999, 512)
(75000, 128)
torch.Size([74999, 512])
torch.Size([75000, 128])
torch.float32
torch.float32
2
2


In [130]:
df = pd.read_csv("/content/drive/MyDrive/Amazon ML challenge/amazon ml challenge/student_resource/dataset/train.csv")
df.drop(df[df["image_link"] == "https://m.media-amazon.com/images/I/51mjZYDYjyL.jpg"].index, axis=0, inplace=True)
df.shape

(74999, 5)

In [131]:
i = 38945 # training missing index
txt_embeds_train = torch.cat((txt_embeds_train[:i], txt_embeds_train[i+1:]), dim=0)

In [132]:
print(img_embeds_train.shape)
print(txt_embeds_train.shape)
print(img_embeds_train.dtype)
print(txt_embeds_train.dtype)
print(img_embeds_train.ndim)
print(txt_embeds_train.ndim)

torch.Size([74999, 512])
torch.Size([74999, 128])
torch.float32
torch.float32
2
2


In [133]:
combined_embs_train = torch.cat((img_embeds_train, txt_embeds_train), dim=1)
combined_embs_train.shape

torch.Size([74999, 640])

In [134]:
targets = df.loc[:, "price"].to_list()
targets = torch.tensor(targets)

idx = 55

print(targets[idx])
log_targets = torch.log1p(targets)
print(log_targets[idx])

# original_price_pred = torch.expm1(model_output)

targets.shape

tensor(20.7900)
tensor(3.0815)


torch.Size([74999])

In [135]:
torch.manual_seed(143)

X_train, X_test, y_train, y_test = train_test_split(
    combined_embs_train,
    log_targets,
    test_size=0.2,
    random_state=143
)

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(len(train_loader))
print(len(test_loader))

1875
469


In [136]:
1875+469

2344

In [139]:
class regression_model(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(640, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.net(x)

model = regression_model()

class SMAPELoss(nn.Module):
    def __init__(self, epsilon=1e-2):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, y_pred, y_true):
        denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
        mape = torch.abs(y_pred - y_true) / denominator
        return (torch.mean(mape) * 100)

criterion = SMAPELoss()
# criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay = 0.0001)

In [140]:
torch.manual_seed(143)

print("TRAINING ON TRAIN DATASET")
model.train()

epochs = 10
for epoch in tqdm(range(epochs)):
    running_loss = 0
    for X, y in tqdm(train_loader):
        preds = model(X).squeeze()

        preds = torch.expm1(preds)
        y = torch.expm1(y)

        loss = criterion(preds, y)
        running_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    running_loss /= len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}: SMAPE score = {running_loss:.4f}%")


print("EVALUATING ON TEST DATASET")
model.eval()

running_loss = 0
with torch.inference_mode():
  for X,y in tqdm(test_loader):
    test_preds = model(X)
    test_preds = torch.expm1(test_preds)
    y = torch.expm1(y)
    running_loss += criterion(test_preds, y).item()

running_loss /= len(test_loader)
print(f"Testing SMAPE score = {running_loss:.4f}%")

TRAINING ON TRAIN DATASET


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 1/10: SMAPE score = 62.1433%


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 2/10: SMAPE score = 55.5585%


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 3/10: SMAPE score = 54.2055%


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 4/10: SMAPE score = 53.1380%


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 5/10: SMAPE score = 52.8307%


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 6/10: SMAPE score = 51.9609%


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 7/10: SMAPE score = 51.7739%


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 8/10: SMAPE score = 51.4741%


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 9/10: SMAPE score = 51.2049%


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 10/10: SMAPE score = 51.0696%
EVALUATING ON TEST DATASET


  0%|          | 0/469 [00:00<?, ?it/s]

Testing SMAPE score = 81.4128%


In [None]:
torch.save(model.state_dict(), "model_weights.pth")
print("✅ Model saved as model_weights.pth")

✅ Model saved as model_weights.pth


In [None]:
model = regression_model()             # initialize same model class
model.load_state_dict(torch.load("model_weights.pth"))
model.eval()  # put it in evaluation mode

regression_model(
  (net): Sequential(
    (0): Linear(in_features=640, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=512, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [None]:
y_actual = torch.tensor([120, 100, 50])
y_preds = torch.tensor([120, 99, 51])

criterion(y_actual,y_preds)

tensor(0.9951)

In [None]:
y_test
y_test_orig = torch.expm1(y_test)
y_test_orig

tensor([ 4.9700, 20.0900, 22.4100,  ..., 27.9500, 11.4900,  8.7250])

In [None]:
model_preds = model(X_test)
model_preds = torch.expm1(model_preds)
model_preds

tensor([[ 7.8762],
        [21.0826],
        [21.4784],
        ...,
        [34.8345],
        [15.8638],
        [12.3700]], grad_fn=<Expm1Backward0>)

In [None]:
criterion(y_test_orig, model_preds.squeeze())
# print(y_train.shape)
# print(model_preds.shape)
# 49.2366

tensor(49.3481, grad_fn=<MulBackward0>)

In [None]:
df_test = pd.read_csv("/content/drive/MyDrive/Amazon ML challenge/amazon ml challenge/student_resource/dataset/test.csv")
df_test.head(10)

Unnamed: 0,sample_id,catalog_content,image_link
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...,https://m.media-amazon.com/images/I/51Ex6uOH7y...
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...
5,148239,"Item Name: Snyder's of Hanover Mini Pretzel, 1...",https://m.media-amazon.com/images/I/81ONUU3OAf...
6,92659,Item Name: Oregon Plum Purple 15 oz (Pack of 3...,https://m.media-amazon.com/images/I/519Plng8cp...
7,3780,"Item Name: Barkman Honey 059640 Honey, Clover,...",https://m.media-amazon.com/images/I/6134TQBo+v...
8,196940,Item Name: Against The Grain Gluten Free Origi...,https://m.media-amazon.com/images/I/41bK3vFt5R...
9,20472,"Item Name: Nature Valley Granola Bars, Sweet a...",https://m.media-amazon.com/images/I/81NWfSFYXB...


In [None]:
# index to ignore 42045

img_embeds_test = np.load("/content/drive/MyDrive/Amazon ML challenge/embeddings/clip_image_test_embeds.npy")
txt_embeds_test = np.load("/content/drive/MyDrive/Amazon ML challenge/embeddings/text_test_embeddings.npy")

print(img_embeds_test.shape)
print(txt_embeds_test.shape)

img_embeds_test = torch.from_numpy(img_embeds_test)
txt_embeds_test = torch.from_numpy(txt_embeds_test)

print(img_embeds_test.shape)
print(txt_embeds_test.shape)
print(img_embeds_test.dtype)
print(txt_embeds_test.dtype)
print(img_embeds_test.ndim)
print(txt_embeds_test.ndim)

(74999, 512)
(75000, 128)
torch.Size([74999, 512])
torch.Size([75000, 128])
torch.float32
torch.float32
2
2


In [None]:
missing_idx = 42045

def fill_with_interpolation(emb, idx):
    if idx == 0:
        interp_row = emb[0:1, :]
    elif idx == emb.shape[0]:
        interp_row = emb[-1:, :]
    else:
        interp_row = (emb[idx-1:idx, :] + emb[idx:idx+1, :]) / 2
    emb_filled = np.insert(emb, idx, interp_row, axis=0)
    return emb_filled

img_embeds_test = fill_with_interpolation(img_embeds_test, missing_idx)

print(img_embeds_test.shape)
print(txt_embeds_test.shape)
print(img_embeds_test.dtype)
print(txt_embeds_test.dtype)
print(img_embeds_test.ndim)
print(txt_embeds_test.ndim)

# result = np.concatenate([emb1_zeros, emb2], axis=1)  # shape (75000, 640)

torch.Size([75000, 512])
torch.Size([75000, 128])
torch.float32
torch.float32
2
2


In [None]:
combined_embs_test = torch.cat((img_embeds_test, txt_embeds_test), dim=1)
# combined_embs_test.dtype
combined_embs_test.shape

torch.Size([75000, 640])

In [None]:
torch.manual_seed(143)

test_dataset = TensorDataset(combined_embs_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle = False)

print(len(test_loader))
# test_dataset

2344


In [None]:
model.eval()

prices = []

with torch.inference_mode():
  for X in tqdm(test_loader):
    test_preds = model(X[0])
    prices.extend(torch.expm1(test_preds).tolist())

prices[0:100]

  0%|          | 0/2344 [00:00<?, ?it/s]

[[10.900726318359375],
 [22.45857810974121],
 [28.516761779785156],
 [10.615925788879395],
 [29.245662689208984],
 [7.02227258682251],
 [11.767744064331055],
 [8.008166313171387],
 [14.693929672241211],
 [6.980665683746338],
 [20.861024856567383],
 [6.3170671463012695],
 [23.38658332824707],
 [12.04135513305664],
 [21.425992965698242],
 [14.088099479675293],
 [19.214027404785156],
 [16.591718673706055],
 [11.220873832702637],
 [6.633373260498047],
 [15.31226921081543],
 [6.685408592224121],
 [6.852643013000488],
 [13.83100414276123],
 [9.651530265808105],
 [29.21933937072754],
 [8.394816398620605],
 [12.496479988098145],
 [6.7154998779296875],
 [34.45280838012695],
 [29.17613410949707],
 [22.423748016357422],
 [11.218134880065918],
 [7.890181541442871],
 [44.57797622680664],
 [11.249650955200195],
 [4.697519302368164],
 [6.1924567222595215],
 [15.095165252685547],
 [15.018328666687012],
 [14.747864723205566],
 [5.69584846496582],
 [6.287599563598633],
 [161.3787384033203],
 [9.76602649

In [None]:
prices_final = [p[0] for p in prices]
prices_final[0:20]

[10.900726318359375,
 22.45857810974121,
 28.516761779785156,
 10.615925788879395,
 29.245662689208984,
 7.02227258682251,
 11.767744064331055,
 8.008166313171387,
 14.693929672241211,
 6.980665683746338,
 20.861024856567383,
 6.3170671463012695,
 23.38658332824707,
 12.04135513305664,
 21.425992965698242,
 14.088099479675293,
 19.214027404785156,
 16.591718673706055,
 11.220873832702637,
 6.633373260498047]

In [None]:
sample_ids = df_test["sample_id"].to_list()
sample_ids[:10]

[100179, 245611, 146263, 95658, 36806, 148239, 92659, 3780, 196940, 20472]

In [None]:
df_result = pd.DataFrame(sample_ids, columns=["sample_id"])
df_result["price"] = prices_final
df_result.to_csv("Result.csv", index=False)

In [None]:
df_result

Unnamed: 0,sample_id,price
0,100179,10.900726
1,245611,22.458578
2,146263,28.516762
3,95658,10.615926
4,36806,29.245663
...,...,...
74995,93616,6.376295
74996,249434,18.465937
74997,162217,7.318513
74998,230487,22.213520
