In [1]:
!pip install gensim

  pid, fd = os.forkpty()


Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully uninstalled scipy-1.14.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cuml 24.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cesium 0.12.3 requires numpy<3.0,>=2.0, but you have numpy

In [2]:
import scipy
import gensim

In [3]:
# Install required packages if not already installed
!pip install transformers 



In [4]:
# Product category to work on; it selects the train/test CSV files and is embedded
# in the names of the saved checkpoint and of the predictions file.
category = "Men_Tshirts"

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import ViTModel, ViTFeatureExtractor
from torchvision import transforms
from PIL import Image
import gensim
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

# 1. Data Preparation
# Load the training labels of the selected category from the Kaggle input dataset.
# Column 0 is later parsed as the integer image id and columns 3-7 are used as the
# attribute labels (see CustomDataset).
data = pd.read_csv(f'/kaggle/input/m1dataset/{category}.csv')
# (disabled) data.drop("attr_10", axis=1, inplace=True)

# No image-path column is needed: the dataset classes build each image path from
# the id in column 0, zero-padded to six digits, e.g.
# os.path.join(img_dir, "000123.jpg")

In [6]:
import os

In [7]:
# 2. Word2Vec Embeddings
from gensim.models import KeyedVectors

# Load pre-trained word2vec embeddings (Google News vectors) stored in binary format.
# Original download page: https://code.google.com/archive/p/word2vec/
# The uncompressed .bin file is read from the Kaggle input dataset path below.
word2vec_model = KeyedVectors.load_word2vec_format('/kaggle/input/m1dataset/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True)

# Function to get embeddings for attribute values
def get_embedding(word):
    """Return the word2vec vector for an attribute value.

    The value is stringified, lower-cased and its spaces are replaced by
    underscores before it is looked up in ``word2vec_model``.

    Args:
        word: Attribute value of any scalar type (converted with ``str``).
            ``None`` / NaN are treated as "no label".

    Returns:
        The stored vector when the normalized token is in the vocabulary,
        otherwise an all-zero float32 vector of length
        ``word2vec_model.vector_size``.
    """
    # A missing label must not be stringified: str(nan) is "nan", which would be
    # looked up as if it were a real word.
    if word is None or (not isinstance(word, str) and pd.isna(word)):
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)

    token = str(word).lower().replace(' ', '_')
    if token in word2vec_model:
        return word2vec_model[token]

    # Out-of-vocabulary value: fall back to zeros. float32 is gensim's default
    # dtype for loaded vectors, so concatenating/stacking results does not
    # silently upcast to float64.
    return np.zeros(word2vec_model.vector_size, dtype=np.float32)

In [8]:
# Show the columns from position 3 onward (the slice CustomDataset draws its attributes from).
data.columns[3:]

Index(['color', 'neck', 'pattern', 'print_or_pattern_type', 'sleeve_length',
       'attr_6', 'attr_7', 'attr_8', 'attr_9', 'attr_10'],
      dtype='object')

In [13]:
# 3. Dataset Creation
class CustomDataset(Dataset):
    """Dataset of (image, concatenated attribute embeddings) pairs.

    Each item is ``(image, target)``: ``image`` is the preprocessed RGB image and
    ``target`` is a float32 tensor of length ``num_attributes * embedding_dim``
    made by concatenating the word2vec embedding of every attribute value.

    Args:
        dataframe: Rows to serve. Column 0 holds the integer image id and
            columns 3-7 hold the attribute labels.
        img_dir: Directory with the images, named ``<id zero-padded to 6>.jpg``.
        transform: Optional callable applied to the PIL image. When omitted,
            the pretrained ViT feature extractor preprocesses the image.
    """

    def __init__(self, dataframe, img_dir, transform=None):
        self.data = dataframe.reset_index(drop=True)
        self.transform = transform
        self.img_dir = img_dir
        self.dataframe = dataframe
        # Only fetch the pretrained preprocessor when it will actually be used.
        self.feature_extractor = (
            ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
            if transform is None
            else None
        )
        # Take the attribute columns from the frame this dataset was given,
        # not from a notebook-level global.
        self.attributes = self.data.columns[3:8]
        self.embedding_dim = word2vec_model.vector_size
        self.num_attributes = len(self.attributes)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Image file name is the id from column 0, zero-padded to six digits.
        image_id = int(self.data.iloc[idx, 0])
        img_path = os.path.join(self.img_dir, f"{image_id:06d}.jpg")
        # The context manager closes the file handle once the pixels are converted.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)
        else:
            # Use ViT's feature extractor transformations
            image = self.feature_extractor(images=image, return_tensors='pt')['pixel_values'].squeeze(0)

        # One embedding per attribute, concatenated in column order.
        embeddings = [get_embedding(self.data.loc[idx, attr]) for attr in self.attributes]
        target = torch.tensor(np.concatenate(embeddings), dtype=torch.float32)
        return image, target

# Define transformations (if needed)
# ViT models expect images of size 224x224
# NOTE(review): this pipeline only resizes and converts to a tensor. Because a
# transform is supplied, the datasets bypass the ViT feature extractor, so no
# mean/std normalization is applied — confirm this is intended.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Split data into training and validation sets
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(data, test_size=0.1, random_state=42)
# train_df = data

# Create datasets and dataloaders
# Both splits read their images from the same directory.
train_dataset = CustomDataset(train_df, "/kaggle/input/m1dataset/train_images", transform=transform)
val_dataset = CustomDataset(val_df, "/kaggle/input/m1dataset/train_images", transform=transform)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [14]:
# 4. Model Definition
class ViTForAttributePrediction(nn.Module):
    """Pretrained ViT backbone followed by a linear head that regresses one
    embedding vector per attribute.

    Args:
        embedding_dim: Length of each predicted attribute embedding.
        num_attributes: Number of attributes predicted per image.
    """

    def __init__(self, embedding_dim, num_attributes):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_attributes = num_attributes
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        # A single linear layer emits all attribute embeddings side by side.
        head_width = embedding_dim * num_attributes
        self.fc = nn.Linear(self.vit.config.hidden_size, head_width)

    def forward(self, x):
        """Map a batch of pixel tensors to [batch_size, num_attributes, embedding_dim]."""
        pooled = self.vit(pixel_values=x).pooler_output  # [batch_size, hidden_size]
        flat = self.fc(pooled)                           # [batch_size, embedding_dim * num_attributes]
        return flat.view(-1, self.num_attributes, self.embedding_dim)

# Instantiate the model
# The head is sized from the word2vec vector length and the number of attribute
# columns exposed by the training dataset.
embedding_dim = word2vec_model.vector_size
num_attributes = len(train_dataset.attributes)
model = ViTForAttributePrediction(embedding_dim, num_attributes)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


ViTForAttributePrediction(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [15]:
# 5. Loss Function and Optimizer
# Mean-squared error between predicted and target embeddings; Adam updates every
# parameter of the model (ViT backbone and linear head) with a learning rate of 3e-5.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

In [None]:
# Early Stopping Parameters
early_stopping_patience = 3  # Number of epochs to wait before stopping
best_val_loss = float('inf')  # Lowest validation loss seen so far
epochs_no_improve = 0  # Consecutive epochs without a new best validation loss

# 6. Training Loop
# Each epoch: one optimisation pass over train_loader, one gradient-free pass over
# val_loader, then checkpoint on improvement or count towards early stopping.
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, targets in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
        images = images.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(images)  # outputs: [batch_size, num_attributes, embedding_dim]
        # The dataset yields flat targets; reshape them to match the model output.
        targets = targets.view(-1, num_attributes, embedding_dim)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch figure is a per-sample mean.
        running_loss += loss.item() * images.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}')

    # Validation Loop
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, targets in tqdm(val_loader, desc="Validation"):
            images = images.to(device)
            targets = targets.to(device)
            outputs = model(images)
            targets = targets.view(-1, num_attributes, embedding_dim)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * images.size(0)
    val_loss /= len(val_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}')

    # Early Stopping Check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        # (the same file is overwritten on every improvement, so only the best weights remain)
        torch.save(model.state_dict(), f'/kaggle/working/{category}_best_model.pth')
        print(f'Validation loss improved. Model saved.')
    else:
        epochs_no_improve += 1
        print(f'No improvement in validation loss for {epochs_no_improve} epoch(s).')

    # Stop once the patience budget of non-improving epochs is used up.
    if epochs_no_improve >= early_stopping_patience:
        print(f'Early stopping triggered. No improvement in validation loss for {early_stopping_patience} consecutive epochs.')
        break

Training Epoch 1/15: 100%|██████████| 409/409 [04:46<00:00,  1.43it/s]


Epoch 1/15, Training Loss: 0.0140


Validation: 100%|██████████| 46/46 [00:17<00:00,  2.58it/s]


Epoch 1/15, Validation Loss: 0.0109
Validation loss improved. Model saved.


Training Epoch 2/15: 100%|██████████| 409/409 [04:13<00:00,  1.61it/s]


Epoch 2/15, Training Loss: 0.0106


Validation: 100%|██████████| 46/46 [00:11<00:00,  3.87it/s]


Epoch 2/15, Validation Loss: 0.0103
Validation loss improved. Model saved.


Training Epoch 3/15:  41%|████      | 167/409 [01:43<02:29,  1.62it/s]

In [38]:
!rm -rf *1.pth
!rm -rf *2.pth
!rm -rf *3.pth
!rm -rf *4.pth
!rm -rf *5.pth
!rm -rf *6.pth
!rm -rf *7.pth
!rm -rf *8.pth
!rm -rf *9.pth


In [46]:
# 7. Evaluation
# For evaluation, you can compute the cosine similarity between predicted embeddings and actual embeddings
def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: First vector (array-like).
        b: Second vector (array-like), same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has zero
        norm. All-zero vectors occur here because out-of-vocabulary attribute
        values are embedded as zeros; the unguarded division would yield NaN.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Example on the validation set
# model.eval()
# with torch.no_grad():
#     for images, targets in val_loader:
#         images = images.to(device)
#         outputs = model(images)
#         outputs = outputs.cpu().numpy()
#         targets = targets.view(-1, num_attributes, embedding_dim).cpu().numpy()
#         for i in range(outputs.shape[0]):
#             for j in range(num_attributes):
#                 pred_embedding = outputs[i, j, :]
#                 true_embedding = targets[i, j, :]
#                 similarity = cosine_similarity(pred_embedding, true_embedding)
#                 print(f'Attribute {train_dataset.attributes[j]} Similarity: {similarity:.4f}')

In [47]:
# Save the trained model
# torch.save(model.state_dict(), '/kaggle/working/2_best_model.pth')

In [None]:
# Initialize the model architecture
model = ViTForAttributePrediction(
    embedding_dim=embedding_dim,
    num_attributes=num_attributes
)
model.to(device)

# Load the saved model weights
# map_location remaps tensors saved during a GPU run onto the current device, so
# the checkpoint also loads in a CPU-only session.
model.load_state_dict(
    torch.load(f'/kaggle/working/{category}_best_model.pth', map_location=device)
)
model.eval()


In [None]:
# Load test data
# The test dataset reads column 0 as the image id, and the 'id' and 'Category'
# columns are carried into the final predictions table.
test_data = pd.read_csv(f'/kaggle/input/m1dataset/test_folder/test_folder/test_{category}.csv')

# (disabled) Alternative that stores an explicit image path per row:
# # Create a column for image paths based on 'id' and 'Category'
# # Adjust the path construction according to your directory structure
# test_data['image_path'] = test_data.apply(
#     lambda row: os.path.join('images', row['Category'], f"{row['id']}.jpg"),
#     axis=1
# )


In [None]:
class TestDataset(Dataset):
    """Unlabelled image dataset used for inference.

    Each item is the preprocessed RGB image of one row.

    Args:
        dataframe: Rows to serve; column 0 holds the integer image id.
        img_dir: Directory with the images, named ``<id zero-padded to 6>.jpg``.
        transform: Optional callable applied to the PIL image. When omitted,
            the pretrained ViT feature extractor preprocesses the image.
    """

    def __init__(self, dataframe, img_dir, transform=None):
        self.img_dir = img_dir
        self.dataframe = dataframe
        self.data = dataframe.reset_index(drop=True)
        self.transform = transform
        # Only fetch the pretrained preprocessor when it will actually be used.
        self.feature_extractor = (
            ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
            if transform is None
            else None
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Image file name is the id from column 0, zero-padded to six digits.
        image_id = int(self.data.iloc[idx, 0])
        img_path = os.path.join(self.img_dir, f"{image_id:06d}.jpg")
        # The context manager closes the file handle once the pixels are converted.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        if self.transform is not None:
            return self.transform(image)
        return self.feature_extractor(images=image, return_tensors='pt')['pixel_values'].squeeze(0)


In [None]:
# Build the inference dataset and loader with the same transform object used for
# training; shuffle=False keeps batches in the row order of test_data.
test_dataset = TestDataset(test_data, "/kaggle/input/m1dataset/test_images", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [None]:
# Attributes to predict, in the order of the model's output slots
attributes = ['color', 'neck', 'pattern', 'print_or_pattern_type', 'sleeve_length']

# Per attribute: the distinct non-null training values and a matrix with one
# embedding row per value, in the same order.
attribute_value_embeddings = {}

for attr in attributes:
    unique_values = data[attr].dropna().unique()
    attribute_value_embeddings[attr] = {
        'values': unique_values,
        'embeddings': np.stack([get_embedding(value) for value in unique_values]),
    }


In [None]:
def find_closest_attribute(embedding, attr):
    """Return the known value of ``attr`` whose embedding is most cosine-similar
    to ``embedding``.

    Args:
        embedding: Predicted embedding vector, shape [embedding_dim].
        attr: Key into ``attribute_value_embeddings``.

    Returns:
        The entry of the attribute's ``'values'`` array with the highest
        similarity (the first one on ties).
    """
    candidates = attribute_value_embeddings[attr]
    candidate_vectors = candidates['embeddings']  # [num_values, embedding_dim]
    # The small constant keeps the division defined when a norm is zero.
    norm_products = (
        np.linalg.norm(candidate_vectors, axis=1) * np.linalg.norm(embedding) + 1e-10
    )
    scores = np.dot(candidate_vectors, embedding) / norm_products
    return candidates['values'][np.argmax(scores)]


In [None]:
# One dict per test image, mapping attribute name -> predicted value
predictions = []

model.eval()
with torch.no_grad():
    for batch_images in tqdm(test_loader, desc="Testing"):
        # [batch_size, num_attributes, embedding_dim] as a NumPy array on the host
        batch_embeddings = model(batch_images.to(device)).cpu().numpy()

        for sample_embeddings in batch_embeddings:
            # Slot j of the output belongs to attributes[j]; snap each predicted
            # embedding to the closest known value of that attribute.
            predictions.append({
                attr: find_closest_attribute(sample_embeddings[j], attr)
                for j, attr in enumerate(attributes)
            })


In [None]:
# Convert predictions to DataFrame
# (one row per entry of `predictions`, one column per attribute key)
predictions_df = pd.DataFrame(predictions)

# Concatenate with test_data
# The id/Category columns get a fresh 0..n-1 index so they line up row by row
# with predictions_df, which was produced in loader order (shuffle=False).
test_results = pd.concat([test_data[['id', 'Category']].reset_index(drop=True), predictions_df], axis=1)


In [None]:
# Save to CSV (written relative to the current working directory)
output_path = f'test_predictions_new_model_{category}.csv'
test_results.to_csv(output_path, index=False)

# Report the file that was actually written.
print(f"Predictions saved to '{output_path}'")


In [10]:
# Delete anything in the current directory whose name ends in "9.pth".
!rm -rf *9.pth