In [None]:
# Attach Google Drive to the Colab filesystem so the project files are reachable
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the pre-split train / validation / test data.
# All three parquet files live in the same Drive folder, so the folder is
# spelled out once and the file names are appended to it.
split_dir = (
    "/content/drive/MyDrive/GitHub_Repos/"
    "CS610-Product-Image-Text-Consistency-Detection-System-for-E-commerce/"
    "amazon_meta_data/split_data"
)
train_path = f"{split_dir}/train_data.parquet"
val_path = f"{split_dir}/val_data.parquet"
test_path = f"{split_dir}/test_data.parquet"

# Read each split into its own DataFrame
train_df = pd.read_parquet(train_path)
val_df = pd.read_parquet(val_path)
test_df = pd.read_parquet(test_path)

In [None]:
import os
import json
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

def load_distillation_data(output_dir, batch_size=32, num_workers=2):
    """
    Rebuild the distillation_data dictionary from the files saved in output_dir.

    Reads 'dataset_info.json' (for the 'max_length' and 'bert_model' entries)
    and the three filtered DataFrames ('train_df_valid.csv',
    'val_df_valid.csv', 'test_df_valid.csv'), then recreates the tokenizer,
    the image transforms, the datasets and the DataLoaders.

    The '*_valid_indices.npy' files in output_dir are not read: every row of
    the filtered CSVs is used (positions 0 .. len(df) - 1), so the saved
    index arrays are not needed here.

    Parameters:
    -----------
    output_dir : str
        Path to the directory where the distillation files were saved
    batch_size : int, optional
        Batch size for DataLoaders, default is 32
    num_workers : int, optional
        Number of workers for DataLoaders, default is 2

    Returns:
    --------
    dict
        Reconstructed distillation_data dictionary containing:
        - train_loader, val_loader, test_loader: DataLoader objects
        - train_dataset, val_dataset, test_dataset: Dataset objects
        - tokenizer: BERT tokenizer
        - image_transforms: torchvision transforms
        - train_df_valid, val_df_valid, test_df_valid: Filtered DataFrames

    Raises:
    -------
    FileNotFoundError
        If 'dataset_info.json' or one of the CSV files is missing
    KeyError
        If 'dataset_info.json' lacks the 'max_length' or 'bert_model' entry
    """
    # 1. Load dataset information
    with open(os.path.join(output_dir, 'dataset_info.json'), 'r') as f:
        dataset_info = json.load(f)

    # Extract necessary parameters
    max_length = dataset_info['max_length']
    bert_model_name = dataset_info['bert_model']

    # 2. Load valid DataFrames (already filtered, one row per usable sample)
    train_df_valid = pd.read_csv(os.path.join(output_dir, 'train_df_valid.csv'))
    val_df_valid = pd.read_csv(os.path.join(output_dir, 'val_df_valid.csv'))
    test_df_valid = pd.read_csv(os.path.join(output_dir, 'test_df_valid.csv'))

    # 3. Reload tokenizer and define image transformations
    cuda_available = torch.cuda.is_available()
    device = torch.device('cuda' if cuda_available else 'cpu')
    # Informational only: nothing in this function is moved to the device
    print(f"Using device: {device}")

    # Load BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)

    # Target image size; also used for the placeholder returned when an image
    # cannot be read, so both always have the same shape
    image_size = (224, 224)

    # Define image transformations (same as in the original code)
    image_transforms = transforms.Compose([
        transforms.Resize(image_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],  # ImageNet mean normalization
            std=[0.229, 0.224, 0.225]    # ImageNet std normalization
        )
    ])

    # 4. Recreate the dataset class
    class DistillationDataset(Dataset):
        """
        Dataset class for handling text and image data for distillation.

        Each item pairs the tokenized text of a row with the transformed image
        found at the row's image path and the row's 'is_match' label.
        valid_indices always holds row POSITIONS (as used by DataFrame.iloc),
        never index labels.
        """
        def __init__(self, df, tokenizer, transform, text_column, image_column, max_length, valid_indices=None):
            self.df = df
            self.tokenizer = tokenizer
            self.transform = transform
            self.text_column = text_column
            self.image_column = image_column
            self.max_length = max_length

            # If valid indices are provided, use them directly
            if valid_indices is not None:
                self.valid_indices = valid_indices
            else:
                # Otherwise keep the positions of rows whose image file exists.
                # Positions (not index labels) are stored because __getitem__
                # reads rows with .iloc; labels would select the wrong rows on
                # a frame whose index is not 0..n-1.
                image_paths = tqdm(df[image_column], total=len(df), desc="Validating image paths")
                self.valid_indices = [
                    position
                    for position, path in enumerate(image_paths)
                    if pd.notna(path) and os.path.exists(path)
                ]

            print(f"Valid samples: {len(self.valid_indices)}/{len(df)}")

        def __len__(self):
            return len(self.valid_indices)

        def __getitem__(self, idx):
            # int() normalizes numpy integers so the 'idx' tensor dtype is stable
            orig_idx = int(self.valid_indices[idx])
            row = self.df.iloc[orig_idx]
            text = row[self.text_column]
            image_path = row[self.image_column]
            label = row['is_match']

            # read_csv turns empty cells into NaN (a float), which the
            # tokenizer cannot encode; treat a missing text as an empty string
            text = "" if pd.isna(text) else str(text)

            # Process text
            encoding = self.tokenizer(
                text,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            # Drop only the leading batch dimension; a bare squeeze() would
            # also drop the sequence dimension when max_length == 1
            input_ids = encoding['input_ids'].squeeze(0)
            attention_mask = encoding['attention_mask'].squeeze(0)

            # Process image (best effort: an unreadable image must not abort
            # the whole epoch, so it is replaced by a placeholder)
            try:
                with Image.open(image_path) as raw_image:
                    image = raw_image.convert('RGB')
                image_tensor = self.transform(image)
            except Exception as e:
                print(f"Error processing image {image_path}: {e}")
                # Return a zero tensor as placeholder
                image_tensor = torch.zeros((3, *image_size))

            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'image': image_tensor,
                'label': torch.tensor(label, dtype=torch.long),
                'idx': torch.tensor(orig_idx)
            }

    # 5. Create datasets from the filtered DataFrames
    text_column = 'text_for_deep'
    image_column = 'processed_image_path'

    def build_dataset(df):
        # The DataFrames are already filtered, so every row position is valid
        return DistillationDataset(
            df, tokenizer, image_transforms,
            text_column, image_column, max_length,
            valid_indices=range(len(df))
        )

    print("\nRecreating datasets...")
    train_dataset = build_dataset(train_df_valid)
    val_dataset = build_dataset(val_df_valid)
    test_dataset = build_dataset(test_df_valid)

    # 6. Create DataLoaders
    def build_loader(dataset, shuffle):
        # Pinned host memory is only requested when a GPU is present
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            pin_memory=cuda_available
        )

    print("\nCreating DataLoaders...")
    train_loader = build_loader(train_dataset, shuffle=True)   # only training is shuffled
    val_loader = build_loader(val_dataset, shuffle=False)
    test_loader = build_loader(test_dataset, shuffle=False)

    # 7. Rebuild the distillation_data dictionary
    distillation_data = {
        'train_loader': train_loader,
        'val_loader': val_loader,
        'test_loader': test_loader,
        'train_dataset': train_dataset,
        'val_dataset': val_dataset,
        'test_dataset': test_dataset,
        'tokenizer': tokenizer,
        'image_transforms': image_transforms,
        'train_df_valid': train_df_valid,
        'val_df_valid': val_df_valid,
        'test_df_valid': test_df_valid
    }

    print("\nLoading data complete!")
    print(f"Training set size: {len(train_dataset)}")
    print(f"Validation set size: {len(val_dataset)}")
    print(f"Test set size: {len(test_dataset)}")

    return distillation_data

# Drive folder that holds the saved distillation artefacts
output_dir = '/content/drive/MyDrive/GitHub_Repos/CS610-Product-Image-Text-Consistency-Detection-System-for-E-commerce/amazon_meta_data/distillation_features'

# Rebuild everything from disk, then expose the pieces used by later cells
distillation_data = load_distillation_data(output_dir)
train_loader, val_loader, test_loader, tokenizer = (
    distillation_data[entry]
    for entry in ('train_loader', 'val_loader', 'test_loader', 'tokenizer')
)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]


Recreating datasets...
Valid samples: 35923/35923
Valid samples: 5132/5132
Valid samples: 10265/10265

Creating DataLoaders...

Data recovery complete!
Training set size: 35923
Validation set size: 5132
Test set size: 10265
