In [1]:
# Add this at the top of your notebook
import torch.multiprocessing as mp
# Set multiprocessing method to 'spawn' instead of 'fork'
if __name__ == "__main__":
    mp.set_start_method('spawn', force=True)

In [2]:
from sentence_transformers import SentenceTransformer
import faiss

sentence_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # dim=384
fields = ['title', 'description', 'channel', 'date']  # etc.
field_embeds = sentence_encoder.encode(fields, convert_to_tensor=True)  # shape: [num_fields, 384]


In [3]:
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig

class MetadataFusion(nn.Module):
    def __init__(self, hidden_dim=384, num_fields=4):
        super().__init__()
        config = BertConfig(
            hidden_size=hidden_dim,
            num_attention_heads=6,
            num_hidden_layers=2,
            intermediate_size=hidden_dim * 4,
            max_position_embeddings=num_fields + 1,  # +1 for [CLS]
            num_labels=1
        )
        self.bert = BertModel(config)
        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_dim))  # [CLS] token
        
        # Positional embeddings to help identify which field is which
        self.field_type_embed = nn.Embedding(num_fields, hidden_dim)
        self.num_fields = num_fields

    def forward(self, field_embeddings):
        """
        field_embeddings: Tensor of shape [batch_size, num_fields, hidden_dim]
        """
        batch_size = field_embeddings.size(0)
        device = field_embeddings.device
        
        # Add learned field-type embeddings (field identity)
        field_positions = torch.arange(self.num_fields, device=device)
        field_type_bias = self.field_type_embed(field_positions)  # [num_fields, hidden_dim]
        field_type_bias = field_type_bias.unsqueeze(0).expand(batch_size, -1, -1)  # [batch, num_fields, hidden_dim]
        enriched_fields = field_embeddings + field_type_bias
        
        # Prepend [CLS] token
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # shape: [batch_size, 1, hidden_dim]
        tokens = torch.cat([cls_tokens, enriched_fields], dim=1)  # [batch, num_fields + 1, dim]

        attention_mask = torch.ones(tokens.shape[:2], dtype=torch.long).to(tokens.device)
        output = self.bert(inputs_embeds=tokens, attention_mask=attention_mask)
        fused = output.last_hidden_state[:, 0]  # take [CLS] token
        return fused  # shape: [batch_size, hidden_dim]


In [4]:
BATCH_SIZE = 128
EPOCHS = 50
NUMBER_WORKERS = 10
PRE_FETCH = 4


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('../dataset/train_with_flashiness.csv')
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

class MetadataDataset(Dataset):
    def __init__(self, df):
        self.df = df
          
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        title = row['title']
        description = row['description']
        channel = row['channel']
        date = row['date']
        target = row['views']
        
        # Encode fields
        field_embeddings = sentence_encoder.encode([title, description, channel, date], convert_to_tensor=True)
        
        return field_embeddings, torch.tensor(target, dtype=torch.float32)
    
train_dataset = MetadataDataset(df_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUMBER_WORKERS, prefetch_factor=PRE_FETCH)
val_dataset = MetadataDataset(df_val)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUMBER_WORKERS, prefetch_factor=PRE_FETCH)

In [7]:
# Add regressor to the model
class MetadataRegressor(nn.Module):
    def __init__(self, hidden_dim=384, num_fields=4):
        super().__init__()
        self.fusion = MetadataFusion(hidden_dim, num_fields)
        self.regressor = nn.Linear(hidden_dim, 1)  # Assuming regression task

    def forward(self, field_embeddings):
        fused = self.fusion(field_embeddings)
        output = self.regressor(fused)
        return output

In [8]:
import matplotlib.pyplot as plt
from IPython.display import clear_output
import gc  # Add garbage collector

# First, clear any existing cache
torch.cuda.empty_cache()
gc.collect()

# Select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move model to device
model = MetadataRegressor().to(device)

# Move field embeddings to device
field_embeds = field_embeds.to(device)

# Optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.HuberLoss(delta=1.0) # Huber loss is less sensitive to outliers than MSE

# Lists to store metrics
train_losses = []
val_losses = []
batch_losses = []

# Create a function to update the plot
def plot_losses():
    clear_output(wait=True)
    plt.figure(figsize=(12, 5))
    
    # Plot batch losses
    plt.subplot(1, 2, 1)
    plt.plot(batch_losses)
    plt.title('Loss per Batch')
    plt.xlabel('Batch')
    plt.ylabel('Loss')
    plt.grid(True)
    
    # Plot epoch losses
    plt.subplot(1, 3, 2)
    epochs = range(1, len(train_losses) + 1)
    plt.plot(epochs, train_losses, 'b-', label='Training')
    plt.plot(epochs, val_losses, 'r-', label='Validation')
    plt.title('Loss per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()


Using device: cuda


In [None]:
# Training loop

for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0.0
    for i, (imgs, targets) in enumerate(train_loader):
        imgs = imgs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        preds = model(imgs)
        loss = loss_fn(preds, targets)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        batch_losses.append(batch_loss)
        total_train_loss += loss.item() * len(imgs)
        
        # Move tensors to CPU to free GPU memory
        imgs = imgs.cpu()
        targets = targets.cpu()
        
        # Optionally plot every N batches
        if (i + 1) % 10 == 0:  # Adjust frequency as needed
            print(f"Epoch {epoch+1}/{EPOCHS} | Batch {i+1}/{len(train_loader)} | Loss: {batch_loss:.4f}")
            
    # Validation phase
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for imgs, targets in val_loader:
            imgs = imgs.to(device)
            targets = targets.to(device)
            
            outputs = model(imgs)
            val_loss = loss_fn(outputs, targets)
            total_val_loss += val_loss.item() * len(imgs)
            
            # Move tensors to CPU to free GPU memory
            imgs = imgs.cpu()
            targets = targets.cpu()


    # Calculate average losses
    avg_train_loss = total_train_loss / len(train_dataset)
    avg_val_loss = total_val_loss / len(val_dataset)
    
    # Store losses for plotting
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    
    # Free up memory
    torch.cuda.empty_cache()
    gc.collect()
    
    plot_losses()
    
plot_losses()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/lib64/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/usr/lib64/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'MetadataDataset' on <module '__main__' (built-in)>


In [None]:
torch.save(model.fusion.state_dict(), 'metadata_fusion.pth')
torch.save(model.regressor.state_dict(), 'metadata_regressor.pth')


In [None]:
import torch
import pandas as pd
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from torchvision import transforms, models
import torch.nn as nn
import os
from tqdm import tqdm

# Load test data
print(f"Test dataset loaded with {len(df_val)} samples")

# Recreate model architecture
model = MetadataRegressor()
model.fusion.load_state_dict(torch.load('metadata_fusion.pth'))
model.regressor.load_state_dict(torch.load('metadata_regressor.pth'))

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Evaluation
model.eval()
predictions = []
actual = []
ids = []

with torch.no_grad():
    for imgs, targets in tqdm(val_loader, desc="Evaluating"):
        imgs = imgs.to(device)
        outputs = model(imgs)
        
        # Store predictions and ground truth
        predictions.extend(outputs.cpu().numpy().tolist())
        actual.extend(targets.numpy().tolist())

ids = df_val['id'].values.tolist()
# Create results dataframe
results = pd.DataFrame({
    'id': ids,
    'predicted_views': predictions
})

# If we have actual flashiness values in the test set, calculate metrics
if "views" in df_val.columns:
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    
    mse = mean_squared_error(actual, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predictions)
    r2 = r2_score(actual, predictions)
    
    print(f"Test Results:")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")
    
    # Plot predictions vs actual
    plt.figure(figsize=(10, 6))
    plt.scatter(actual, predictions, alpha=0.5)
    plt.plot([min(actual), max(actual)], [min(actual), max(actual)], 'r--')
    plt.xlabel('Actual Views')
    plt.ylabel('Predicted Views')
    plt.title('Predicted vs Actual Views')
    plt.grid(True)
    plt.show()

# View the flashiest images according to the model
results_sorted = results.sort_values(by='predicted_views', ascending=False)

# Display top 5 most attractive videos according to the model
plt.figure(figsize=(15, 10))
plt.suptitle("Top 5 Most Attractive Videos (According to Model)", fontsize=16)

for i in range(5):
    if i < len(results_sorted):
        row = results_sorted.iloc[i]
        img_path = f"../dataset/train_val/{row['id']}.jpg"
        
        if os.path.exists(img_path):
            img_path = img_path
        else:
            print(f"Image not found: {row['id']}")
            continue
            
        img = Image.open(img_path)
        plt.subplot(1, 5, i+1)
        plt.imshow(img)
        plt.title(f"Views: {row['predicted_views']:.2f}")
        plt.axis('off')

plt.tight_layout()
plt.show()