**Step 0 :** Prep the data frame for dataloader creation

In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


df = pd.read_csv('../dataset/train_val.csv')


df['date'] = pd.to_datetime(df['date'])
# Extract date features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek
df['quarter'] = df['date'].dt.quarter

# Check if date has timezone information
has_tz = df['date'].dt.tz is not None

# Fix the timezone issue with days_since_upload calculation
if has_tz:
    # Method 1: Make reference date timezone-aware
    from datetime import timezone
    reference_date = datetime.now(timezone.utc)
else:
    # Method 2: Use a timezone-naive reference date
    reference_date = pd.Timestamp.now().tz_localize(None)


# Calculate days since upload (using the date of the most recent video as reference)
df['days_since_upload'] = (reference_date - df['date']).dt.days

# Encode channel names
channel_encoder = LabelEncoder()
df['channel_encoded'] = channel_encoder.fit_transform(df['channel'])

sentence_encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
all_embeddings = []

for idx in tqdm(range(len(df))):
    row = df.iloc[idx]
    # Get embeddings using your existing method
    title = row['title']
    description = row['description']
    channel = row['channel']
    date = row['date']
    field_embeddings = sentence_encoder.encode([title, description, channel, date], convert_to_tensor=True)
    # Store them in numpy arrays
    all_embeddings.append(field_embeddings.cpu().numpy())
    
df['embeddings'] = all_embeddings    
df.head()

100%|██████████| 15482/15482 [01:17<00:00, 198.85it/s]


Unnamed: 0.1,Unnamed: 0,id,channel,title,date,description,views,year,month,day,day_of_week,quarter,days_since_upload,channel_encoded,embeddings
0,0,--2s6hjGrm4,UC-1rx8j9Ggp8mp4uD0ZdEIA,"CGI & VFX Breakdowns: ""Warzone"" - by Ramesh Th...",2020-12-15 05:00:01+00:00,"Check out this revealing VFX Breakdown ""Warzon...",12299,2020,12,15,1,4,1611,0,"[[-0.07760659, -0.001022775, -0.09010337, -0.0..."
1,1,--DnfroyKQ8,UC-1rx8j9Ggp8mp4uD0ZdEIA,"A Sci-Fi Short Film: ""Exit"" - by Ng King Kwan ...",2020-07-01 16:00:00+00:00,"TheCGBros Presents ""Exit"" by Ng King Kwan - Th...",7494,2020,7,1,2,3,1777,0,"[[-0.022426384, 0.05459995, -0.0177436, 0.0594..."
2,2,--aiU7VQKEw,UC-1rx8j9Ggp8mp4uD0ZdEIA,"CGI 3D Animated Short: ""Lost Love"" - by Akash ...",2019-02-18 20:30:00+00:00,"TheCGBros Presents ""Lost Love"" by Akash Manack...",11831,2019,2,18,0,1,2276,0,"[[-0.11143896, 0.022581432, 0.016571341, -0.02..."
3,6,-0SrlZAvSVM,UCW6NyJ6oFLPTnx7iGRZXDDg,Jo Goes Hunting - Careful | Animated music vid...,2020-03-10 14:30:01+00:00,"On the borderless map of a magical planet, lit...",2248,2020,3,10,1,1,1890,28,"[[-0.021549331, 0.040397692, -0.0008517903, -0..."
4,10,-13Y2Pe7kFs,UC-1rx8j9Ggp8mp4uD0ZdEIA,"CGI VFX Breakdown: ""Logan (Wolverine): Digital...",2017-09-20 20:13:52+00:00,Check out this outstanding behind-the-scenes l...,113806,2017,9,20,2,3,2792,0,"[[-0.08767335, -0.07205786, 0.027961658, -0.06..."


**Step 1 :** Instantiate Dataloader

In [None]:
from sklearn.model_selection import train_test_split
from dataset import MultiModalDataset
import torchvision.transforms as T

transform = T.Compose([
    T.RandomResizedCrop(224, scale=(0.8, 1.0)),
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),  # mimic thumbnail color pop
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=[0.5]*3, std=[0.5]*3),
]) 

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = MultiModalDataset(train_df, transform=transform)
val_dataset = MultiModalDataset(val_df, transform=transform)


