In [47]:
import pandas as pd
import re
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import torch

In [None]:
# Load the input dataset from a CSV file located relative to the working directory.
df=pd.read_csv('master_dataset_updated.csv')

In [None]:
# Show the column labels of the frame that was just loaded.
df.columns

Index(['id', 'title', 'description', 'publishedAt', 'tags', 'categoryId',
       'defaultLanguage', 'defaultAudioLanguage', 'thumbnail_default',
       'thumbnail_high', 'duration', 'viewCount', 'likeCount', 'commentCount',
       'privacyStatus', 'channel_id', 'channel_title', 'channel_description',
       'channel_country', 'channel_thumbnail', 'channel_subscriberCount',
       'channel_videoCount', 'is_transcript_available', 'transcript'],
      dtype='object')

In [None]:
def clean_text(text):
    """Normalise a raw text value into lowercase, letters-only words.

    Missing values (``None`` or anything ``pd.isna`` reports as missing) become
    the empty string. Any other non-string value is converted with ``str()``
    first. The text is lowercased, every character that is not an ASCII letter
    or whitespace is removed (so digits, punctuation and non-ASCII letters are
    dropped), and runs of whitespace are collapsed to single spaces with no
    leading or trailing space.

    Args:
        text: The value to clean; usually a string, possibly ``None``/NaN.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    if text is None or pd.isna(text):
        return ""

    lowered = (text if isinstance(text, str) else str(text)).lower()
    # Keep only letters and whitespace; everything else is deleted, not replaced.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # split()/join collapses whitespace runs and trims both ends in one step.
    return ' '.join(letters_only.split())

In [None]:
# Spot-check clean_text on a sample title with mixed case and '|' separators.
clean_text("Why studying bees can teach us about human loneliness | Sarah Kocher | TEDxNewEngland")

'why studying bees can teach us about human loneliness sarah kocher tedxnewengland'

In [None]:
# Derive a cleaned companion column for each free-text column, in this order.
cleaned_column_sources = {
    'title_cleaned': 'title',
    'description_cleaned': 'description',
    'transcript_cleaned': 'transcript',
}
for cleaned_column, raw_column in cleaned_column_sources.items():
    df[cleaned_column] = df[raw_column].apply(clean_text)

In [None]:
# Preview the first few cleaned titles.
df["title_cleaned"].head()

0    scientists cant explain whats happening on thi...
1     million pirate treasure found on the ocean floor
2       if you see a rubber band on your door act fast
3    cleopatras tomb found egypts greatest discover...
4    scientists just solved the mystery of killer w...
Name: title_cleaned, dtype: object

In [None]:
# Parse the duration strings into timedeltas (values that cannot be parsed are
# coerced to NaT), then express each one as a float number of seconds.
parsed_durations = pd.to_timedelta(df['duration'], errors='coerce')
df['duration_seconds'] = parsed_durations.dt.total_seconds()

In [None]:
# Compare the original duration strings with the derived seconds for the first rows.
df[['duration','duration_seconds']].head()

Unnamed: 0,duration,duration_seconds
0,PT3H59M26S,14366.0
1,PT8M50S,530.0
2,PT4H3M28S,14608.0
3,PT3H4M25S,11065.0
4,PT9M14S,554.0


In [None]:
# Write the frame, including the derived columns, to a new CSV without the row index.
df.to_csv('master_dataset_processed.csv', index=False)

In [None]:
# Confirm the cleaned-text and duration columns are now part of the frame.
df.columns

Index(['id', 'title', 'description', 'publishedAt', 'tags', 'categoryId',
       'defaultLanguage', 'defaultAudioLanguage', 'thumbnail_default',
       'thumbnail_high', 'duration', 'viewCount', 'likeCount', 'commentCount',
       'privacyStatus', 'channel_id', 'channel_title', 'channel_description',
       'channel_country', 'channel_thumbnail', 'channel_subscriberCount',
       'channel_videoCount', 'is_transcript_available', 'transcript',
       'title_cleaned', 'description_cleaned', 'transcript_cleaned',
       'duration_seconds'],
      dtype='object')

In [2]:
from sentence_transformers import SentenceTransformer

In [4]:
# Load the pretrained sentence-embedding model identified by this name; the
# progress output below shows the model files being downloaded.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Downloading model.safetensors: 100%|██████████| 90.9M/90.9M [00:29<00:00, 3.07MB/s]
Downloading model.onnx: 100%|██████████| 90.4M/90.4M [00:27<00:00, 3.32MB/s]
Downloading model_O1.onnx: 100%|██████████| 90.4M/90.4M [00:27<00:00, 3.34MB/s]
Downloading model_O2.onnx: 100%|██████████| 90.3M/90.3M [00:24<00:00, 3.70MB/s]
Downloading model_O3.onnx: 100%|██████████| 90.3M/90.3M [00:25<00:00, 3.59MB/s]
Downloading model_O4.onnx: 100%|██████████| 45.2M/45.2M [00:13<00:00, 3.32MB/s]
Downloading model_qint8_arm64.onnx: 100%|██████████| 23.0M/23.0M [00:07<00:00, 3.28MB/s]
Downloading (…)el_qint8_avx512.onnx: 100%|██████████| 23.0M/23.0M [00:06<00:00, 3.51MB/s]
Downloading (…)nt8_avx512_vnni.onnx: 100%|██████████| 23.0M/23.0M [00:06<00:00, 3.54MB/s]
Downloading model_quint8_avx2.onnx: 100%|██████████| 23.0M/23.0M [00:05<00:00, 3.89MB/s]
Downloading openvino_model.bin: 100%|██████████| 90.3M/90.3M [00:23<00:00, 3.88MB/s]
Downloading openvino_model.xml: 211kB [00:00, 68.5MB/s]
Downloading (…)_qint

In [3]:
# Reload the processed dataset from disk into a separate frame for the embedding stage.
# NOTE(review): read_csv parses empty fields as NaN by default, so cleaned texts that
# were "" when written come back as NaN here.
df_e=pd.read_csv('master_dataset_processed.csv')

In [6]:
# Show the column labels of the reloaded frame.
df_e.columns

Index(['id', 'title', 'description', 'publishedAt', 'tags', 'categoryId',
       'defaultLanguage', 'defaultAudioLanguage', 'thumbnail_default',
       'thumbnail_high', 'duration', 'viewCount', 'likeCount', 'commentCount',
       'privacyStatus', 'channel_id', 'channel_title', 'channel_description',
       'channel_country', 'channel_thumbnail', 'channel_subscriberCount',
       'channel_videoCount', 'is_transcript_available', 'transcript',
       'title_cleaned', 'description_cleaned', 'transcript_cleaned',
       'duration_seconds'],
      dtype='object')

In [28]:
# Import first so this cell does not rely on an earlier cell having bound `np`.
import numpy as np

# Switch array printing to NumPy's legacy '1.21' mode, then report the installed version.
np.set_printoptions(legacy='1.21')
print(f"NumPy version: {np.__version__}")

NumPy version: 2.3.3


In [29]:
# Join title and transcript into one text per row.
# After the CSV round trip an empty cleaned text is read back as NaN, and
# NaN + " " + text is NaN, which would discard the title of every row that has
# no transcript (and vice versa). Fill the gaps with "" before concatenating,
# then strip the separator left over when one side is empty.
df_e['combined_title_transcript'] = (
    df_e['title_cleaned'].fillna("") + " " + df_e['transcript_cleaned'].fillna("")
).str.strip()

In [43]:
def chunk_text(text, max_tokens=500, overlap=50):
    """Split ``text`` into overlapping windows of whitespace-separated words.

    "Tokens" here are plain ``str.split()`` words, not model tokens. Consecutive
    windows start ``max_tokens - overlap`` words apart, so each window repeats
    the last ``overlap`` words of the previous one. Chunking stops as soon as a
    window reaches the final word, so no chunk consists solely of words that the
    previous chunk already contained.

    Args:
        text: The text to split. Missing (NaN/None) or empty input yields ``[]``.
        max_tokens: Maximum number of words per chunk; must be at least 1.
        overlap: Number of words shared by consecutive chunks; must be smaller
            than ``max_tokens``, otherwise the window could never advance.

    Returns:
        A list of chunk strings, each made of words joined by single spaces.

    Raises:
        ValueError: If ``max_tokens`` < 1 or ``overlap`` >= ``max_tokens``.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be at least 1")
    if overlap >= max_tokens:
        # With a non-positive step the original loop never terminated.
        raise ValueError("overlap must be smaller than max_tokens")
    if pd.isna(text) or text == "":
        return []

    words = text.split()
    step = max_tokens - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + max_tokens
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This window already includes the last word; another one would
            # only repeat the overlapping tail.
            break

    return chunks

In [49]:
def get_embeddings_pytorch(text, max_tokens=500, overlap=50):
    """Embed ``text`` as one vector by averaging the embeddings of its chunks.

    Works with PyTorch tensors end to end (use pure PyTorch to avoid numpy
    issues): the text is split with ``chunk_text``, the chunks are encoded by
    the module-level ``model`` and the chunk vectors are mean-pooled.

    Args:
        text: The text to embed. ``None``/NaN, empty and whitespace-only values
            yield the zero vector; any other non-string value is converted with
            ``str()``, mirroring ``clean_text``.
        max_tokens: Words per chunk, forwarded to ``chunk_text``.
        overlap: Words shared by consecutive chunks, forwarded to ``chunk_text``.

    Returns:
        A 1-D float32 tensor on the CPU. When there is nothing to embed, or when
        encoding fails (the error is printed), a zero vector of length 384 is
        returned instead.

    NOTE(review): chunks are measured in whitespace words, not model tokens. If
    the model truncates its input below ``max_tokens`` words (all-MiniLM-L6-v2 is
    documented to truncate at 256 word pieces; confirm for the installed
    version), the tail of every chunk is silently ignored; pass a smaller
    ``max_tokens`` in that case.
    """
    # Length of the fallback vector; presumably the model's output size — TODO confirm.
    embedding_dim = 384

    def _zero_embedding():
        # Placeholder for rows that have no usable text or failed to encode.
        return torch.zeros(embedding_dim, dtype=torch.float32)

    if text is None or pd.isna(text):
        return _zero_embedding()
    if not isinstance(text, str):
        text = str(text)
    if not text.strip():
        return _zero_embedding()

    chunks = chunk_text(text, max_tokens=max_tokens, overlap=overlap)
    if not chunks:
        return _zero_embedding()

    try:
        # Encode all chunks in one call and keep the result as a tensor.
        chunk_embeddings = model.encode(chunks, convert_to_tensor=True)

        # Mean-pool across chunks; a 1-D result is passed through unchanged.
        if chunk_embeddings.dim() > 1:
            final_embedding = chunk_embeddings.mean(dim=0)
        else:
            final_embedding = chunk_embeddings

        # Same device and dtype as the zero fallback so rows can be stacked later.
        return final_embedding.cpu().float()

    except Exception as e:
        # Best effort: report the failure and keep processing the other rows.
        print(f"Error processing text: {e}")
        return _zero_embedding()

In [50]:
# Embed each text column row by row; every cell of the new columns holds the
# tensor returned by get_embeddings_pytorch for that row.
embedding_column_sources = {
    'e_title_trans_tensor': 'combined_title_transcript',
    'e_description_tensor': 'description_cleaned',
}
for embedding_column, text_column in embedding_column_sources.items():
    df_e[embedding_column] = df_e[text_column].apply(get_embeddings_pytorch)

In [51]:
# Confirm the combined-text and embedding columns were added.
df_e.columns

Index(['id', 'title', 'description', 'publishedAt', 'tags', 'categoryId',
       'defaultLanguage', 'defaultAudioLanguage', 'thumbnail_default',
       'thumbnail_high', 'duration', 'viewCount', 'likeCount', 'commentCount',
       'privacyStatus', 'channel_id', 'channel_title', 'channel_description',
       'channel_country', 'channel_thumbnail', 'channel_subscriberCount',
       'channel_videoCount', 'is_transcript_available', 'transcript',
       'title_cleaned', 'description_cleaned', 'transcript_cleaned',
       'duration_seconds', 'combined_title_transcript', 'e_title_trans_tensor',
       'e_description_tensor'],
      dtype='object')

In [52]:
# Write every column to CSV. Note that the tensor columns are written through
# str(), i.e. as "tensor([...])" text at torch's print precision, so the CSV
# alone cannot restore the embeddings as numbers.
df_e.to_csv('master_dataset_embedded.csv', index=False)

# Also persist the embeddings losslessly: one stacked (rows x dim) tensor per
# embedding column plus the `id` values, all in the frame's row order.
if len(df_e):
    torch.save(
        {
            'id': df_e['id'].tolist(),
            'e_title_trans_tensor': torch.stack(df_e['e_title_trans_tensor'].tolist()),
            'e_description_tensor': torch.stack(df_e['e_description_tensor'].tolist()),
        },
        'master_dataset_embeddings.pt',
    )