In [7]:
%pip install pymongo
%pip install torch
%pip install --upgrade transformers accelerate

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl.metadata
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Obtaining dependency information for tokenizers<0.21,>=0.20 from https://files.pythonhosted.org/packages/aa/14/e75ece72e99f6ef9ae07777ca9fdd78608f69466a5cecf636e9bd2f25d5c/tokenizers-0.20.3-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading tokenizers-0.20.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━

In [1]:
from pymongo import MongoClient
import pandas as pd

In [2]:
#connecting to DB
URI = f"mongodb+srv://doadmin:2Vu93L57a6F08EZT@db-mongodb-nyc3-74933-c66242f4.mongo.ondigitalocean.com/admin?retryWrites=true&w=majority"
client = MongoClient(URI)
db = client['youtube_comments']
collection = db['trending_video_data']

In [3]:
cursor = collection.find({})  # '_id': 0 excludes the _id field
# cursor.collection.find({"comment": {"$regex": "suggest", "$options": "i"}})
# Convert the MongoDB cursor to a list of documents
data = list(cursor)
print(len(data))

25029


In [4]:
# Dictionary to store the tag to associated videos mapping
tag_association = {}

# Iterate over all documents in the collection
for document in collection.find():
    title = document.get('title', '')
    description = document.get('description', '')
    tags = document.get('tags', [])

    # Iterate through each tag in the 'tags' field of the document
    for tag in tags:
        # If the tag is not in the tag_association dictionary, initialize an entry for it
        if tag not in tag_association:
            tag_association[tag] = []

        # Append the video details (title, description, etc.) to the tag's associated videos list
        tag_association[tag].append({
            'title': title,
            'description': description,
            'tags': tags
        })

# Format the data for GPT fine-tuning
training_data = []

for tag, associated_videos in tag_association.items():
    # Construct input and output
    input_text = f'Input: "Give me video ideas about {tag}"'
    output_text = "Output:\n"

    for idx, video in enumerate(associated_videos, 1):
        output_text += (
            f"Video {idx}:\n"
            f"Video Title: \"{video['title']}\"\n"
            f"Description: \"{video['description']}\"\n"
            f"Tags: \"{', '.join(video['tags'])}\"\n\n"
        )

    # Combine input and output
    training_data.append(f"{input_text}\n{output_text}")

# Optionally, print a sample of the formatted training data
print("\nSample formatted training data:\n")
print("\n".join(training_data[:2]))

# Save the data to a text file for fine-tuning
with open('fine_tuning_data.txt', 'w') as file:
    for example in training_data:
        file.write(example + "\n")



Sample formatted training data:

Input: "Give me video ideas about machine learning"
Output:
Video 1:
Video Title: "Standardization vs Normalization Clearly Explained!"
Description: "Let's understand feature scaling and the differences between standardization and normalization in great detail.
#machinelearning  #datascience #artificialintelligence"
Tags: "machine learning,  normalized nerd,  data science,  data normalization,  standardization,  Standardization vs normalization,  what is feature scaling,  why feature scaling is needed,  why normalization is important,  feature scaling,  normalisation"

Video 2:
Video Title: "Quick explanation: One-hot encoding"
Description: "What is one-hot encoding?
It is a way to feed categorical data to Machine Learning models. "
Tags: "machine learning,  data science,  deep learning,  artificial intelligence,  python,  pandas"

Video 3:
Video Title: "Categorical variable encoding"
Description: "In this video, we implement different categorical enco

In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Load pre-trained GPT-2
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add a new pad token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Update the model with the new pad token
model.resize_token_embeddings(len(tokenizer))

# Prepare the dataset class
class TextDataset(Dataset):
    def __init__(self, tag_association, tokenizer, max_length=512):
        self.input_ids = []
        self.attention_masks = []
        self.labels = []

        # Limit to the first 2000 documents from the tag_association
        limited_tag_association = dict(list(tag_association.items())[:5000])

        # Format the data as a combination of tag and associated video titles
        for tag, associated_videos in limited_tag_association.items():
            # Create a prompt like "Tag: <tag> Associated Videos: <video_titles>"
            video_titles = ", ".join([video['title'] for video in associated_videos])
            text = f"Tag: {tag} Associated Videos: {video_titles}"

            # Tokenize and add padding and truncation
            encoding = tokenizer(text, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
            input_ids = encoding['input_ids'].squeeze(0)  # Remove batch dimension
            attention_mask = encoding['attention_mask'].squeeze(0)  # Remove batch dimension

            # Labels for GPT-2 are the same as the input_ids, shifted by one token
            labels = input_ids.clone()
            labels[labels == tokenizer.pad_token_id] = -100  # Ignore pad tokens when computing loss

            self.input_ids.append(input_ids)
            self.attention_masks.append(attention_mask)
            self.labels.append(labels)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]  # Provide the labels for GPT-2
        }

# Assuming tag_association is already created with your tags and associated videos
# Initialize dataset with the tag_association (limited to 2000 documents)
train_dataset = TextDataset(tag_association, tokenizer)

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Reduce batch size to 2
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # the pre-trained model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
)

# Train the model
trainer.train()

  0%|          | 0/7500 [00:00<?, ?it/s]

{'loss': 5.1484, 'grad_norm': 19.95587158203125, 'learning_rate': 4.993333333333334e-05, 'epoch': 0.0}
{'loss': 4.85, 'grad_norm': 23.232330322265625, 'learning_rate': 4.986666666666667e-05, 'epoch': 0.01}
{'loss': 4.2257, 'grad_norm': 23.772552490234375, 'learning_rate': 4.9800000000000004e-05, 'epoch': 0.01}
{'loss': 4.3401, 'grad_norm': 9.108675956726074, 'learning_rate': 4.973333333333334e-05, 'epoch': 0.02}
{'loss': 3.8987, 'grad_norm': 19.68787384033203, 'learning_rate': 4.966666666666667e-05, 'epoch': 0.02}
{'loss': 3.9493, 'grad_norm': 17.016983032226562, 'learning_rate': 4.96e-05, 'epoch': 0.02}
{'loss': 3.94, 'grad_norm': 28.610811233520508, 'learning_rate': 4.9533333333333336e-05, 'epoch': 0.03}
{'loss': 3.8656, 'grad_norm': 19.529367446899414, 'learning_rate': 4.9466666666666665e-05, 'epoch': 0.03}
{'loss': 3.7699, 'grad_norm': 31.725019454956055, 'learning_rate': 4.94e-05, 'epoch': 0.04}
{'loss': 3.843, 'grad_norm': 28.69346809387207, 'learning_rate': 4.933333333333334e-05

In [None]:
import re

def format_output(raw_output):
    structured_videos = []
    # Use regex or string parsing to extract title, description, and tags
    suggestions = raw_output.split(',')
    for idx, suggestion in enumerate(suggestions):
        structured_videos.append(f"Video {idx + 1}:\nVideo Title: \"{suggestion.strip()}\"\nDescription: \"To be generated.\"\nTags: \"python, programming\"")
    return "\n\n".join(structured_videos)

# Raw model output
raw_output = """How to use multiple substrings in Python, Python Tutorial: How to create and manipulate substring in python, How To Use Multiple Strings In Python - Python for beginners, What is a string? - Learn Python in 5 minutes"""

# Format the output
formatted_output = format_output(raw_output)
print(formatted_output)

In [None]:
# Sample prompt for structured output
prompt_text = """
What video about Python should I make?

Generate the following structured output for video suggestions:

Video 1:
Video Title: "..."
Description: "..."
Tags: "tag1, tag2, tag3"
Video 2:
Video Title: "..."
Description: "..."
Tags: "tag1, tag2, tag3"
"""

# Tokenize the prompt text
model = model.to('cuda')

# Tokenize the prompt text
input_ids = tokenizer.encode(prompt_text, return_tensors='pt')

# Move input_ids to the same device as the model (GPU)
input_ids = input_ids.to('cuda')

# Generate text from the model
output = model.generate(input_ids,
                        max_length=300,    # Increase max length for structured output
                        num_return_sequences=1,  # Number of generated sequences
                        no_repeat_ngram_size=2,  # Avoid repeating n-grams
                        temperature=1.0,  # Increased temperature for more creativity
                        top_k=50,         # Top-k sampling
                        top_p=0.95,       # Top-p (nucleus) sampling
                        pad_token_id=tokenizer.pad_token_id)  # Padding token ID

# Decode the generated output and post-process it
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the formatted output
formatted_output = format_output(generated_text)
print(formatted_output)