In [1]:
from pymongo import MongoClient
import pandas as pd

In [2]:
#connecting to DB
URI = f"Your URI"
client = MongoClient(URI)
db = client['youtube_comments']
collection = db['tech_comments']

In [4]:
cursor = collection.find({})  # '_id': 0 excludes the _id field
# cursor.collection.find({"comment": {"$regex": "suggest", "$options": "i"}})
# Convert the MongoDB cursor to a list of documents
data = list(cursor)
print(len(data))

11053


In [26]:
all_comments = []

# Iterate over all documents in the collection
for document in collection.find():
    if 'comments' in document:
        for comment in document['comments']:
            # print(comment)
            # break
            all_comments.append(comment.get('original_text'))

In [29]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Load pre-trained GPT-2
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add a new pad token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Update the model with the new pad token
model.resize_token_embeddings(len(tokenizer))

# Prepare the dataset class
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.input_ids = []
        self.attention_masks = []
        self.labels = []
        
        for text in texts:
            
            # Tokenize and add padding and truncation
            encoding = tokenizer(text, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
            input_ids = encoding['input_ids'].squeeze(0)  # Remove batch dimension
            attention_mask = encoding['attention_mask'].squeeze(0)  # Remove batch dimension
            
            # Labels for GPT-2 are the same as the input_ids, shifted by one token
            labels = input_ids.clone()
            labels[labels == tokenizer.pad_token_id] = -100  # Ignore pad tokens when computing loss
            
            self.input_ids.append(input_ids)
            self.attention_masks.append(attention_mask)
            self.labels.append(labels)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]  # Provide the labels for GPT-2
        }

# Example train data (replace with your actual text data)
train_texts = all_comments[0:100]  # This is just an example, use your actual data

# Initialize dataset
train_dataset = TextDataset(train_texts, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    logging_dir='./logs',            # directory for storing logs
    no_cuda=True,                    # Set to True since we're using CPU (disable CUDA)
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # the pre-trained model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
)

# Train the model
trainer.train()





Step,Training Loss


TrainOutput(global_step=75, training_loss=3.4832389322916666, metrics={'train_runtime': 235.2633, 'train_samples_per_second': 1.275, 'train_steps_per_second': 0.319, 'total_flos': 78387609600000.0, 'train_loss': 3.4832389322916666, 'epoch': 3.0})

In [31]:
#Sample prompt
prompt_text = "Suggest a tech idea"

# Tokenize the prompt text
input_ids = tokenizer.encode(prompt_text, return_tensors='pt')

# Generate text from the model
output = model.generate(input_ids, 
                        max_length=50,    # Maximum length of the generated sequence
                        num_return_sequences=1,  # Number of generated sequences
                        no_repeat_ngram_size=2,  # Avoid repeating n-grams
                        temperature=0.7,  # Sampling temperature (lower means more deterministic)
                        top_k=50,         # Top-k sampling
                        top_p=0.95,       # Top-p (nucleus) sampling
                        pad_token_id=tokenizer.pad_token_id)  # Padding token ID

# Decode the generated output and print it
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)



Suggest a tech idea to help you understand the basics of Python.
I'm a Python beginner and I'm starting to learn Python from scratch. I am a beginner with a passion for coding and a great way to get started. Thanks for sharing your
