In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load your dataset
df = pd.read_csv("file_with_titles.csv")  # Replace with the path to your dataset

# Verify dataset structure
print(df.head())

# Ensure columns are properly named
assert "video_id" in df.columns and "comments" in df.columns and "title" in df.columns

# Combine video titles with comments as pairs
df['text_pair'] = df['comments'] + " [SEP] " + df['title']

# Add dummy labels (you can replace this with actual relevance scores or binary labels if available)
df['label'] = 1.0  # Assuming all pairs are relevant for now; modify as per your use case

df['comment_length'] = df['comments'].apply(len)
plt.hist(df['comment_length'], bins=30, color='skyblue', edgecolor='black')
plt.title("Distribution of Comment Lengths")
plt.xlabel("Comment Length")
plt.ylabel("Frequency")
plt.show()

# Train-test split
train_df, val_df = train_test_split(df[['text_pair', 'label']], test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)


      video_id                                           comments  \
0  4sbklcQ0EXc                                     tri clerk free   
1  4sbklcQ0EXc  start work mern microservic microfrontend arch...   
2  4sbklcQ0EXc  hey add featur join friend listen along know w...   
3  4sbklcQ0EXc  hello bro get bug clerkmiddlewar appus clerkmi...   
4  4sbklcQ0EXc  next project idea mern notion clone rtk queri ...   

                                               title  
0  Advanced Spotify Clone: Build & Deploy a MERN ...  
1  Advanced Spotify Clone: Build & Deploy a MERN ...  
2  Advanced Spotify Clone: Build & Deploy a MERN ...  
3  Advanced Spotify Clone: Build & Deploy a MERN ...  
4  Advanced Spotify Clone: Build & Deploy a MERN ...  


In [2]:
pip install datasets




In [3]:
pip install transformers datasets


Note: you may need to restart the kernel to use updated packages.


In [None]:
from transformers import AutoTokenizer
from datasets import Dataset
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-distilroberta-base-v1")

# Tokenize the dataset
def tokenize_function(examples):
    # Tokenize the text pairs
    return tokenizer(examples['text_pair'], padding="max_length", truncation=True, max_length=128)

# Apply the tokenization function to the training and validation sets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Remove the 'text_pair' column as it is no longer needed
train_dataset = train_dataset.remove_columns(["text_pair"])
val_dataset = val_dataset.remove_columns(["text_pair"])

# Rename 'label' column to match Hugging Face's expectations
train_dataset = train_dataset.rename_column("label", "labels")
val_dataset = val_dataset.rename_column("label", "labels")

train_dataset.set_format("torch")
val_dataset.set_format("torch")

# Visualize token length distribution
train_token_lengths = [len(tokenizer(example['text_pair'])['input_ids']) for example in train_df.to_dict(orient='records')]
val_token_lengths = [len(tokenizer(example['text_pair'])['input_ids']) for example in val_df.to_dict(orient='records')]

plt.hist(train_token_lengths, bins=30, alpha=0.7, label='Train Dataset', color='blue', edgecolor='black')
plt.hist(val_token_lengths, bins=30, alpha=0.7, label='Validation Dataset', color='green', edgecolor='black')
plt.title("Token Length Distribution")
plt.xlabel("Number of Tokens")
plt.ylabel("Frequency")
plt.legend()
plt.show()

# Verify tokenized dataset
print(train_dataset[0])  # Print the first entry of the training dataset to verify

Map:   0%|          | 0/23833 [00:00<?, ? examples/s]

Map:   0%|          | 0/2649 [00:00<?, ? examples/s]

{'labels': tensor(1.), '__index_level_0__': tensor(7653), 'input_ids': tensor([    0, 14406,  1994,  3157,   646,  3388,   510,   742,  1534,   256,
        29092, 31197,   208, 38680,    11,   666,   116, 33958, 11649,  1721,
         1018,  5245,  8771,    11, 15294,     2,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1, 

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the model
model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/paraphrase-distilroberta-base-v1", num_labels=1)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()





Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/paraphrase-distilroberta-base-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/4470 [00:00<?, ?it/s]

{'loss': 0.0434, 'grad_norm': 0.698472261428833, 'learning_rate': 1.9776286353467565e-05, 'epoch': 0.03}
{'loss': 0.0078, 'grad_norm': 0.7430528402328491, 'learning_rate': 1.9552572706935124e-05, 'epoch': 0.07}
{'loss': 0.0066, 'grad_norm': 0.6802736520767212, 'learning_rate': 1.9328859060402687e-05, 'epoch': 0.1}
{'loss': 0.0074, 'grad_norm': 0.3465728461742401, 'learning_rate': 1.9105145413870246e-05, 'epoch': 0.13}
{'loss': 0.0053, 'grad_norm': 0.8693322539329529, 'learning_rate': 1.888143176733781e-05, 'epoch': 0.17}
{'loss': 0.0048, 'grad_norm': 0.20168718695640564, 'learning_rate': 1.865771812080537e-05, 'epoch': 0.2}
{'loss': 0.0049, 'grad_norm': 0.6426231861114502, 'learning_rate': 1.8434004474272932e-05, 'epoch': 0.23}
{'loss': 0.0041, 'grad_norm': 0.24735701084136963, 'learning_rate': 1.8210290827740495e-05, 'epoch': 0.27}
{'loss': 0.0037, 'grad_norm': 0.550957441329956, 'learning_rate': 1.7986577181208054e-05, 'epoch': 0.3}
{'loss': 0.0033, 'grad_norm': 0.4024861752986908, '

  0%|          | 0/166 [00:00<?, ?it/s]

{'eval_loss': 0.002390013076364994, 'eval_runtime': 165.4403, 'eval_samples_per_second': 16.012, 'eval_steps_per_second': 1.003, 'epoch': 1.0}
{'loss': 0.0022, 'grad_norm': 0.12640808522701263, 'learning_rate': 1.3288590604026848e-05, 'epoch': 1.01}
{'loss': 0.0017, 'grad_norm': 0.3174171447753906, 'learning_rate': 1.3064876957494407e-05, 'epoch': 1.04}
{'loss': 0.0023, 'grad_norm': 0.26144808530807495, 'learning_rate': 1.284116331096197e-05, 'epoch': 1.07}
{'loss': 0.0019, 'grad_norm': 0.2745983898639679, 'learning_rate': 1.2617449664429532e-05, 'epoch': 1.11}
{'loss': 0.0017, 'grad_norm': 0.11088992655277252, 'learning_rate': 1.2393736017897093e-05, 'epoch': 1.14}
{'loss': 0.0018, 'grad_norm': 0.12338884174823761, 'learning_rate': 1.2170022371364654e-05, 'epoch': 1.17}
{'loss': 0.0017, 'grad_norm': 0.49450385570526123, 'learning_rate': 1.1946308724832217e-05, 'epoch': 1.21}
{'loss': 0.0017, 'grad_norm': 0.2701670825481415, 'learning_rate': 1.1722595078299776e-05, 'epoch': 1.24}
{'los

  0%|          | 0/166 [00:00<?, ?it/s]

{'eval_loss': 0.0038663328159600496, 'eval_runtime': 159.6187, 'eval_samples_per_second': 16.596, 'eval_steps_per_second': 1.04, 'epoch': 2.0}
{'loss': 0.0015, 'grad_norm': 0.2808120846748352, 'learning_rate': 6.5771812080536925e-06, 'epoch': 2.01}
{'loss': 0.0014, 'grad_norm': 0.16792817413806915, 'learning_rate': 6.353467561521254e-06, 'epoch': 2.05}
{'loss': 0.0013, 'grad_norm': 0.09805745631456375, 'learning_rate': 6.129753914988815e-06, 'epoch': 2.08}
{'loss': 0.0014, 'grad_norm': 0.24636541306972504, 'learning_rate': 5.906040268456377e-06, 'epoch': 2.11}
{'loss': 0.0014, 'grad_norm': 0.25639012455940247, 'learning_rate': 5.682326621923938e-06, 'epoch': 2.15}


In [None]:
# Evaluate the model
results = trainer.evaluate()
print(f"Evaluation results: {results}")

# Visualize training and validation loss
training_logs = trainer.state.log_history
train_loss = [entry['loss'] for entry in training_logs if 'loss' in entry]
eval_loss = [entry['eval_loss'] for entry in training_logs if 'eval_loss' in entry]
epochs = range(1, len(train_loss) + 1)

plt.plot(epochs, train_loss, label='Training Loss', marker='o')
plt.plot(epochs, eval_loss, label='Validation Loss', marker='o', color='orange')
plt.title("Training and Validation Loss Over Epochs")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Confusion Matrix (assuming binary classification for simplicity)
predictions = trainer.predict(val_dataset).predictions
predicted_labels = np.argmax(predictions, axis=1)
true_labels = val_dataset['labels']

cm = confusion_matrix(true_labels, predicted_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

# Visualize embeddings
model.eval()
with torch.no_grad():
    embeddings = model(torch.tensor(train_dataset['input_ids']).to("cpu")).last_hidden_state[:, 0, :].numpy()

pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=train_dataset['labels'], cmap='viridis', alpha=0.7)
plt.title("2D Visualization of Learned Embeddings")
plt.colorbar(label='Labels')
plt.show()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_paraphrase_model")
tokenizer.save_pretrained("./fine_tuned_paraphrase_model")


In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned model
model = AutoModelForSequenceClassification.from_pretrained("./fine_tuned_paraphrase_model")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_paraphrase_model")

# Function to compute relevance scores
def compute_relevance_score(comment, title):
    text_pair = comment + " [SEP] " + title
    inputs = tokenizer(text_pair, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        logits = model(**inputs).logits
    score = torch.sigmoid(logits).item()  # Use sigmoid to convert to a probability score
    return score

# Example usage
comment = "What are the advantages of using MERN stack?"
title = "Introduction to MERN Stack Development"
score = compute_relevance_score(comment, title)
print(f"Relevance score: {score}")


In [10]:
pip install 'accelerate>=0.26.0'


Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: "'accelerate"


In [11]:
pip install torch torchvision



Collecting torchvision
  Downloading torchvision-0.20.1-cp39-cp39-win_amd64.whl (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 1.1 MB/s eta 0:00:00
Installing collected packages: torchvision
Successfully installed torchvision-0.20.1
