In [1]:
#Qn:1
#Use a embedding model to do Sentiment Analysis by finiding cosine similarity between Input and “This Sentence is Positive” and “This Sentence is Negative”. Whichever has the highest value is the class?. Try out with different sentences and different model and see



In [3]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize

# Download the 'punkt_tab' resource
nltk.download('punkt_tab')

# Sample corpus to train Word2Vec
sentences = [
    "This is a positive sentence",
    "This is another positive sentence",
    "This is a negative sentence",
    "I love this product",
    "The movie was terrible"
]

# Tokenize the sentences
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Train Word2Vec model
word2vec_model = Word2Vec(tokenized_sentences, vector_size=50, window=5, min_count=1, workers=4)

# Function to get sentence embeddings by averaging word embeddings
def get_sentence_embedding(sentence, model):
    words = word_tokenize(sentence.lower())
    word_embeddings = [model.wv[word] for word in words if word in model.wv]
    if len(word_embeddings) > 0:
        return np.mean(word_embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

# Function to calculate cosine similarity
def calculate_similarity(text1, text2):
    embedding1 = get_sentence_embedding(text1, word2vec_model)
    embedding2 = get_sentence_embedding(text2, word2vec_model)
    return cosine_similarity([embedding1], [embedding2])[0][0]

# Example sentences
text1 = "This is a positive sentence"
text2 = "This is another positive sentence"
text3 = "This is a negative sentence"

# Calculate similarities
similarity_1_2 = calculate_similarity(text1, text2)
similarity_1_3 = calculate_similarity(text1, text3)
similarity_2_3 = calculate_similarity(text2, text3)

# Print results
print(f"Similarity between '{text1}' and '{text2}': {similarity_1_2:.4f}")
print(f"Similarity between '{text1}' and '{text3}': {similarity_1_3:.4f}")
print(f"Similarity between '{text2}' and '{text3}': {similarity_2_3:.4f}")

Similarity between 'This is a positive sentence' and 'This is another positive sentence': 0.8277
Similarity between 'This is a positive sentence' and 'This is a negative sentence': 0.8526
Similarity between 'This is another positive sentence' and 'This is a negative sentence': 0.6864


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
!pip install transformers sentence-transformers

from transformers import PreTrainedTokenizerFast, BertModel
from sentence_transformers import SentenceTransformer, util
import torch

# Load pre-trained ModernBERT model and tokenizer
model_name = 'answerdotai/ModernBERT-base'  # Changed to ModernBERT
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Or use a sentence transformer model directly for better performance
# model = SentenceTransformer('answerdotai/ModernBERT-base') # Example: ModernBERT-base


def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling of token embeddings
    return embeddings


def calculate_similarity(text1, text2):
    embedding1 = get_bert_embedding(text1)
    embedding2 = get_bert_embedding(text2)

    # Calculate cosine similarity
    similarity_score = util.cos_sim(embedding1, embedding2).item()
    return similarity_score


# Example usage
text1 = "This is a positive sentence."
text2 = "This is another positive sentence."
text3 = "This is a negative sentence."

similarity_1_2 = calculate_similarity(text1, text2)
similarity_1_3 = calculate_similarity(text1, text3)
similarity_2_3 = calculate_similarity(text2, text3)

print(f"Similarity between '{text1}' and '{text2}': {similarity_1_2}")
print(f"Similarity between '{text1}' and '{text3}': {similarity_1_3}")
print(f"Similarity between '{text2}' and '{text3}': {similarity_2_3}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

You are using a model of type modernbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encod

Similarity between 'This is a positive sentence.' and 'This is another positive sentence.': 0.9906951189041138
Similarity between 'This is a positive sentence.' and 'This is a negative sentence.': 0.9898030757904053
Similarity between 'This is another positive sentence.' and 'This is a negative sentence.': 0.9805107116699219


In [None]:
#Qn:2
#Use any text classification dataset to finetune any BERT model by adding a classification head either class 2 or 3 and see the performance for different training parameters. If you have time see how the accuracy is compared to above. (You can use sms_spam dataset with Distilbert if you can’t find any)


In [2]:
pip install pandas transformers datasets scikit-learn


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [1]:
pip install transformers datasets torch




In [12]:
import pandas as pd
from datasets import Dataset

# Prepare the dataset
data = {
    'Review_Text': [
        "This book is amazing!",
        "Not worth the money.",
        "I loved the story!",
        "Poor quality, very disappointed.",
        "The writing was beautiful.",
        "Waste of time, wouldn't recommend.",
        "Fantastic read, highly recommend!",
        "The product was defective."
    ],
    'Sentiment': [1, 0, 1, 0, 1, 0, 1, 0]
}

# Create a pandas dataframe
df = pd.DataFrame(data)

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)


In [13]:
from transformers import BertTokenizer

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["Review_Text"], padding="max_length", truncation=True, max_length=128)

# Apply tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Rename 'Sentiment' to 'labels' as expected by the model
tokenized_datasets = tokenized_datasets.rename_column("Sentiment", "labels")

# Set the format of the dataset for PyTorch
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [16]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load pre-trained BERT model with a classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluate every epoch
    save_strategy="epoch",           # save model every epoch
    load_best_model_at_end=True,     # load the best model when finished training
    metric_for_best_model="accuracy", # use accuracy to select the best model
    report_to="none"                 # Disable Wandb
)


# Define evaluation metrics (accuracy)
from sklearn.metrics import accuracy_score

def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    # Return a dictionary instead of a single value
    return {"accuracy": accuracy_score(p.label_ids, preds)}


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,                  # training arguments
    train_dataset=tokenized_datasets,    # training dataset
    eval_dataset=tokenized_datasets,     # evaluation dataset (same as training here)
    compute_metrics=compute_metrics,     # evaluation metrics
)

# Fine-tune the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.734591,0.5
2,No log,0.734382,0.5
3,No log,0.733934,0.5


TrainOutput(global_step=3, training_loss=0.7617638905843099, metrics={'train_runtime': 260.606, 'train_samples_per_second': 0.092, 'train_steps_per_second': 0.012, 'total_flos': 1578666332160.0, 'train_loss': 0.7617638905843099, 'epoch': 3.0})

In [17]:
# Evaluate the model
trainer.evaluate()


{'eval_loss': 0.7345906496047974,
 'eval_accuracy': 0.5,
 'eval_runtime': 4.8112,
 'eval_samples_per_second': 1.663,
 'eval_steps_per_second': 0.208,
 'epoch': 3.0}