In [1]:
import pandas as pd
import numpy as np
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, AutoModel
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm
import os
from huggingface_hub import login
import datasets
login()
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
sent_essays = datasets.load_dataset("AnkushRaut216/test_essays")

Downloading readme:   0%|          | 0.00/307 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/297k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/173 [00:00<?, ? examples/s]

In [3]:
sent_essays_pd = sent_essays['train'].to_pandas()
sentences = sum([sent_tokenize(sent_essays_pd['text'][a]) for a in range(len(sent_essays_pd))], [])
len(sentences)

4588

## Generating paraphrases 1

In [4]:
tokenizer = PegasusTokenizer.from_pretrained("tuner007/pegasus_paraphrase")
model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase",
                                                    cache_dir = '/scratch/alpine/anra7539').to("cuda")


def paraphrase_text(text, paraphrase_limit = 30):
  input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True).to("cuda")
  paraphrase_ids = model.generate(input_ids, max_length=paraphrase_limit, min_length=5, num_beams=2,
                                length_penalty=1.0, early_stopping=True)
  paraphrase = tokenizer.decode(paraphrase_ids[0], skip_special_tokens=True)

  return paraphrase

  return self.fget.__get__(instance, owner)()
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
paraphrases = []
for sentence in tqdm(sentences):
    paraphrases.append(paraphrase_text(sentence))

100%|██████████| 4588/4588 [18:11<00:00,  4.21it/s]


In [6]:
sentence_df = pd.DataFrame({"original_sentence":sentences, "paraphrases":paraphrases})

## Generating paraphrases 2

In [7]:
tokenizer = PegasusTokenizer.from_pretrained("tuner007/pegasus_paraphrase")
model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase",
                                                    cache_dir = '/scratch/alpine/anra7539').to("cuda")


def paraphrase_text2(text, paraphrase_limit = 30):
  input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True).to("cuda")
  paraphrase_ids = model.generate(input_ids, max_length=paraphrase_limit, min_length=5, num_beams=4,
                                  do_sample = True, length_penalty=1.0, early_stopping=True)
  paraphrase = tokenizer.decode(paraphrase_ids[0], skip_special_tokens=True)

  return paraphrase

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
paraphrases2 = []
for sentence in tqdm(sentences):
    paraphrases2.append(paraphrase_text2(sentence))

100%|██████████| 4588/4588 [20:32<00:00,  3.72it/s]


In [9]:
sentence_df2 = pd.DataFrame({"original_sentence":sentences, "paraphrases":paraphrases2})
final_df = pd.concat([sentence_df, sentence_df2], ignore_index = True)
datasets.Dataset.from_pandas(final_df).push_to_hub("AnkushRaut216/llm_generated_sentences_data_final", private = True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/AnkushRaut216/llm_generated_sentences_data_final/commit/11c45d15c5159a93a695e4485e09754e999c9100', commit_message='Upload dataset', commit_description='', oid='11c45d15c5159a93a695e4485e09754e999c9100', pr_url=None, pr_revision=None, pr_num=None)

### Contrastive learning df creation

In [4]:
cl_data = datasets.load_dataset("AnkushRaut216/llm_generated_sentences_data_final")
cl_data_pd = cl_data['train'].to_pandas()

cl_data_pd['label'] = [1]*len(cl_data_pd)

unique_sentences = cl_data_pd.iloc[:int(len(cl_data_pd)/2),:].original_sentence

Downloading readme:   0%|          | 0.00/327 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9176 [00:00<?, ? examples/s]

In [3]:
cl_data_pd.rename(columns = {'paraphrases':'sentence2'}, inplace = True)

In [4]:
negative_sentences1 = []
negative_sentences2 = []
negative_sentences3 = []
for sent in unique_sentences:
    negative_sentences1.append(np.random.choice(unique_sentences[unique_sentences!= sent]))
    negative_sentences2.append(np.random.choice(unique_sentences[unique_sentences!= sent]))
    negative_sentences3.append(np.random.choice(unique_sentences[unique_sentences!= sent]))

In [5]:
negative_df1 = pd.DataFrame({'original_sentence':unique_sentences, 'sentence2':negative_sentences1})
negative_df2 = pd.DataFrame({'original_sentence':unique_sentences, 'sentence2':negative_sentences2})
negative_df3 = pd.DataFrame({'original_sentence':unique_sentences, 'sentence2':negative_sentences3})

negative_df = pd.concat([negative_df1, negative_df2, negative_df3], ignore_index = True)

In [6]:
negative_df['label'] = [0]*len(negative_df)
full_cl_df = pd.concat([cl_data_pd, negative_df], ignore_index = True)

In [7]:
print(full_cl_df.shape)
full_cl_df.drop_duplicates(inplace = True)
print(full_cl_df.shape)

(22940, 3)
(20686, 3)


In [8]:
full_cl_df.label.value_counts()

label
0    13759
1     6927
Name: count, dtype: int64

In [9]:
full_cl_df.reset_index(drop = True, inplace = True)

In [10]:
datasets.Dataset.from_pandas(full_cl_df).push_to_hub("AnkushRaut216/full_cl_data", private = True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/359 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/359 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.63M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20686 [00:00<?, ? examples/s]

## Contrastive Learning

In [2]:
full_cl_df = datasets.load_dataset("AnkushRaut216/full_cl_data")['train'].to_pandas()
train, val = train_test_split(full_cl_df, test_size = 0.25, stratify = full_cl_df.label, random_state = 2024)
train.reset_index(drop = True, inplace = True)
val.reset_index(drop = True, inplace = True)

In [3]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder = "/scratch/alpine/anra7539").to("cuda")

class ContrastiveLoss(torch.nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, embeddings1, embeddings2, labels):
        euclidean_distance = torch.nn.functional.pairwise_distance(embeddings1, embeddings2, keepdim=True)
        loss_contrastive = torch.mean((1-labels) * torch.pow(euclidean_distance, 2) +
                                      (labels) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss_contrastive

criterion = ContrastiveLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
train_dataset = [[train.original_sentence[i], train.sentence2[i], train.label[i]] for i in range(len(train))]
val_dataset = [[val.original_sentence[i], val.sentence2[i], val.label[i]] for i in range(len(val))]

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

early_stopping_rounds = 5  
best_validation_loss = float('inf')
no_improvement_count = 0

num_epochs = 50
num_training_steps = num_epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

  0%|          | 0/48500 [00:00<?, ?it/s]

In [4]:
for epoch in range(num_epochs):
    for batch in train_dataloader:
        sentences1, sentences2, labels = batch

        embeddings1 = torch.tensor(model.encode(sentences1), requires_grad = True).to("cuda")
        embeddings2 = torch.tensor(model.encode(sentences2), requires_grad = True).to("cuda")
        labels = labels.to("cuda")

        loss = criterion(embeddings1, embeddings2, labels.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        progress_bar.update(1)

    model.eval()
    with torch.no_grad():
        val_loss = 0
        for batch in val_dataloader:
            sentences1, sentences2, labels = batch

            embeddings1 = torch.tensor(model.encode(sentences1), requires_grad = False).to("cuda")
            embeddings2 = torch.tensor(model.encode(sentences2), requires_grad = False).to("cuda")
            labels = labels.to("cuda")

            val_loss+=criterion(embeddings1, embeddings2, labels.float())
            
        if val_loss < best_validation_loss:
            best_validation_loss = val_loss
            no_improvement_count = 0
            
            model.save('/scratch/alpine/anra7539/contrastive_learning_model/best_model')
        else:
            no_improvement_count += 1


        if no_improvement_count >= early_stopping_rounds:
            print(f'Early stopping after {epoch+1} epochs with no improvement.')
            break

        

 20%|█▉        | 9699/48500 [02:41<08:17, 77.93it/s]  

Early stopping after 10 epochs with no improvement.
