In [1]:
import pandas as pd
import numpy as np
import transformers
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm
import os
from nltk.tokenize import sent_tokenize
import nltk
import json
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/anra7539/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# data = []
# file_name = []

# for file in os.listdir('/projects/anra7539/projects/big_data/transcript_summaries'):
#     if file != '.ipynb_checkpoints':
#         output_file = '/projects/anra7539/projects/big_data/transcript_summaries/'+file
#         file_name.append(file)
#         with open(output_file, 'r') as f:
#             data.append(pd.DataFrame([json.loads(line) for line in f]))

## Generating paraphrases 1

In [3]:
# tokenizer = transformers.T5Tokenizer.from_pretrained("google/flan-t5-large")
# model = transformers.T5ForConditionalGeneration.from_pretrained("google/flan-t5-large",
#                                                                cache_dir = "/scratch/alpine/anra7539").to("cuda")

# def paraphrase_text(text, paraphrase_limit = 500):
#   input_ids = tokenizer.encode(text, 
#                                return_tensors="pt", 
#                                truncation=True,
#                               max_length = 1024).to("cuda")
#   paraphrase_ids = model.generate(input_ids, 
#                                   max_length=paraphrase_limit,
#                                   num_beams=4,
#                                   length_penalty=1.1,
#                                   do_sample = True)
#   paraphrase = tokenizer.decode(paraphrase_ids[0], skip_special_tokens=True)

#   return paraphrase

In [4]:
# for df in data:
#     paraphrases = []
#     for summary in tqdm(df.Summary):
#         paraphrases.append(paraphrase_text(f"Paraphrase this text: {summary}"))
#     df["paraphrased_Summary"] = paraphrases

In [5]:
# for i in range(len(data)):
#     data[i].to_csv(f"/projects/anra7539/projects/big_data/transcript_summaries_with_paraphrases/{file_name[i].split('.json')[0]}.csv",
#                    index = False)

### Contrastive learning df creation

In [6]:
# Load each per-file CSV of summaries and their paraphrases into its own DataFrame.
paraphrase_dir = '/projects/anra7539/projects/big_data/transcript_summaries_with_paraphrases'

# os.listdir returns entries in arbitrary order and also lists directories such as
# Jupyter's `.ipynb_checkpoints`; keep only CSV files and sort them so the position
# of each frame in `data` (which later drives seeded sampling) is reproducible.
csv_files = sorted(name for name in os.listdir(paraphrase_dir) if name.endswith('.csv'))

data = [pd.read_csv(os.path.join(paraphrase_dir, name)) for name in csv_files]

In [7]:
# Build the contrastive pairs. Every original (Summary, paraphrased_Summary) row is a
# positive pair (label 1). For each file, three times as many negative pairs (label 0)
# are added by pairing its summaries with summaries sampled from the *other* files.
#
# NOTE: this cell overwrites the entries of `data`; re-run the loading cell before
# running it a second time, otherwise negatives are generated from already-augmented frames.
NEGATIVES_PER_POSITIVE = 3
SAMPLING_SEED = 2025

assert len(data) > 1, "negative pairs are sampled from other files; need at least two files"

# Snapshot the untouched frames so the negative pool never contains rows that were
# appended for a previously processed file (which would over-weight those files).
original_frames = [df.copy() for df in data]

# Iterate over every loaded file instead of a hard-coded count.
for i, primary_df in enumerate(original_frames):
    secondary_df = pd.concat(
        [df for j, df in enumerate(original_frames) if j != i],
        ignore_index=True,
    )

    num_label_0 = len(primary_df) * NEGATIVES_PER_POSITIVE
    sampled_summaries = secondary_df.sample(
        num_label_0,
        random_state=SAMPLING_SEED,
        # fall back to sampling with replacement only when the pool is too small
        replace=len(secondary_df) < num_label_0,
    )['Summary'].tolist()

    # Anchor rows are drawn with replacement because 3x more rows than exist are needed.
    label_0_df = primary_df.sample(n=num_label_0, random_state=SAMPLING_SEED, replace=True).copy()
    label_0_df['paraphrased_Summary'] = sampled_summaries
    label_0_df['label'] = 0

    label_1_df = primary_df.assign(label=1)

    data[i] = pd.concat([label_1_df, label_0_df], ignore_index=True)

In [8]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [9]:
# Stack the per-file pair tables into one frame with a fresh 0..n-1 index.
cl_data = pd.concat(objs=data, axis=0, ignore_index=True)

In [10]:
# The second text of each pair lives in 'Summary2' from here on.
cl_data = cl_data.rename(columns={'paraphrased_Summary': 'Summary2'})

In [11]:
# Class balance of the pair labels.
cl_data['label'].value_counts()

0    1908
1     636
Name: label, dtype: int64

## Contrastive Learning

In [12]:
# Hold out a quarter of the pairs for validation, stratified on the pair label.
full_cl_df = cl_data.copy()

# Both splits get a fresh 0..n-1 index because later cells look rows up by integer label.
train, val = (
    part.reset_index(drop=True)
    for part in train_test_split(
        full_cl_df,
        test_size=0.25,
        stratify=full_cl_df.label,
        random_state=2024,
    )
)

In [13]:
# Load the pretrained sentence encoder (cached under scratch) and place it on the GPU.
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    cache_folder="/scratch/alpine/anra7539",
)
model = model.to("cuda")

class ContrastiveLoss(torch.nn.Module):
    """Pairwise contrastive loss on the Euclidean distance between two embeddings.

    Label convention follows the pairs built in this notebook: ``1`` marks a
    similar pair (a summary and its paraphrase), which is pulled together;
    ``0`` marks a dissimilar pair, which is pushed apart until the two
    embeddings are at least ``margin`` away from each other.

    Args:
        margin: distance beyond which a dissimilar pair contributes no loss.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, embeddings1, embeddings2, labels):
        """Return the mean contrastive loss over a batch.

        Args:
            embeddings1: tensor of first-sentence embeddings, one row per pair.
            embeddings2: tensor of second-sentence embeddings, same shape as ``embeddings1``.
            labels: one value per pair (1 = similar, 0 = dissimilar); may be
                shaped ``(batch,)`` or ``(batch, 1)``.

        Returns:
            Scalar tensor with the batch-mean loss.
        """
        # One distance per pair, shape (batch,). Keeping a trailing dimension here
        # would broadcast against (batch,) labels into a (batch, batch) matrix and
        # mix every label with every distance.
        distance = torch.nn.functional.pairwise_distance(embeddings1, embeddings2)
        labels = labels.reshape(-1).to(distance.dtype)

        # Similar pairs are penalised by their squared distance ...
        pull_together = labels * distance.pow(2)
        # ... dissimilar pairs only while they are closer than the margin.
        push_apart = (1 - labels) * torch.clamp(self.margin - distance, min=0.0).pow(2)

        return torch.mean(pull_together + push_apart)

# Loss and optimiser for fine-tuning the sentence encoder.
criterion = ContrastiveLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


def _as_pair_list(frame):
    """Turn a pair table into a list of [Summary, Summary2, label] rows for the DataLoader."""
    return [
        [frame.at[row, 'Summary'], frame.at[row, 'Summary2'], frame.at[row, 'label']]
        for row in range(len(frame))
    ]


train_dataset = _as_pair_list(train)
val_dataset = _as_pair_list(val)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

# Early-stopping bookkeeping: stop once validation loss has not improved for this many epochs.
early_stopping_rounds = 10
no_improvement_count = 0
best_validation_loss = float('inf')

# Upper bound on training length; the progress bar counts optimiser steps.
num_epochs = 100
num_training_steps = len(train_dataloader) * num_epochs
progress_bar = tqdm(range(num_training_steps))

  0%|          | 0/12000 [00:00<?, ?it/s]

In [14]:
# Fine-tune `model` on the contrastive pairs, with early stopping on the validation loss.

# Run on whichever device the model currently lives on.
device = next(model.parameters()).device


def _embed(sentences):
    """Embed a batch of sentences with `model` while keeping the autograd graph.

    `model.encode` is an inference helper: its output is detached from the model,
    so a loss computed on it cannot produce gradients for the model parameters.
    Tokenizing and calling the model directly keeps the embeddings connected to
    the weights being optimised.
    """
    features = model.tokenize(list(sentences))
    features = {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in features.items()
    }
    return model(features)['sentence_embedding']


for epoch in range(num_epochs):
    # Re-enable training mode every epoch; validation below switches to eval mode.
    model.train()
    for batch in train_dataloader:
        sentences1, sentences2, labels = batch

        embeddings1 = _embed(sentences1)
        embeddings2 = _embed(sentences2)
        # labels: 1 = summary/paraphrase pair, 0 = mismatched pair.
        # NOTE(review): confirm `criterion` treats label 1 as the "similar" class.
        labels = labels.to(device)

        loss = criterion(embeddings1, embeddings2, labels.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        progress_bar.update(1)

    # Validation pass: no gradients, summed batch losses as a plain float.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            sentences1, sentences2, labels = batch

            embeddings1 = _embed(sentences1)
            embeddings2 = _embed(sentences2)
            labels = labels.to(device)

            val_loss += criterion(embeddings1, embeddings2, labels.float()).item()

    if val_loss < best_validation_loss:
        best_validation_loss = val_loss
        no_improvement_count = 0

        # Keep the best checkpoint on disk; later epochs may be worse.
        model.save('/scratch/alpine/anra7539/ml_specific_cl_finetuned/best_model')
    else:
        no_improvement_count += 1

    if no_improvement_count >= early_stopping_rounds:
        print(f'Early stopping after {epoch+1} epochs with no improvement.')
        break

        

 19%|█▉        | 2278/12000 [02:30<08:07, 19.92it/s]  

Early stopping after 19 epochs with no improvement.


In [22]:
from huggingface_hub import HfApi, login

# Interactive Hugging Face authentication (prompts for a token with write permission).
login()

# After early stopping the in-memory `model` holds the weights of the last epoch, which
# is `early_stopping_rounds` epochs past the best one. Publish the best checkpoint that
# the training loop saved to disk instead.
best_model = SentenceTransformer('/scratch/alpine/anra7539/ml_specific_cl_finetuned/best_model')

best_model.push_to_hub("AnkushRaut216/Contrastive-Finetuned-for-AI-all-MiniLM-L6-V2", exist_ok = True)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/anra7539/.cache/huggingface/token
Login successful


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

'https://huggingface.co/AnkushRaut216/Contrastive-Finetuned-for-AI-all-MiniLM-L6-V2/commit/dc7293fd65c1d6b651006c71aca5f2bd8f96bbee'