In [1]:
from sklearn.utils import shuffle
import random
from nltk.corpus import wordnet

In [9]:
import nltk

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/interIIT/cgptsamples-1.csv')

In [15]:
df

Unnamed: 0,premise,hypothesis,label
0,A team of engineers is developing a smart irri...,Efficient irrigation technology.,1
1,angstrom group of writer be collaborate on ang...,Children's literature for sociable values.,1
2,angstrom parent be teaching their child how to...,swimming direction be underway.,1
3,angstrom group of environmental militant be fo...,plastic decrease advocacy.,1
4,angstrom checkup research team be investigatio...,root cell cardiac therapy research.,1
...,...,...,...
629,A group of musicians is composing a symphony t...,Rainforest-inspired symphonic composition.,1
630,angstrom group of research_worker be perusal t...,nature sound therapy research.,1
631,A team of researchers is investigating the pot...,Biodegradable plastic research.,1
632,angstrom renewable energy cooperative be insta...,community solar power initiative.,1


In [5]:
def build_synonym_dict(text_list):
    synonym_dict = {}
    for text in text_list:
        for word in text.split():
            if word not in synonym_dict:
                synonyms = wordnet.synsets(word)
                if len(synonyms) > 0 and len(synonyms[0].lemmas()) > 0:
                    synonym_dict[word] = synonyms[0].lemmas()[0].name()
    return synonym_dict

def synonym_replace(sentence, synonym_dict):
    words = sentence.split()
    new_sentence = ""
    for word in words:
        new_sentence += synonym_dict.get(word, word) + " "
    return new_sentence.strip()

In [10]:
synonym_dict = build_synonym_dict(df['premise'].tolist() + df['hypothesis'].tolist())

In [11]:
augmented_data = []
for index, row in df.iterrows():
    premise, hypothesis = row['premise'], row['hypothesis']

    # Synonym replacement
    new_premise_syn = synonym_replace(premise, synonym_dict)
    new_hypothesis_syn = synonym_replace(hypothesis, synonym_dict)

    augmented_data.append((new_premise_syn, new_hypothesis_syn, row['label']))

In [12]:
augmented_df = pd.DataFrame(augmented_data, columns=['premise', 'hypothesis', 'label'])
df = pd.concat([df, augmented_df])

In [13]:
df = shuffle(df).reset_index(drop=True)

In [14]:
df

Unnamed: 0,premise,hypothesis,label
0,A team of engineers is developing a smart irri...,Efficient irrigation technology.,1
1,angstrom group of writer be collaborate on ang...,Children's literature for sociable values.,1
2,angstrom parent be teaching their child how to...,swimming direction be underway.,1
3,angstrom group of environmental militant be fo...,plastic decrease advocacy.,1
4,angstrom checkup research team be investigatio...,root cell cardiac therapy research.,1
...,...,...,...
629,A group of musicians is composing a symphony t...,Rainforest-inspired symphonic composition.,1
630,angstrom group of research_worker be perusal t...,nature sound therapy research.,1
631,A team of researchers is investigating the pot...,Biodegradable plastic research.,1
632,angstrom renewable energy cooperative be insta...,community solar power initiative.,1


In [18]:
df.to_csv('/content/drive/MyDrive/interIIT/df_aug.csv', index=False)

## Adverserial sampling

In [1]:
!pip install transformers



In [2]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import numpy as np

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
df = pd.read_csv("/content/drive/MyDrive/interIIT/df_aug.csv")

In [5]:
df

Unnamed: 0,premise,hypothesis,label
0,A team of engineers is developing a smart irri...,Efficient irrigation technology.,1
1,angstrom group of writer be collaborate on ang...,Children's literature for sociable values.,1
2,angstrom parent be teaching their child how to...,swimming direction be underway.,1
3,angstrom group of environmental militant be fo...,plastic decrease advocacy.,1
4,angstrom checkup research team be investigatio...,root cell cardiac therapy research.,1
...,...,...,...
629,A group of musicians is composing a symphony t...,Rainforest-inspired symphonic composition.,1
630,angstrom group of research_worker be perusal t...,nature sound therapy research.,1
631,A team of researchers is investigating the pot...,Biodegradable plastic research.,1
632,angstrom renewable energy cooperative be insta...,community solar power initiative.,1


In [6]:
train_df, test_df = train_test_split(df, test_size=0.2)
train_texts = train_df['premise'].tolist()
train_hypotheses = train_df['hypothesis'].tolist()
train_labels = train_df['label'].tolist()

In [27]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_texts, train_hypotheses, truncation=True, padding=True)

AttributeError: ignored

In [28]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]).to(device) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(device)
        return item

    def __len__(self):
        return len(self.labels)

In [25]:
train_dataset = MyDataset(train_encodings, train_labels)

In [29]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [11]:
!pip install transformers[torch]



In [12]:
!pip install accelerate -U



In [40]:
training_args = TrainingArguments(
    per_device_train_batch_size=16,
    num_train_epochs=1,
    logging_dir='./logs',
    output_dir='./output',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

RuntimeError: ignored

In [32]:
def generate_adversarial_samples(model, text, hypothesis):
    inputs = tokenizer(text, hypothesis, return_tensors="pt", truncation=True, padding=True)

    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    outputs = model(**inputs)
    logits = outputs.logits
    pred = torch.argmax(logits).item()

    if pred == 1:
        words = text.split()
        if len(words) > 1:
            idx = np.random.randint(0, len(words) - 1)
            words[idx], words[idx + 1] = words[idx + 1], words[idx]
            new_text = " ".join(words)
            return new_text, hypothesis, 0

    return None, None, None

In [33]:
adversarial_data = []
for text, hypothesis in zip(train_texts, train_hypotheses):
    new_text, new_hypothesis, new_label = generate_adversarial_samples(model, text, hypothesis)
    if new_text:
        adversarial_data.append((new_text, new_hypothesis, new_label))


In [34]:
if adversarial_data:
    adversarial_df = pd.DataFrame(adversarial_data, columns=['premise', 'hypothesis', 'label'])
    train_df = pd.concat([train_df, adversarial_df])

In [39]:
train_df

Unnamed: 0,premise,hypothesis,label
33,angstrom team of research_worker be conducting...,mindfulness research.,1
290,angstrom team of psychologist be research the ...,Nature-based mental health research.,1
298,A team of software developers is creating a la...,AI-powered language education.,1
565,row of book are neatly arrange on the library ...,The library hour_angle form bookshelves.,1
147,A team of researchers is studying the potentia...,Algae-based carbon capture research.,1
...,...,...,...
502,angstrom team of archeologist be carefully exc...,archaeological discovery.,0
503,A is mechanic repairing a car's engine.,Vehicle maintenance is taking place.,0
504,Children riding are on a carousel at the fair.,Kids are enjoying the fair rides.,0
505,angstrom renewable energy startup development ...,geothermal energy technology innovation.,0


In [42]:
train_df.to_csv('/content/drive/MyDrive/interIIT/train_df.csv',index=False)