In [1]:
pip install transformers sentence-transformers datasets

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
import time
import datetime
import random
import numpy as np
import pandas as pd

In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [4]:
# Load the English version of the STSB dataset
dataset = load_dataset("stsb_multi_mt", "en")
print(dataset)

Downloading builder script:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.69k [00:00<?, ?B/s]

Downloading and preparing dataset stsb_multi_mt/en (download: 1.02 MiB, generated: 1.06 MiB, post-processed: Unknown size, total: 2.08 MiB) to /root/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Dataset stsb_multi_mt downloaded and prepared to /root/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})


In [5]:
len(dataset['train'])

5749

In [6]:
print("A sample from the STSB dataset's training split:")
for i in range(10):
    print(dataset['train'][i])

A sample from the STSB dataset's training split:
{'sentence1': 'A plane is taking off.', 'sentence2': 'An air plane is taking off.', 'similarity_score': 5.0}
{'sentence1': 'A man is playing a large flute.', 'sentence2': 'A man is playing a flute.', 'similarity_score': 3.799999952316284}
{'sentence1': 'A man is spreading shreded cheese on a pizza.', 'sentence2': 'A man is spreading shredded cheese on an uncooked pizza.', 'similarity_score': 3.799999952316284}
{'sentence1': 'Three men are playing chess.', 'sentence2': 'Two men are playing chess.', 'similarity_score': 2.5999999046325684}
{'sentence1': 'A man is playing the cello.', 'sentence2': 'A man seated is playing the cello.', 'similarity_score': 4.25}
{'sentence1': 'Some men are fighting.', 'sentence2': 'Two men are fighting.', 'similarity_score': 4.25}
{'sentence1': 'A man is smoking.', 'sentence2': 'A man is skating.', 'similarity_score': 0.5}
{'sentence1': 'The man is playing the piano.', 'sentence2': 'The man is playing the guit

In [7]:
df = pd.read_csv('/kaggle/input/eng-train2/eng_train.csv')

In [8]:
df.head()

Unnamed: 0,PairID,Text,Score
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0
3,ENG-train-0003,If he is good looking and has a good personali...,1.0
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0


In [9]:
# Split the 'Text' column and create a DataFrame from the resulting list of lists
split_text = df['Text'].apply(lambda x: x.split('\n', 1)).to_list()
split_text_df = pd.DataFrame(split_text, columns=['sentence1', 'sentence2'])

# Assign the columns from the split_text_df to the original DataFrame (df)
df[['sentence1', 'sentence2']] = split_text_df[['sentence1', 'sentence2']]

In [10]:
df.head()

Unnamed: 0,PairID,Text,Score,sentence1,sentence2
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,"It that happens, just pull the plug.","if that ever happens, just pull the plug."
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,A black dog running through water.,A black dog is running through some water.
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,I've been searchingthe entire abbey for you.,I'm looking for you all over the abbey.
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,"If he's good looking, and a good personality, ..."
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,"She does not hate you, she is just annoyed wit...","She doesn't hate you, she is just annoyed."


In [11]:
df.drop('PairID', axis = 1, inplace = True)
df.drop('Text', axis = 1, inplace = True)
df['similarity_score'] = df['Score'] * 5.0
df.drop('Score', axis=1, inplace = True)

In [12]:
df.head()
#print(len(df))

Unnamed: 0,sentence1,sentence2,similarity_score
0,"It that happens, just pull the plug.","if that ever happens, just pull the plug.",5.0
1,A black dog running through water.,A black dog is running through some water.,5.0
2,I've been searchingthe entire abbey for you.,I'm looking for you all over the abbey.,5.0
3,If he is good looking and has a good personali...,"If he's good looking, and a good personality, ...",5.0
4,"She does not hate you, she is just annoyed wit...","She doesn't hate you, she is just annoyed.",5.0


In [13]:
combined  = pd.read_csv('/kaggle/input/combined/combined.csv')

In [27]:
combined.head()

Unnamed: 0,sentence1,sentence2,similarity_score
0,The young boys are playing outdoors and the ma...,There is no boy playing outdoors and there is ...,3.6
1,A person in a black jacket is doing tricks on ...,A skilled person is riding a bicycle on one wheel,3.4
2,Four children are doing backbends in the gym,Four girls are doing backbends and playing out...,3.8
3,A player is throwing the ball,Two teams are competing in a football match,2.9
4,Five children are standing in front of a woode...,Five children are standing in a wooden hut,4.2


In [14]:
sem = {(row['sentence1'], row['sentence2']): row['similarity_score'] for index, row in df.iterrows()}
com = {(row['sentence1'], row['sentence2']): row['similarity_score'] for index, row in combined.iterrows()}
tr = {(row['sentence1'], row['sentence2']): row['similarity_score'] for row in dataset['train']}

In [15]:
print(len(sem))
print(len(com))
print(len(tr))

5500
16497
5706


In [16]:
result = dict()
counter = dict()

for key, value in list(sem.items())[:]:
    if key in result:
        result[key] += value
        counter[key] += 1
    else:
        result[key] = value
        counter[key] = 1
        
for key, value in list(com.items())[:]:
    if key in result:
        result[key] += value
        counter[key] += 1
    else:
        result[key] = value
        counter[key] = 1
        
for key, value in list(tr.items())[:]:
    if key in result:
        result[key] += value
        counter[key] += 1
    else:
        result[key] = value
        counter[key] = 1
        
print(len(result))

24909


In [17]:
for key, value in list(result.items())[:]:
    result[key] = round(result[key] / (1.00 * counter[key]), 10)

In [18]:
for key, value in list(result.items())[:10]:
    print(key, value)

('It that happens, just pull the plug.', 'if that ever happens, just pull the plug.') 5.0
('A black dog running through water.', 'A black dog is running through some water.') 4.8666667302
("I've been searchingthe entire abbey for you.", "I'm looking for you all over the abbey.") 5.0
('If he is good looking and has a good personality, he might be straight - but is more likely bisexual.', "If he's good looking, and a good personality, he MIGHT be straight, but more likely bi.") 5.0
('She does not hate you, she is just annoyed with you.', "She doesn't hate you, she is just annoyed.") 5.0
('Actor Gazzara dead at 81', 'Actor Ben Gazzara dies at 81') 4.7000000477
("No, I really didn't want New York to win.", "No i didn't want New york to win") 5.0
('I hae no problems with them.', 'lol i have no problems with them.') 5.0
('Your parents do not have to like your boyfriend, you do.', 'your parents dont have to like your bf, you do.') 5.0
('I think Taylor is really cute, but I hate his voice.', '

In [19]:
print(dataset['train'][0]['sentence1'])

"""baal1 = list(train_df['sentence1'])
baal2 = list(train_df['sentence2'])
baal3 = list(train_df['similarity_score'])

baal11 = list(combined['sentence1'])
baal22 = list(combined['sentence2'])
baal33 = list(combined['similarity_score'])

baal1.extend(baal11)
baal2.extend(baal22)
baal3.extend(baal33)"""
#updated_dataset = dataset['train'].map(lambda baal : {'sentence1': + baal1['baal']})

#print(len(baal1))
baal1 = list(result.keys())
baal2 = list(result.values())

A plane is taking off.


In [20]:
baal11 = []
baal12 = []

for i in range(len(baal1)):
    sentence1, sentence2 = baal1[i]
    baal11.append(sentence1)
    baal12.append(sentence2)

print(len(baal11))
print(len(baal12))

24909
24909


In [22]:
kf = pd.DataFrame({
    'sentence1': baal11,
    'sentence2': baal12,
    'similarity_score': baal2
})

kf.head()

Unnamed: 0,sentence1,sentence2,similarity_score
0,"It that happens, just pull the plug.","if that ever happens, just pull the plug.",5.0
1,A black dog running through water.,A black dog is running through some water.,4.866667
2,I've been searchingthe entire abbey for you.,I'm looking for you all over the abbey.,5.0
3,If he is good looking and has a good personali...,"If he's good looking, and a good personality, ...",5.0
4,"She does not hate you, she is just annoyed wit...","She doesn't hate you, she is just annoyed.",5.0


In [71]:
"""import random

partition = 3000

dev_baal1 = baal11[21000:]
#dev_baal1 = random.sample(baal11, partition)
dev_baal2 = baal12[21000:]
#dev_baal2 = random.sample(baal12, partition)
dev_baal3 = baal2[21000:]
#dev_baal3 = random.sample(baal2, partition)

#baal11 = [element for element in baal11 if element not in dev_baal1]

baal11 = baal11[:21000]

#baal12 = [element for element in baal12 if element not in dev_baal2]
baal12 = baal12[:21000]

#baal2 = [element for element in baal2 if element not in dev_baal3]
baal2 = baal2[:21000]"""

'import random\n\npartition = 3000\n\ndev_baal1 = baal11[21000:]\n#dev_baal1 = random.sample(baal11, partition)\ndev_baal2 = baal12[21000:]\n#dev_baal2 = random.sample(baal12, partition)\ndev_baal3 = baal2[21000:]\n#dev_baal3 = random.sample(baal2, partition)\n\n#baal11 = [element for element in baal11 if element not in dev_baal1]\n\nbaal11 = baal11[:21000]\n\n#baal12 = [element for element in baal12 if element not in dev_baal2]\nbaal12 = baal12[:21000]\n\n#baal2 = [element for element in baal2 if element not in dev_baal3]\nbaal2 = baal2[:21000]'

In [72]:
"""print(len(baal11))
print(len(dev_baal1))
for i in range(10):
    print(baal11[i], baal12[i], baal2[i])"""

'print(len(baal11))\nprint(len(dev_baal1))\nfor i in range(10):\n    print(baal11[i], baal12[i], baal2[i])'

In [73]:
"""print(len(dataset['train']))
print(len(baal11))
print(len(baal12))
print(len(baal2))"""

"print(len(dataset['train']))\nprint(len(baal11))\nprint(len(baal12))\nprint(len(baal2))"

In [74]:
"""from datasets import DatasetDict, Dataset

dataset222 = dataset.copy()
#print(dataset222)

updated_train_dataa = []
updated_train_dataa = [{'sentence1': s1, 'sentence2': s2, 'similarity_score': score} for s1, s2, score in zip(baal11, baal12, baal2)]

dataset222['train'] = updated_train_dataa

print(len(dataset222['train']))
print(len(updated_train_dataa))"""

"from datasets import DatasetDict, Dataset\n\ndataset222 = dataset.copy()\n#print(dataset222)\n\nupdated_train_dataa = []\nupdated_train_dataa = [{'sentence1': s1, 'sentence2': s2, 'similarity_score': score} for s1, s2, score in zip(baal11, baal12, baal2)]\n\ndataset222['train'] = updated_train_dataa\n\nprint(len(dataset222['train']))\nprint(len(updated_train_dataa))"

In [75]:
"""for i in range(10):
    print(dataset222['test'][i])
print(len(dataset222['train']))
print(len(dataset222['test']))"""

"for i in range(10):\n    print(dataset222['test'][i])\nprint(len(dataset222['train']))\nprint(len(dataset222['test']))"

In [76]:
"""from datasets import DatasetDict, Dataset

dataset['train'][0]['sentence1']

#baal1 = list(train_df['sentence1'])
#baal2 = list(train_df['sentence2'])
#baal3 = list(train_df['similarity_score'])

#updated_train_data
dataset2 = dataset.copy()
#print(len(dataset['train']))
# Create a new list of dictionaries with updated values
updated_train_data = []
#print(len(updated_train_data))

updated_train_data = [{'sentence1': s1, 'sentence2': s2, 'similarity_score': score} for s1, s2, score in zip(baal11, baal12, baal2)]

#updated_train_data1 = [dataset['train'][i] for i in range(len(dataset['train']))]
#print(len(updated_train_data1))

#updated_train_data1.extend(updated_train_data)
# Assign the updated list back to dataset['train'] 
#dataset2['train'] = updated_train_data
#print(len(dataset2['train']))
#print(len(updated_train_data))

#for i in range(0, 5749):
#    dataset2['train'][i] = dataset['train'][i].copy()

dataset = dataset222.copy()

print(len(dataset['train']))
print(len(dataset['test']))"""

"from datasets import DatasetDict, Dataset\n\ndataset['train'][0]['sentence1']\n\n#baal1 = list(train_df['sentence1'])\n#baal2 = list(train_df['sentence2'])\n#baal3 = list(train_df['similarity_score'])\n\n#updated_train_data\ndataset2 = dataset.copy()\n#print(len(dataset['train']))\n# Create a new list of dictionaries with updated values\nupdated_train_data = []\n#print(len(updated_train_data))\n\nupdated_train_data = [{'sentence1': s1, 'sentence2': s2, 'similarity_score': score} for s1, s2, score in zip(baal11, baal12, baal2)]\n\n#updated_train_data1 = [dataset['train'][i] for i in range(len(dataset['train']))]\n#print(len(updated_train_data1))\n\n#updated_train_data1.extend(updated_train_data)\n# Assign the updated list back to dataset['train'] \n#dataset2['train'] = updated_train_data\n#print(len(dataset2['train']))\n#print(len(updated_train_data))\n\n#for i in range(0, 5749):\n#    dataset2['train'][i] = dataset['train'][i].copy()\n\ndataset = dataset222.copy()\n\nprint(len(dataset

In [77]:
#len(dataset222['train'])

In [78]:
#print(len(dataset['dev']))

In [23]:
sf=pd.read_csv('/kaggle/input/ansdata/eng_dev_with_labels.csv')

split_text = sf['Text'].apply(lambda x: x.split('\n', 1)).to_list()
split_text_sf = pd.DataFrame(split_text, columns=['sentence1', 'sentence2'])

# Assign the columns from the split_text_df to the original DataFrame (df)
sf[['sentence1', 'sentence2']] = split_text_df[['sentence1', 'sentence2']]

sf.drop('PairID', axis = 1, inplace = True)
sf.drop('Text', axis = 1, inplace = True)

sf['similarity_score'] = sf['Score'] * 5.0
sf.drop('Score', axis=1, inplace = True)

sf.head()
#sf.shape
print(sf.shape)
print(kf.shape)

(250, 3)
(24909, 3)


In [24]:
from sklearn.model_selection import train_test_split
train_com_df, dev_df = train_test_split(kf, test_size = 0.2, random_state = 1)

In [25]:
print(len(train_com_df))
print(len(dev_df))

19927
4982


In [82]:
kf=[dev_df,sf]
#kf = sf
dev_df = pd.concat(kf)
dev_df.shape

(5232, 3)

In [83]:
dev_com_df = dev_df

In [84]:
print(train_com_df.shape)
print(dev_com_df.shape)

(19927, 3)
(5232, 3)


In [85]:
print(dataset['train'][0]['sentence1'])

#baal1 = list(train_df['sentence1'])
#baal2 = list(train_df['sentence2'])
#baal3 = list(train_df['similarity_score'])

baal1 = list(train_com_df['sentence1'])
baal2 = list(train_com_df['sentence2'])
baal3 = list(train_com_df['similarity_score'])

#baal1.extend(baal11)
##baal2.extend(baal22)
#baal3.extend(baal33)
#updated_dataset = dataset['train'].map(lambda baal : {'sentence1': + baal1['baal']})

print(len(baal1))

A plane is taking off.
19927


In [86]:
from datasets import DatasetDict, Dataset

dataset['train'][0]['sentence1']

#baal1 = list(train_df['sentence1'])
#baal2 = list(train_df['sentence2'])
#baal3 = list(train_df['similarity_score'])

#updated_train_data
dataset2 = dataset.copy()
#print(len(dataset['train']))
# Create a new list of dictionaries with updated values
updated_train_data = []
#print(len(updated_train_data))

updated_train_data = [{'sentence1': s1, 'sentence2': s2, 'similarity_score': score} for s1, s2, score in zip(baal1, baal2, baal3)]

updated_train_data1 = [dataset['train'][i] for i in range(len(dataset['train']))]
print(len(updated_train_data1))

updated_train_data1.extend(updated_train_data)
# Assign the updated list back to dataset['train']

dataset2['train'] = updated_train_data1
print(len(dataset2['train']))
#print(len(updated_train_data))

#for i in range(0, 5749):
#    dataset2['train'][i] = dataset['train'][i].copy()

dataset['train'] = dataset2['train'].copy()

print(len(dataset['train']))

5749
25676
25676


In [87]:
#baal1 = list(dev_df['sentence1'])
#baal2 = list(dev_df['sentence2'])
#baal3 = list(dev_df['similarity_score'])

baal1 = list(dev_com_df['sentence1'])
baal2 = list(dev_com_df['sentence2'])
baal3 = list(dev_com_df['similarity_score'])

#baal1.extend(baal11)
#baal2.extend(baal22)
#baal3.extend(baal33)


#print(len(baal1))
updated_dev_data = []
# Create a new list of dictionaries with updated values
updated_dev_data = [{'sentence1': s1, 'sentence2': s2, 'similarity_score': score} for s1, s2, score in zip(baal1, baal2, baal3)]
updated_dev_data1 = [dataset['dev'][i] for i in range(len(dataset['dev']))]

#print(len(updated_dev_data))
#print(len(updated_dev_data1))

updated_dev_data1.extend(updated_dev_data)
print(len(updated_dev_data1))
# Assign the updated list back to dataset['train']

dataset2['dev'] = updated_dev_data1
#print(len(updated_dev_data))
#print(len(dataset['dev']))
#for i in range(0,1500):
#    dataset2['dev'][i] = dataset['dev'][i].copy()

dataset['dev'] = dataset2['dev'].copy()
    
print(len(dataset['dev']))

6732
6732


In [88]:
from transformers import RobertaTokenizer

# Use the correct tokenizer class for RoBERTa
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


In [89]:
class STSBDataset(torch.utils.data.Dataset):

    def __init__(self, dataset):
        # Normalize the similarity scores in the dataset
        similarity_scores = [i['similarity_score'] for i in dataset]
        self.normalized_similarity_scores = [i/5.0 for i in similarity_scores]
        self.first_sentences = [i['sentence1'] for i in dataset]
        self.second_sentences = [i['sentence2'] for i in dataset]
        self.concatenated_sentences = [[str(x), str(y)] for x,y in   zip(self.first_sentences, self.second_sentences)]

    def __len__(self):
        return len(self.concatenated_sentences)

    def get_batch_labels(self, idx):
        return torch.tensor(self.normalized_similarity_scores[idx])

    def get_batch_texts(self, idx):
        return tokenizer(self.concatenated_sentences[idx], padding='max_length', max_length=128, truncation=True, return_tensors="pt")

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y



def collate_fn(texts):
    input_ids = texts['input_ids']
    attention_masks = texts['attention_mask']
    features = [{'input_ids': input_id, 'attention_mask': attention_mask}
                for input_id, attention_mask in zip(input_ids, attention_masks)]
    return features

In [90]:
class BertForSTS(torch.nn.Module):

    def __init__(self):
        super(BertForSTS, self).__init__()
        self.bert = models.Transformer('RoBERTa-base', max_seq_length=512)
        self.pooling_layer = models.Pooling(self.bert.get_word_embedding_dimension())
        self.sts_bert = SentenceTransformer(modules=[self.bert, self.pooling_layer])

    def forward(self, input_data):
        output = self.sts_bert(input_data)['sentence_embedding']
        return output

In [91]:
# Instantiate the model and move it to GPU
model = BertForSTS()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at RoBERTa-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSTS(
  (bert): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  )
)

In [92]:
class CosineSimilarityLoss(torch.nn.Module):

    def __init__(self,  loss_fn=torch.nn.MSELoss(), transform_fn=torch.nn.Identity()):
        super(CosineSimilarityLoss, self).__init__()
        self.loss_fn = loss_fn
        self.transform_fn = transform_fn
        self.cos_similarity = torch.nn.CosineSimilarity(dim=1)

    def forward(self, inputs, labels):
        emb_1 = torch.stack([inp[0] for inp in inputs])
        emb_2 = torch.stack([inp[1] for inp in inputs])
        outputs = self.transform_fn(self.cos_similarity(emb_1, emb_2))
        return self.loss_fn(outputs, labels.squeeze())

In [93]:
train_ds = STSBDataset(dataset['train'])
val_ds = STSBDataset(dataset['dev'])

# Create a 90-10 train-validation split.
train_size = len(train_ds)
val_size = len(val_ds)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

25,676 training samples
6,732 validation samples


In [94]:
train_ds[0]

({'input_ids': tensor([[   0,  250, 3286,   16,  602,  160,    4,    2,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1],
         [   0, 4688,  935, 3286,   16,  602,  160,    4,    2,    1,    1,    1,
             1,    1,    

In [95]:
batch_size = 8

train_dataloader = DataLoader(
            train_ds,  # The training samples.
            num_workers = 4,
            batch_size = batch_size, # Use this batch size.
            shuffle=True # Select samples randomly for each batch
        )

validation_dataloader = DataLoader(
            val_ds,
            num_workers = 4,
            batch_size = batch_size # Use the same batch size
        )

In [96]:
optimizer = AdamW(model.parameters(),
                  lr = 1e-6)
epochs = 12
# Total number of training steps is [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)



In [97]:
# Define format_time function
import time

def format_time(elapsed):
    """
    Takes a time in seconds and returns a string hh:mm:ss
    """
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(time.strftime("%H:%M:%S", time.gmtime(elapsed_rounded)))

In [98]:
def train():
  seed_val = 42
  criterion = CosineSimilarityLoss()
  criterion = criterion.cuda()
  random.seed(seed_val)
  torch.manual_seed(seed_val)
  # We'll store a number of quantities such as training and validation loss,
  # validation accuracy, and timings.
  training_stats = []
  total_t0 = time.time()
  for epoch_i in range(0, epochs):
      t0 = time.time()
      total_train_loss = 0
      model.train()
      # For each batch of training data...
      for train_data, train_label in tqdm(train_dataloader):
          train_data['input_ids'] = train_data['input_ids'].to(device)
          train_data['attention_mask'] = train_data['attention_mask'].to(device)
          train_data = collate_fn(train_data)
          model.zero_grad()
          output = [model(feature) for feature in train_data]
          loss = criterion(output, train_label.to(device))
          total_train_loss += loss.item()
          loss.backward()
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          optimizer.step()
          scheduler.step()

      # Calculate the average loss over all of the batches.
      avg_train_loss = total_train_loss / len(train_dataloader)
      # Measure how long this epoch took.
      training_time = format_time(time.time() - t0)
      t0 = time.time()
      model.eval()
      total_eval_accuracy = 0
      total_eval_loss = 0
      nb_eval_steps = 0
      # Evaluate data for one epoch
      for val_data, val_label in tqdm(validation_dataloader):
          val_data['input_ids'] = val_data['input_ids'].to(device)
          val_data['attention_mask'] = val_data['attention_mask'].to(device)
          val_data = collate_fn(val_data)
          with torch.no_grad():
              output = [model(feature) for feature in val_data]
          loss = criterion(output, val_label.to(device))
          total_eval_loss += loss.item()
      # Calculate the average loss over all of the batches.
      avg_val_loss = total_eval_loss / len(validation_dataloader)
      # Measure how long the validation run took.
      validation_time = format_time(time.time() - t0)
      # Record all statistics from this epoch.
      training_stats.append(
          {
              'epoch': epoch_i + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Training Time': training_time,
              'Validation Time': validation_time
          }
      )
  return model, training_stats

# Launch the training
model, training_stats = train()


100%|██████████| 3210/3210 [17:40<00:00,  3.03it/s]
100%|██████████| 842/842 [01:26<00:00,  9.76it/s]
100%|██████████| 3210/3210 [17:39<00:00,  3.03it/s]
100%|██████████| 842/842 [01:25<00:00,  9.79it/s]
100%|██████████| 3210/3210 [17:31<00:00,  3.05it/s]
100%|██████████| 842/842 [01:25<00:00,  9.84it/s]
100%|██████████| 3210/3210 [17:30<00:00,  3.06it/s]
100%|██████████| 842/842 [01:25<00:00,  9.81it/s]
100%|██████████| 3210/3210 [17:34<00:00,  3.04it/s]
100%|██████████| 842/842 [01:26<00:00,  9.77it/s]
100%|██████████| 3210/3210 [17:41<00:00,  3.02it/s]
100%|██████████| 842/842 [01:25<00:00,  9.83it/s]
100%|██████████| 3210/3210 [17:37<00:00,  3.03it/s]
100%|██████████| 842/842 [01:26<00:00,  9.76it/s]
100%|██████████| 3210/3210 [17:38<00:00,  3.03it/s]
100%|██████████| 842/842 [01:26<00:00,  9.76it/s]
100%|██████████| 3210/3210 [17:41<00:00,  3.02it/s]
100%|██████████| 842/842 [01:25<00:00,  9.81it/s]
100%|██████████| 3210/3210 [17:39<00:00,  3.03it/s]
100%|██████████| 842/842 [01:2

In [99]:
# Create a DataFrame from our training statistics
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index
df_stats = df_stats.set_index('epoch')

# Display the table
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.038794,0.030792,00:17:40,00:01:26
2,0.021645,0.026976,00:17:40,00:01:26
3,0.018398,0.025044,00:17:31,00:01:26
4,0.016545,0.023949,00:17:31,00:01:26
5,0.015097,0.023466,00:17:35,00:01:26
6,0.014061,0.023109,00:17:41,00:01:26
7,0.013276,0.02248,00:17:38,00:01:26
8,0.012533,0.022392,00:17:39,00:01:26
9,0.012047,0.022405,00:17:41,00:01:26
10,0.011619,0.02203,00:17:40,00:01:27


In [126]:
# load the test set
test_dataset = load_dataset("stsb_multi_mt", name="en", split="test")


test_df = pd.read_csv('/kaggle/input/test-data/eng_test.csv')
#print(test_dataset)

test_df[['sentence1', 'sentence2']] = test_df['Text'].apply(lambda x: x.split('\n', 1)).to_list()
test_df.drop('Text', axis=1, inplace=True)
#dev_df.drop('PairID', axis=1, inplace=True)
test_df['similarity_score'] = 0.0
baal1 = list(test_df['sentence1'])
baal2 = list(test_df['sentence2'])
baal3 = list(test_df['similarity_score'])
# Create a new list of dictionaries with updated values
updated_train_data = [{'sentence1': s1, 'sentence2': s2, 'similarity_score': s3} for s1, s2, s3 in zip(baal1, baal2, baal3)]

# Assign the updated list back to dataset['train']
test_dataset = updated_train_data


# Prepare the data
first_sent = [i['sentence1'] for i in test_dataset]
second_sent = [i['sentence2'] for i in test_dataset]
full_text = [[str(x), str(y)] for x,y in zip(first_sent, second_sent)]

print(len(full_text))

model.eval()

"""def predict_similarity(sentence_pair):
  test_input = tokenizer(sentence_pair, padding='max_length', max_length = 128, truncation=True, return_tensors="pt").to(device)
  test_input['input_ids'] = test_input['input_ids']
  test_input['attention_mask'] = test_input['attention_mask']

  del test_input['token_type_ids']
  output = model(test_input)
  sim = torch.nn.functional.cosine_similarity(output[0], output[1], dim=0).item()
  return sim"""

def predict_similarity(sentence_pair):
    test_input = tokenizer(sentence_pair, padding='max_length', max_length=128, truncation=True, return_tensors="pt").to(device)
    
    # Ensure 'token_type_ids' is not present for RoBERTa
    if 'token_type_ids' in test_input:
        del test_input['token_type_ids']

    output = model(test_input)
    sim = torch.nn.functional.cosine_similarity(output[0], output[1], dim=0).item()
    return sim

2600


In [127]:
example_1 = full_text[0]
print(f"Sentence 1: {example_1[0]}")
print(f"Sentence 2: {example_1[1]}")
print(f"Predicted similarity score: {round(predict_similarity(example_1), 2)}")

Sentence 1: Egypt's Brotherhood stands ground after killings
Sentence 2: Egypt: Muslim Brotherhood Stands Behind Morsi
Predicted similarity score: 0.64


In [128]:
scor_list = []
for i in full_text:
    score = round(predict_similarity(i), 2)
    if(score<0):
        score = 0.00
#     print(score)
    scor_list.append(score)

In [129]:
test_df.drop('sentence1', axis=1, inplace=True)
test_df.head()

Unnamed: 0,PairID,sentence2,similarity_score
0,ENG-test-0000,Egypt: Muslim Brotherhood Stands Behind Morsi,0.0
1,ENG-test-0001,"Install the program, which is free to download...",0.0
2,ENG-test-0002,Pretty much the first thing people mentioned w...,0.0
3,ENG-test-0003,You can watch The Wiggles every day on Nick JR.,0.0
4,ENG-test-0004,My 13-year-old son recommended this book to me...,0.0


In [130]:
test_df.drop('sentence2', axis=1, inplace=True)
test_df.drop('similarity_score', axis=1, inplace=True)

In [131]:
test_df.head()

Unnamed: 0,PairID
0,ENG-test-0000
1,ENG-test-0001
2,ENG-test-0002
3,ENG-test-0003
4,ENG-test-0004


In [132]:
test_df['Pred_Score'] = scor_list
test_df.head()

Unnamed: 0,PairID,Pred_Score
0,ENG-test-0000,0.64
1,ENG-test-0001,0.82
2,ENG-test-0002,0.37
3,ENG-test-0003,0.02
4,ENG-test-0004,0.31


In [133]:
test_df.to_csv('/kaggle/working/pred_eng_a.csv', index=False)