In [None]:
# Imports needed by the data-loading and data-conversion cells.
import json
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/Drive; the dataset paths below live under it.
drive.mount("/content/Drive")

Mounted at /content/Drive


In [None]:
# Load the four JSON splits (CL / IT, train / test).
# Each file is read inside a `with` block so its handle is closed even when
# json.load raises on malformed input (the manual open/close pairs leaked the
# handles in that case).
with open("/content/Drive/MyDrive/Technical/RR/Data/CL_train.json", "r") as f_train_CL:
  data_tr_CL = json.load(f_train_CL)

with open("/content/Drive/MyDrive/Technical/RR/Data/CL_test.json", "r") as f_test_CL:
  data_te_CL = json.load(f_test_CL)

with open("/content/Drive/MyDrive/Technical/RR/Data/IT_train.json", "r") as f_train_IT:
  data_tr_IT = json.load(f_train_IT)

with open("/content/Drive/MyDrive/Technical/RR/Data/IT_test.json", "r") as f_test_IT:
  data_te_IT = json.load(f_test_IT)

In [None]:
############ Avoiding none labels ##############
def avoid_none(df):
  """Return a copy of the corpus with every "None"-labelled sentence removed.

  `df` maps a document key to a record holding the parallel lists
  "sentences" and "complete". The result has the same keys, and each record
  keeps only the positions whose "complete" entry is not the string "None".
  The input mapping is only read, never modified.
  """
  filtered = {}
  for doc_id in df.keys():
    record = df[doc_id]
    # Positions whose label is a real label rather than the "None" marker.
    kept_positions = [
        pos
        for pos in range(len(record["sentences"]))
        if record["complete"][pos] != "None"
    ]
    filtered[doc_id] = {
        "sentences": [record["sentences"][pos] for pos in kept_positions],
        "complete": [record["complete"][pos] for pos in kept_positions],
    }
  return filtered

In [None]:
#### Data conversion #######

def json_to_df(data, avoid=False):
  """Turn a document corpus into a DataFrame of adjacent-sentence pairs.

  Args:
    data: Mapping of document key -> record with parallel "sentences" and
      "complete" (label) lists.
    avoid: When truthy, sentences labelled "None" are dropped first (via
      `avoid_none`) so pairs are formed over the remaining sentences only.

  Returns:
    DataFrame with columns "Sentence 1", "Sentence 2" and "label". Each row is
    one consecutive pair (i, i+1) from the same document; "label" is 1 when
    the two sentences carry different labels and 0 otherwise. A document with
    n sentences contributes n - 1 rows (none when n <= 1).
  """
  if avoid:
    data = avoid_none(data)

  sentences_1 = []
  sentences_2 = []
  label = []
  for doc in data.keys():
    sentences = data[doc]["sentences"]
    # Stop one short of the end: the last sentence has no successor to pair with.
    for i in range(len(sentences) - 1):
      sentences_1.append(sentences[i])
      sentences_2.append(sentences[i + 1])
      differs = data[doc]["complete"][i] != data[doc]["complete"][i + 1]
      label.append(1 if differs else 0)

  df = pd.DataFrame(
      list(zip(sentences_1, sentences_2, label)),
      columns=['Sentence 1', 'Sentence 2', "label"],
  )
  return df

In [None]:
# Convert every split into an adjacent-sentence-pair frame, dropping
# "None"-labelled sentences first. Splits are processed in the order
# CL train, CL test, IT train, IT test.
train_df_CL, test_df_CL, train_df_IT, test_df_IT = (
    json_to_df(split, avoid=True)
    for split in (data_tr_CL, data_te_CL, data_tr_IT, data_te_IT)
)

In [None]:
# Combine both domains into one train frame and one test frame.
# IT rows come first in BOTH frames: the embedding-export cells at the end of
# the notebook slice the model outputs per IT document starting at row 0, so
# putting CL first in the test frame made them save CL-pair embeddings under
# IT document names.
# ignore_index=True gives the combined frames a fresh 0..N-1 index instead of
# repeating each source frame's labels.
train_comb_df = pd.concat([train_df_IT, train_df_CL], ignore_index=True)
test_comb_df = pd.concat([test_df_IT, test_df_CL], ignore_index=True)

In [None]:
# Show how many IT test pairs fall into each label value (1 = labels differ, 0 = same).
test_df_IT["label"].value_counts()

In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence encoder; later cells call its .encode() on both
# sentences of every pair.
SB_model = SentenceTransformer('bert-base-nli-max-tokens')

In [None]:
# Encode both sentence columns of the IT training pairs with the sentence encoder.
# NOTE(review): the visible cells never read these two results again, and
# `model` is rebound to the classifier further down — confirm this cell is
# still needed.
model = SB_model
sentence_embeddings_1, sentence_embeddings_2 = (
    model.encode(train_df_IT[column].to_list())
    for column in ("Sentence 1", "Sentence 2")
)

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np

In [None]:
class Sequences(Dataset):
    """Dataset of sentence-pair feature vectors with their shift labels.

    Every item is ``(features, label)``. ``features`` has shape ``(1, 3 * d)``
    and holds ``[emb(s1), emb(s2), |emb(s2) - emb(s1)|]`` for one row of
    ``df``, where ``d`` is the vector width returned by ``SB_model.encode``.

    Args:
        df: DataFrame with "Sentence 1", "Sentence 2" and "label" columns.
        SB_model: Object exposing ``encode(list_of_str)`` that returns one
            fixed-length vector per input sentence.
    """

    def __init__(self, df, SB_model):
        self.labels = df.label.tolist()
        self.sentence_1_embeddings = SB_model.encode(df["Sentence 1"].to_list())
        self.sentence_2_embeddings = SB_model.encode(df["Sentence 2"].to_list())

        first = np.asarray(self.sentence_1_embeddings)
        second = np.asarray(self.sentence_2_embeddings)
        if first.size == 0:
            # Nothing was encoded: keep an empty array so len() reports 0.
            self.sequences = np.array([])
        else:
            # Build all rows in one shot instead of concatenating pair by pair.
            features = np.concatenate(
                (first, second, np.absolute(second - first)), axis=1
            )
            # Keep the singleton middle axis so each item is (1, 3 * d).
            self.sequences = np.expand_dims(features, axis=1)

    def __getitem__(self, i):
        """Return the ``(features, label)`` pair at position ``i``."""
        return self.sequences[i], self.labels[i]

    def __len__(self):
        """Return the number of sentence pairs."""
        return self.sequences.shape[0]

In [None]:
# Pick the GPU when one is available, otherwise the CPU.
# NOTE(review): none of the visible cells move the model or the batches to
# `device` — confirm whether training is meant to run on it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Encode every sentence pair of the combined frames and wrap the resulting
# feature vectors and labels as datasets.
train_dataset = Sequences(train_comb_df, SB_model)
test_dataset = Sequences(test_comb_df, SB_model)

In [None]:
# Batch the datasets. No shuffle argument is passed, so batches follow the
# dataset order; the embedding-export cells at the end of the notebook slice
# the model outputs by position and depend on that order.
train_loader = DataLoader(train_dataset, batch_size=1024)
test_loader = DataLoader(test_dataset, batch_size=1024)

In [None]:
# Inspect the shape of the feature array of one training item.
train_dataset[5][0].shape

In [None]:
class SiameseClassifier(nn.Module):
    """Three-layer feed-forward head over a sentence-pair feature vector.

    Args:
        vec_dim: Width of the incoming feature vector.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
    """

    def __init__(self, vec_dim, hidden1, hidden2):
        super().__init__()
        self.fc1 = nn.Linear(vec_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, inputs):
        """Return ``(hidden, logit)`` for a batch of inputs.

        Dimension 1 of ``inputs`` is squeezed away and the tensor is cast to
        float before the first layer. ``hidden`` is the ReLU output of the
        second layer; ``logit`` is the raw (un-sigmoided) output of the last
        layer applied to ``hidden``.
        """
        flat = inputs.squeeze(1).float()
        first_hidden = F.relu(self.fc1(flat))
        second_hidden = F.relu(self.fc2(first_hidden))
        logit = self.fc3(second_hidden)
        return second_hidden, logit

In [None]:
# The input width must equal the length of one Sequences feature row (both
# sentence embeddings plus their absolute difference); 256 and 128 are the
# hidden-layer widths.
# NOTE(review): 2304 = 3 * 768 presumes the encoder emits 768-dim vectors — confirm.
model = SiameseClassifier(2304, 256, 128)

In [None]:
# Display the classifier's layer summary.
model

In [None]:
# Binary cross-entropy computed on raw logits (the classifier's last layer
# applies no sigmoid).
criterion = nn.BCEWithLogitsLoss()
# Adam over the parameters that require gradients.
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)

In [None]:
from tqdm import tqdm, tqdm_notebook

In [None]:
# Train the classifier for 5 epochs, recording the mean batch loss per epoch.
model.train()
train_losses = []
for epoch in range(5):
    progress_bar = tqdm_notebook(train_loader, leave=False)
    losses = []
    total = 0
    for inputs, target in progress_bar:
        model.zero_grad()

        output = model(inputs)
        # Drop only the trailing logit axis. A bare .squeeze() would also
        # remove the batch axis when the last batch holds a single pair, and
        # the loss would then reject the input/target shape mismatch.
        loss = criterion(output[1].squeeze(1), target.float())

        loss.backward()

        # Cap the gradient norm at 3 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), 3)

        optimizer.step()

        progress_bar.set_description(f'Loss: {loss.item():.3f}')

        losses.append(loss.item())
        total += 1

    # An empty loader yields no batches; report NaN instead of dividing by zero.
    epoch_loss = sum(losses) / total if total else float('nan')
    train_losses.append(epoch_loss)

    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

In [None]:
# Score the test pairs and collect predicted / gold labels for the report.
pred_label_list = []
gold_label_list = []
model.eval()
progress_bar = tqdm_notebook(test_loader, leave=False)
# Inference only: skip autograd bookkeeping for the forward passes.
with torch.no_grad():
    for inputs, targets in progress_bar:
        outputs = model(inputs)
        # outputs[1] holds one logit per pair; a pair is predicted as a shift
        # (label 1) when the sigmoid of its logit exceeds 0.5.
        probabilities = torch.sigmoid(outputs[1]).reshape(-1).tolist()
        pred_label_list.extend(1 if p > 0.5 else 0 for p in probabilities)
        gold_label_list.extend(int(t) for t in targets.tolist())

In [None]:
# Print the gold labels collected during evaluation.
# NOTE(review): this dumps the full list, one entry per test pair.
print(gold_label_list)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Summarise prediction quality against the gold shift labels.
print(classification_report(gold_label_list, pred_label_list))

In [None]:
############### SAVING EMBEDDINGS ##############

In [None]:
# Collect the second-hidden-layer activations (first element of the model
# output) for every test pair, in loader order.
shift_embs_test = []
model.eval()
progress_bar = tqdm_notebook(test_loader, leave=False)
# Inference only: no_grad avoids building an autograd graph per batch.
with torch.no_grad():
    for inputs, targets in progress_bar:
        outputs = model(inputs)

        for output in outputs[0]:
            # .cpu() is a no-op on CPU tensors and keeps .numpy() valid if the
            # model is ever moved to an accelerator.
            npo = output.detach().cpu().numpy()
            shift_embs_test.append(npo)

In [None]:
# Collect the second-hidden-layer activations (first element of the model
# output) for every training pair, in loader order.
shift_embs_train = []
model.eval()
progress_bar = tqdm_notebook(train_loader, leave=False)
# Inference only: no_grad avoids building an autograd graph per batch.
with torch.no_grad():
    for inputs, targets in progress_bar:
        outputs = model(inputs)

        for output in outputs[0]:
            # .cpu() is a no-op on CPU tensors and keeps .numpy() valid if the
            # model is ever moved to an accelerator.
            npo = output.detach().cpu().numpy()
            shift_embs_train.append(npo)

In [None]:
# Number of training-pair embeddings collected.
print(len(shift_embs_train))

In [None]:
# Write one .npy file per IT training document holding that document's pair
# embeddings. `i` is the running offset into shift_embs_train; the IT pairs
# are the first rows of the combined training frame, so the walk starts at 0.
# NOTE(review): this writes under "My Drive" while the loading cell reads from
# "MyDrive" — confirm both resolve to the same Drive folder.
i=0

### Comment the below line to have None label #####
data_tr_IT = avoid_none(data_tr_IT)
for key in data_tr_IT.keys():
  limit = len(data_tr_IT[key]["sentences"])
  # A document with n sentences produced n-1 pairs. Clamp at 0: for an empty
  # document `limit-1` is -1, which would slice [i:i-1] (almost everything
  # when i is 0) and move the offset backwards, misaligning later documents.
  n_pairs = max(limit - 1, 0)
  sp = shift_embs_train[i:i+n_pairs]
  # key[:-4] drops the last four characters of the key (presumably a file
  # extension — confirm against the JSON keys).
  np.save("/content/Drive/My Drive/Technical/RR/Siamese Net/avoidnone_shiftembs_train/" + key[:-4], np.array(sp))
  i = i+n_pairs

In [None]:
# Write one .npy file per IT test document holding that document's pair
# embeddings. `i` is the running offset into shift_embs_test.
# NOTE(review): starting at 0 assumes the IT pairs are the first rows of the
# combined test frame — verify the concat order used to build test_comb_df.
# NOTE(review): this writes under "My Drive" while the loading cell reads from
# "MyDrive" — confirm both resolve to the same Drive folder.
i=0

### Comment the below line to have None label #####
data_te_IT = avoid_none(data_te_IT)
for key in data_te_IT.keys():
  limit = len(data_te_IT[key]["sentences"])
  # A document with n sentences produced n-1 pairs. Clamp at 0: for an empty
  # document `limit-1` is -1, which would slice [i:i-1] (almost everything
  # when i is 0) and move the offset backwards, misaligning later documents.
  n_pairs = max(limit - 1, 0)
  sp = shift_embs_test[i:i+n_pairs]
  # key[:-4] drops the last four characters of the key (presumably a file
  # extension — confirm against the JSON keys).
  np.save("/content/Drive/My Drive/Technical/RR/Siamese Net/avoidnone_shiftembs_test/" + key[:-4], np.array(sp))
  i = i+n_pairs