In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple


In [None]:
# Mount Google Drive at /content/drive; the SNLI files read below and the
# processed CSVs written at the end all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from transformers import BertTokenizer, BertModel
import torch.optim as optim
import pandas as pd
import numpy as np
from torchtext.legacy import data
import torch
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Per-sentence cap, in tokens, applied by trim_sentence before the special
# tokens are added.
max_len = 128
# Not referenced by any other cell visible in this notebook —
# TODO confirm whether it is still needed.
MAX_LEN = 256

In [None]:
def trim_sentence(sent):
  """Keep only the first `min(max_len, len(sent))` items of `sent`.

  `max_len` is the notebook-level constant, not a parameter.
  """
  keep = min(max_len, len(sent))
  return sent[:keep]

In [None]:
def pad_sentence(sent, max_len):
  """Right-pad `sent` with zeros until it holds `max_len` items.

  Args:
    sent: list of items (e.g. token ids) to pad.
    max_len: target length.

  Returns:
    A new list; `sent` itself is not modified. A sequence that already has
    `max_len` or more items is returned as an unpadded copy (no truncation).
  """
  # Number of zeros still needed; clamp so over-long inputs get no padding.
  missing = max(max_len - len(sent), 0)
  # Concatenate rather than `list.extend`, which mutates in place and returns None.
  return list(sent) + [0] * missing

In [None]:
def get_attention(sent):
  """Return an attention mask of ones with one entry per item of `sent`."""
  mask_length = len(sent)
  return [1 for _ in range(mask_length)]

In [None]:
def process_sent1(sent):
  """Wrap the first sentence's token list as `[cls_token] + sent + [sep_token]`.

  NOTE(review): relies on a notebook-level `tokenizer` that is not created in
  any visible cell — it must exist before this function is called.
  """
  prefix = [tokenizer.cls_token]
  suffix = [tokenizer.sep_token]
  return prefix + sent + suffix

def process_sent2(sent):
  """Append the tokenizer's separator token to the second sentence's token list.

  NOTE(review): relies on a notebook-level `tokenizer` that is not created in
  any visible cell — it must exist before this function is called.
  """
  suffix = [tokenizer.sep_token]
  return sent + suffix

In [None]:
def get_sent1_token_id(sent):
  """Return segment ids for the first sentence: a 0 for every item of `sent`."""
  return [0 for _ in range(len(sent))]

def get_sent2_token_id(sent):
  """Return segment ids for the second sentence: a 1 for every item of `sent`."""
  return [1 for _ in range(len(sent))]

In [None]:
def join_mask(mask):
  """Serialize the items of `mask` as one space-separated string.

  NOTE: a later cell defines `join_mask` again; when the notebook runs top to
  bottom, that later definition replaces this one.
  """
  return " ".join(str(value) for value in mask)

In [None]:
def tokenize_sent(sent):
  """Return the result of `tokenizer.tokenize(sent)`.

  NOTE(review): relies on a notebook-level `tokenizer` that is not created in
  any visible cell — it must exist before this function is called.
  """
  return tokenizer.tokenize(sent)

In [None]:
def join_seq(sent):
  """Concatenate the string items of `sent` into one space-separated string."""
  return str.join(" ", sent)

def join_mask(sent):
  """Serialize a list of mask values as one space-separated string.

  Any input whose exact type is not `list` is replaced by the single-item
  mask `[1]` before serializing, so such inputs yield "1".
  """
  values = sent if type(sent) == list else [1]
  return " ".join(map(str, values))

In [None]:
def get_bert_format(df):
  """Build sentence-pair model inputs from `df["sentence1"]` / `df["sentence2"]`.

  Each sentence is tokenized, trimmed, and given its special tokens
  (`process_sent1` adds the leading cls token and a trailing sep token,
  `process_sent2` adds a trailing sep token). The two token lists of a row
  are then concatenated.

  Args:
    df: DataFrame with `sentence1` and `sentence2` columns. It is only read;
      no scratch columns are written back onto it.

  Returns:
    Three Series sharing `df`'s index, each holding space-separated strings:
      tokens: the joined tokens of the sentence pair.
      attention_sent: a "1" for every token.
      token_ids: segment ids, "0" for first-sentence tokens and "1" for
        second-sentence tokens.
  """
  # Tokenize, truncate, then add the special tokens to each sentence.
  first = df["sentence1"].apply(tokenize_sent).apply(trim_sentence).apply(process_sent1)
  second = df["sentence2"].apply(tokenize_sent).apply(trim_sentence).apply(process_sent2)

  # `+` on these Series concatenates the per-row lists element-wise.
  segment_ids = first.apply(get_sent1_token_id) + second.apply(get_sent2_token_id)
  pair_tokens = first + second
  attention = pair_tokens.apply(get_attention)

  # Serialize every list as a string; name the Series like the columns the
  # callers store them in.
  tokens = pair_tokens.apply(join_seq).rename("tokens")
  attention = attention.apply(join_mask).rename("attention_sent")
  segment_ids = segment_ids.apply(join_mask).rename("token_ids")

  return tokens, attention, segment_ids

In [None]:
# Read the three tab-separated SNLI splits, keeping only the sentence pair
# and its label from each.
snli_columns = ["sentence1", "sentence2", "gold_label"]

train_data = pd.read_csv(
    "/content/drive/MyDrive/snli_1.0/snli_1.0/snli_1.0_train.txt", sep="\t"
)[snli_columns]

val_data = pd.read_csv(
    "/content/drive/MyDrive/snli_1.0/snli_1.0/snli_1.0_dev.txt", sep="\t"
)[snli_columns]

test_data = pd.read_csv(
    "/content/drive/MyDrive/snli_1.0/snli_1.0/snli_1.0_test.txt", sep="\t"
)[snli_columns]

In [None]:
# Discard every row that has a missing value in any of its columns.
train_data, val_data, test_data = (
    frame.dropna(axis=0, how='any')
    for frame in (train_data, val_data, test_data)
)

In [None]:
# Convert the training split to model inputs and keep only those columns plus the label.
train_tokens, train_attention, train_segments = get_bert_format(train_data)
train_data["tokens"] = train_tokens
train_data["attention_sent"] = train_attention
train_data["token_ids"] = train_segments
train_data = train_data[["tokens", "attention_sent", "token_ids", "gold_label"]]

In [None]:
# Convert the validation split to model inputs and keep only those columns plus the label.
val_tokens, val_attention, val_segments = get_bert_format(val_data)
val_data["tokens"] = val_tokens
val_data["attention_sent"] = val_attention
val_data["token_ids"] = val_segments
val_data = val_data[["tokens", "attention_sent", "token_ids", "gold_label"]]

In [None]:
# Convert the test split to model inputs and keep only those columns plus the label.
test_tokens, test_attention, test_segments = get_bert_format(test_data)
test_data["tokens"] = test_tokens
test_data["attention_sent"] = test_attention
test_data["token_ids"] = test_segments
test_data = test_data[["tokens", "attention_sent", "token_ids", "gold_label"]]

In [None]:
# Remove rows whose label is the placeholder "-".
# The column is `gold_label` (singular), as selected when the frames were
# built; `gold_labels` does not exist and raised AttributeError.
train_data = train_data[train_data["gold_label"] != "-"]
val_data = val_data[val_data["gold_label"] != "-"]
test_data = test_data[test_data["gold_label"] != "-"]

In [None]:
# Write each processed split to Drive as CSV, without the index column.
for frame, destination in (
    (train_data, '/content/drive/MyDrive/snli_1.0/snli_1.0/updated_train.csv'),
    (val_data, '/content/drive/MyDrive/snli_1.0/snli_1.0/updated_val.csv'),
    (test_data, '/content/drive/MyDrive/snli_1.0/snli_1.0/updated_test.csv'),
):
  frame.to_csv(destination, index=False)