In [None]:
import torch
import torch.nn.functional as F
from src.methods import SophiaG


In [None]:
import wget
import os

print('Downloading dataset')
url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

if not os.path.exists('./cola_public_1.1.zip'):
  wget.download(url, './cola_public_1.1.zip')
  if not os.path.exists('./cola_public/'):
    !unzip cola_public_1.1.zip

In [None]:
import pandas as pd
# Read the in-domain training split as tab-separated values. header=None makes
# pandas treat the first line as data, so the column names are supplied here.
df = pd.read_csv(
    "./cola_public/raw/in_domain_train.tsv",
    sep='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)
df

In [None]:
from transformers import BertTokenizer

# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint, with
# lower-casing enabled via do_lower_case=True.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Encode every sentence into a list of token IDs. add_special_tokens=True is
# passed so the tokenizer includes its special tokens in each encoding.
input_ids = [
    tokenizer.encode(sentence, add_special_tokens=True)
    for sentence in df.sentence.values
]

# Show the first sentence next to its encoding as a quick sanity check.
print('Original: ', df.sentence.values[0])
print('Token IDs:', input_ids[0])

In [None]:
# Report the length (in token IDs) of the longest encoded sentence.
print('Max sentence length: ', max(map(len, input_ids)))

In [None]:
def pad_sequences(seqs, maxlen=None, value=0, padding="post"):
    """Pad every sequence in ``seqs`` with ``value`` up to ``maxlen`` items.

    Sequences that already contain ``maxlen`` or more items keep all of their
    items; nothing is truncated.

    Args:
        seqs: Iterable of sequences (for example, lists of token IDs).
        maxlen: Target length for each sequence. Required.
        value: Filler item used for padding.
        padding: ``"post"`` appends the filler after the sequence,
            ``"pre"`` inserts it before the sequence.

    Returns:
        A new list of new lists. Neither ``seqs`` nor the sequences inside it
        are modified.

    Raises:
        ValueError: If ``maxlen`` is ``None`` or ``padding`` is neither
            ``"post"`` nor ``"pre"``.
    """
    if maxlen is None:
        raise ValueError("Invalid maxlen: {}".format(maxlen))
    # An unrecognised mode used to fall through and return unpadded data
    # without any signal; fail loudly instead.
    if padding not in ("post", "pre"):
        raise ValueError("Invalid padding: {}".format(padding))

    padded = []
    for seq in seqs:
        # Copy so the caller's sequences are never modified.
        items = list(seq)
        filler = [value] * max(0, maxlen - len(items))
        padded.append(items + filler if padding == "post" else filler + items)
    return padded

# Pad every encoded sentence to one more than the longest encoding.
MAX_LEN = max(len(sen) for sen in input_ids) + 1
print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# NOTE(review): the padding value 0 is hard-coded here; presumably it matches
# the tokenizer.pad_token_id printed above -- confirm.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, value=0, padding="post")
# '\D' is not a valid escape sequence, so the previous literal printed a stray
# backslash; a leading newline is used here, matching the messages above.
print('\nDone.')

In [None]:
# Build one mask per row of input_ids: 1 where the token ID is greater than 0,
# and 0 everywhere else.
attention_masks = [
    [int(token_id > 0) for token_id in id_row]
    for id_row in input_ids
]

In [None]:
import matplotlib.pyplot as plt
# Draw rows 1000-1999 of the attention masks as a colour grid.
plt.pcolor(attention_masks[1000:2000])

In [None]:
# Display the values of the label column.
df.label.values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef

# Seed for the train/validation split, so the partition is reproducible.
data_split_rs = 49

# Split token IDs, attention masks and labels in a single call so all three
# are partitioned with the same shuffled indices. Two separate calls only
# stayed aligned because they shared a seed, and their `_` targets overwrote
# IPython's last-output variable. 10% of the rows are held out for validation.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, df.label.values,
    random_state=data_split_rs, test_size=0.1)