In [1]:
import os
import tqdm
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from utils import check_accuracy_classification
import transformers
from torch.optim import Adam
from models import BertProbeClassifer
from utils import text_to_dataloader, tokenize_word

In [2]:
train_path = os.path.join("data","en_partut-ud-train.conllu")
dev_path = os.path.join("data","en_partut-ud-dev.conllu")
test_path = os.path.join("data","en_partut-ud-test.conllu")

In [3]:
HEADER_CONST = "# sent_id = "
TEXT_CONST = "# text = "
STOP_CONST = "\n"
WORD_OFFSET = 1
LABEL_OFFSET = 3


def txt_to_dataframe(data_path):
    '''
    read UD text file and convert to df format
    '''
    with open(data_path, "r") as fp:
        df = pd.DataFrame(
            columns={
                "text",
                "word",
                "label"
            }
        )
        for line in fp.readlines():
            if TEXT_CONST in line:
                words_list = []
                labels_list = []
                text = line.split(TEXT_CONST)[1]
                # this is a new text, need to parse all the words in it
            elif line is not STOP_CONST and HEADER_CONST not in line:
                temp_list = line.split("\t")
                words_list.append(temp_list[WORD_OFFSET])
                labels_list.append(temp_list[LABEL_OFFSET])
            if line == STOP_CONST:
                # this is the end of the text, adding to df
                cur_df = pd.DataFrame(
                    {
                        "text": len(words_list) * [text],
                        "word": words_list,
                        "label": labels_list
                    }
                )
                df = pd.concat([df,cur_df])
        return df
            


In [4]:
df_train = txt_to_dataframe(train_path)
df_dev = txt_to_dataframe(dev_path)
df_test = txt_to_dataframe(test_path)

In [5]:
TYPES = [
    "ADJ",
    "ADP",
    "ADV",
    "AUX",
    "CCONJ",
    "DET",
    "INTJ",
    "NOUN",
    "NUM",
    "PART",
    "PRON",
    "PROPN",
    "PUNCT",
    "SCONJ",
    "SYM",
    "VERB",
    "X",
    "_"
]

In [6]:
file_name = 'tex_artifacts/label_dist_train.tex'
SORT_COL = "Count"

with open(file_name,'w') as tf:
    display_df = df_train["label"].value_counts().rename_axis("Type").to_frame("Count").reset_index()
    #display_df.index = TYPES
    display_df.sort_values(by=SORT_COL, inplace=True, ascending=False)
    latex_data = display_df.to_latex(index=False)
    tf.write(latex_data)

In [7]:
file_name = 'tex_artifacts/label_dist_dev.tex'


with open(file_name,'w') as tf:
    display_df = df_dev["label"].value_counts().rename_axis("Type").to_frame("Count").reset_index()
    display_df.index = TYPES
    display_df.sort_values(by="Type", inplace=True)
    latex_data = display_df.to_latex(index=False)
    tf.write(latex_data)

ValueError: Length mismatch: Expected axis has 17 elements, new values have 18 elements

In [None]:
file_name = 'tex_artifacts/label_dist_test.tex'


with open(file_name,'w') as tf:
    display_df = df_test["label"].value_counts().rename_axis("Type").to_frame("Count").reset_index()
    display_df.index = TYPES
    display_df.sort_values(by="Type", inplace=True)
    latex_data = display_df.to_latex(index=False)
    tf.write(latex_data)

In [None]:
df_test[df_test["label"] == "VERB"]

In [8]:
bert_tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

In [9]:
df_train, dataloader_train = text_to_dataloader(df_train, "cuda", 32, bert_tokenizer, 256)
df_test, dataloader_test = text_to_dataloader(df_test, "cuda", 32, bert_tokenizer, 256)

In [None]:
dataloader_train

In [None]:
file_name = 'tex_artifacts/tokens_per_word_dist_train.tex'

INDEX_AXIS_NAME = "Tokens/Word"
SORT_COL = "Tokens/Word"

with open(file_name,'w') as tf:
    display_df = df_train["query_mask"].apply(lambda x: sum(x)).value_counts().rename_axis(INDEX_AXIS_NAME).to_frame("Count").reset_index()
    display_df.sort_values(by=SORT_COL, inplace=True)
    latex_data = display_df.to_latex(index=False)
    tf.write(latex_data)


In [None]:
df_train.sample(10)

In [None]:
df_train[df_train["query_mask"].apply(lambda x: sum(x)) == 0]

In [16]:
num_hidden_layers = 9


bert_config = transformers.BertConfig(num_hidden_layers=num_hidden_layers)
bert_model = transformers.BertModel.from_pretrained("bert-base-uncased")
bert_model.to("cuda")

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [17]:
batch_size = 32


labels = []
contextual_embeddings = []

with torch.no_grad():
    for batch in tqdm.tqdm(dataloader_train):
        text, mask, _, batch_labels = batch
        batch_contextual_embeddings, _ = bert_model(text, mask)
        
        # need to get the embeddings of only masked words
        for batch_idx in range(batch_contextual_embeddings.shape[0]):
            for word_idx in range(batch_contextual_embeddings.shape[1]):
                if mask[batch_idx, word_idx] == 1:
                    contextual_embeddings.append(batch_contextual_embeddings[batch_idx, word_idx, :].cpu().detach().numpy())
                    labels.append(batch_labels[word_idx].cpu().detach().numpy())

  0%|          | 2/1356 [00:40<7:34:17, 20.13s/it]


KeyboardInterrupt: 

In [12]:
np.vstack(labels).shape
np.vstack(contextual_embeddings).shape

(1156, 768)

In [18]:
display_labels = [int(arr) for arr in labels]


import umap

reducer = umap.UMAP()
lower_dim_data = reducer.fit_transform(np.vstack(contextual_embeddings))

In [19]:
lower_dim_data.shape

(1119, 2)

In [20]:
import matplotlib.pyplot as plt
%matplotlib qt



plt.figure(figsize=(10,10))
plt.scatter(lower_dim_data[:,0], lower_dim_data[:,1], c=display_labels, cmap="tab20")
plt.legend()
plt.colorbar()
plt.grid()
plt.show()

No handles with labels found to put in legend.


In [None]:
set(display_labels)

In [None]:
display_labels