<a href="https://colab.research.google.com/github/AUT-Student/NLP-HW3/blob/main/NLP_HW3_Q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [44]:
import numpy as np
import pandas as pd

from gensim.models.word2vec import Word2Vec
import gensim.downloader as gensim_api

import tensorflow as tf
import tensorflow.keras as keras
from keras.layers import LSTM, Dense, Input, Embedding, Bidirectional, TimeDistributed
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras import backend as K

# Dataset

In [18]:
!gdown 11WeeMttH6I6MJ0t1h7FVSEtw0lKwpRA6
!gdown 1gLGNxjQzy6C8y4Oivr8etU1MMGfuKuEE
!gdown 127-sOeW6KMf6XNSAVM3bGjfwnmW0NciU

Downloading...
From: https://drive.google.com/uc?id=11WeeMttH6I6MJ0t1h7FVSEtw0lKwpRA6
To: /content/dev.conll
100% 478k/478k [00:00<00:00, 37.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1gLGNxjQzy6C8y4Oivr8etU1MMGfuKuEE
To: /content/test.conll
100% 677k/677k [00:00<00:00, 56.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=127-sOeW6KMf6XNSAVM3bGjfwnmW0NciU
To: /content/train.conll
100% 11.4M/11.4M [00:00<00:00, 241MB/s]


In [19]:
def dependency_create(row):
  dependency_index = row["DependencyIndex"]
  index = row["Index"]

  if dependency_index == 0:
    return "Root"
  else:
    different = dependency_index - index

    if different > 0:
      return f"{different}R"
    elif different < 0:
      return f"{abs(different)}L"
    else:
      raise Exception(row)

In [20]:
def load_and_preprocess_dataset(path):
  dataset = pd.read_csv(path, delimiter="\t", header=None, names=["Index", "Word", "DependencyIndex"])
  dataset = dataset.dropna()
  dataset["Index"] = dataset["Index"].astype(int)
  dataset["DependencyIndex"] = dataset["DependencyIndex"].astype(int)
  dataset["Dependency"] = dataset.apply(dependency_create, axis=1)

  return dataset

In [21]:
train_dataset = load_and_preprocess_dataset("/content/train.conll")
valid_dataset = load_and_preprocess_dataset("/content/dev.conll")
test_dataset = load_and_preprocess_dataset("/content/test.conll")

In [22]:
label_dictionary = {"PAD": 0}

for i, label in enumerate(set(train_dataset["Dependency"].values.tolist() +
                              valid_dataset["Dependency"].values.tolist() +
                              test_dataset["Dependency"].values.tolist())):
  label_dictionary[label] = i+1

In [23]:
len(label_dictionary)

159

In [24]:
max(max(train_dataset["Index"].values), max(valid_dataset["Index"].values), max(test_dataset["Index"].values))

141

In [25]:
def convert_to_sequence(dataset):
  sequenced_dataset = []

  for i, item in dataset.iterrows():
    if item["Index"] == 1:
      if i>0:
        sequenced_dataset.append({"Words": new_words, "Labels": new_labels})
      new_words = [item["Word"]]
      new_labels = [item["Dependency"]]
    else:
      new_words.append(item["Word"])
      new_labels.append(item["Dependency"])
  
  sequenced_dataset.append({"Words": new_words, "Labels": new_labels})
  return pd.DataFrame(sequenced_dataset)

In [26]:
train_dataset = convert_to_sequence(train_dataset)
valid_dataset = convert_to_sequence(valid_dataset)
test_dataset = convert_to_sequence(test_dataset)

# Word2Vec

In [27]:
print(gensim_api.load("word2vec-google-news-300", return_path=True))

/root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [28]:
word2vec_model = gensim_api.load("word2vec-google-news-300")

In [29]:
def apply_embedding(row):
  vectors = []
  for word in row["Words"]:
    try:
      vector = word2vec_model.get_vector(word)
    except KeyError:
      vector = np.zeros(300)
    vectors.append(vector)

  for i in range(141-len(row["Words"])):
    vectors.append(np.zeros(300))

  return np.stack(vectors)

In [30]:
def apply_label_index(row):
  labels = []
  for label in row["Labels"]:
    labels.append(label_dictionary[label])

  for i in range(141-len(row["Labels"])):
    labels.append(label_dictionary["PAD"])
  
  return np.stack(labels)

In [42]:
valid_dataset["Vector"] = valid_dataset.apply(apply_embedding, axis=1)
valid_dataset["LabelIndex"] = valid_dataset.apply(apply_label_index, axis=1)
test_dataset["Vector"] = test_dataset.apply(apply_embedding, axis=1)
test_dataset["LabelIndex"] = test_dataset.apply(apply_label_index, axis=1)

In [40]:
train_dataset = train_dataset.iloc[:1700]

train_dataset["Vector"] = train_dataset.apply(apply_embedding, axis=1)
train_dataset["LabelIndex"] = train_dataset.apply(apply_label_index, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


# BiLSTM Model

In [45]:
class BiLSTM(keras.Model):
  def __init__(self):
    super().__init__()
    self.model = Sequential([
                             Input((141, 300)),
                             Bidirectional(LSTM(32, return_sequences=True)),
                             TimeDistributed(Dense(159, activation="softmax"))
    ])
  
  def call(self, inputs):
    return self.model(inputs)

In [46]:
def ignore_class_accuracy(to_ignore=0):
   # Refrence: https://nlpforhackers.io/lstm-pos-tagger-keras/
  
  def ignore_accuracy(y_true, y_pred):
    y_pred_class = K.argmax(y_pred, axis=-1)
    y_true_class = K.argmax(y_true, axis=-1)
    y_true_class = K.cast(y_true, "int64")

    ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
    matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
    accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
    return accuracy
  
  return ignore_accuracy

In [47]:
bilstm_model = BiLSTM()

In [48]:
bilstm_model.compile(optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["Accuracy", ignore_class_accuracy(0)])

In [49]:
es_callback = EarlyStopping(monitor="val_loss", patience=4, restore_best_weights=True)

In [None]:
bilstm_model.fit(x=np.stack(train_dataset["Vector"].values),
                 y=np.stack(train_dataset["LabelIndex"].values),
                 epochs=10,
                 validation_data=(np.stack(valid_dataset["Vector"].values),
                                  np.stack(valid_dataset["LabelIndex"].values)))

Epoch 1/10
Tensor("Cast_5:0", shape=(None, 141), dtype=float32)
Tensor("bi_lstm_1/sequential_1/time_distributed_1/Reshape_1:0", shape=(None, 141, 159), dtype=float32)
Tensor("strided_slice_1:0", shape=(141,), dtype=float32)
Tensor("strided_slice_2:0", shape=(141, 159), dtype=float32)
Tensor("Cast_5:0", shape=(None, 141), dtype=float32)
Tensor("bi_lstm_1/sequential_1/time_distributed_1/Reshape_1:0", shape=(None, 141, 159), dtype=float32)
Tensor("strided_slice_1:0", shape=(141,), dtype=float32)
Tensor("strided_slice_2:0", shape=(141, 159), dtype=float32)
Tensor("bi_lstm_1/sequential_1/time_distributed_1/Reshape_1:0", shape=(None, 141, 159), dtype=float32)
Tensor("strided_slice_1:0", shape=(141,), dtype=float32)
Tensor("strided_slice_2:0", shape=(141, 159), dtype=float32)
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
