# Assignment 1

This Jupyter Notebook document is our implementation of Assignment 1. 


In [None]:
!pip install pandas 
!pip install numpy
!pip install matplotlib

In [2]:
# !pip install wget



In [None]:
!pip install tqdm

# Importing Libraries

    

In [163]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import zipfile
import os
from tqdm import tqdm

# Task 1: Data Loading and Splitting
* **Download** the corpus.
* **Encode** the corpus into a pandas.DataFrame object.
* **Split** it in training, validation, and test sets.

[Penn TreeBank corpus](https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip).

In [4]:
# Download the data from Penn TreeBank corpus 

address = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'
urllib.request.urlretrieve(address, 'dependency_treebank.zip')
# Unzip the data
with zipfile.ZipFile('dependency_treebank.zip', 'r') as zip_ref:
    zip_ref.extractall()


In [129]:
# Encode the data into a dataframe and also add the sentence number
def encode_data(file_name):
    data = pd.read_csv(file_name, sep='\t', header=None)
    data.columns = ['word', 'POS', 'head']
    return data

# Train data: 1-100
# Test data: 101-150
# Val data: 151-199
training_data_docs  = 100
test_data_docs = 50
val_data_docs = 49

# Create a dataframe for the train data
for i in range(1, training_data_docs+1):
    file_name = 'dependency_treebank/wsj_{}.dp'.format(str(i).zfill(4))
    if i == 1:
        train_data = encode_data(file_name)
    else:
        temp = encode_data(file_name)
        train_data = pd.concat([train_data, temp], ignore_index=True)

# drop head column
train_data = train_data.drop(['head'], axis=1)

# Create a dataframe for the test data
for i in range(training_data_docs+1, training_data_docs+test_data_docs+1):
    file_name = 'dependency_treebank/wsj_{}.dp'.format(str(i).zfill(4))
    if i == training_data_docs+1:
        test_data = encode_data(file_name)
    else:
        temp = encode_data(file_name)
        test_data = pd.concat([test_data, temp], ignore_index=True)

# drop head column
test_data = test_data.drop(['head'], axis=1)

# Create a dataframe for the val data

for i in range(training_data_docs+test_data_docs+1, training_data_docs+test_data_docs+val_data_docs+1):

    file_name = 'dependency_treebank/wsj_{}.dp'.format(str(i).zfill(4))
    if i == training_data_docs+test_data_docs+1:
        val_data = encode_data(file_name)
    else:
        temp = encode_data(file_name)
        val_data = pd.concat([val_data, temp], ignore_index=True)

# drop head column  
val_data = val_data.drop(['head'], axis=1)

# Make all the words lower case
train_data['word'] = train_data['word'].str.lower()
test_data['word'] = test_data['word'].str.lower()
val_data['word'] = val_data['word'].str.lower()

print(train_data.head())
print(train_data.shape)
print(test_data.head())
print(test_data.shape)
print(val_data.head())
print(val_data.shape)

     word  POS
0  pierre  NNP
1  vinken  NNP
2       ,    ,
3      61   CD
4   years  NNS
(47356, 2)
           word  POS
0             a   DT
1  house-senate  NNP
2    conference   NN
3      approved  VBD
4         major   JJ
(31183, 2)
        word  POS
0  intelogic  NNP
1      trace  NNP
2       inc.  NNP
3          ,    ,
4        san  NNP
(15545, 2)


# Task 2: Text encoding

To train a neural POS tagger, you first need to encode text into numerical format.

### Instructions

* Embed words using **GloVe embeddings**.
* You are **free** to pick any embedding dimension.
* [Optional] You are free to experiment with text pre-processing: **make sure you do not delete any token!**

In [None]:
!pip install gensim

In [144]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    download_path = ""
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
        
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model

In [145]:
# Glove -> 50, 100, 200, 300
embedding_model = load_embedding_model(embedding_dimension=50)

In [146]:
from typing import List, Dict

def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors, word_listing: List[str]):
    
    embedding_vocabulary = set(embedding_model.key_to_index.keys())
    oov = set(word_listing).difference(embedding_vocabulary)
    return list(oov)

In [147]:
oov_terms = check_OOV_terms(embedding_model, train_data.values[:,0])
oov_percentage = float(len(oov_terms)) * 100 / len(train_data.values[:,0])

print(oov_percentage)

0.760199341160571


In [180]:
# Create word to index dictionary for the train data
from collections import OrderedDict

word_to_idx = OrderedDict()
for i, word in enumerate(train_data['word'].unique()):
    word_to_idx[word] = i
print(word_to_idx)
print(len(word_to_idx))

# Create a tag to index dictionary for the train data
tag_to_idx = OrderedDict()
for i, tag in enumerate(train_data['POS'].unique()):
    tag_to_idx[tag] = i
print(tag_to_idx)

7405
OrderedDict([('NNP', 0), (',', 1), ('CD', 2), ('NNS', 3), ('JJ', 4), ('MD', 5), ('VB', 6), ('DT', 7), ('NN', 8), ('IN', 9), ('.', 10), ('VBZ', 11), ('VBG', 12), ('CC', 13), ('VBD', 14), ('VBN', 15), ('RB', 16), ('TO', 17), ('PRP', 18), ('RBR', 19), ('WDT', 20), ('VBP', 21), ('RP', 22), ('PRP$', 23), ('JJS', 24), ('POS', 25), ('``', 26), ('EX', 27), ("''", 28), ('WP', 29), (':', 30), ('JJR', 31), ('WRB', 32), ('$', 33), ('NNPS', 34), ('WP$', 35), ('-LRB-', 36), ('-RRB-', 37), ('PDT', 38), ('RBS', 39), ('FW', 40), ('UH', 41), ('SYM', 42), ('LS', 43), ('#', 44)])


In [164]:
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int,
                           word_to_idx: Dict[str, int],
                           vocab_size: int) -> np.ndarray:
    
    embedding_matrix = np.zeros((vocab_size, embedding_dimension), dtype=np.float32)
    for word, idx in tqdm(word_to_idx.items()):
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [174]:
embedding_dimension = 50
vocab_size = len(word_to_idx)
embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, vocab_size)
print(embedding_matrix.shape)
print(embedding_matrix[word_to_idx['the']])

100%|██████████| 7405/7405 [00:00<00:00, 255296.17it/s]

(7405, 50)
[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]





In [181]:
word_sentences = []
temp_word = []
temp_tag = []
tag_sentences = []
for elem in train_data.values:
    if elem[0] == '.' or elem[0] == '?' or elem[0] == '!':
        temp_word.append(elem[0])
        temp_tag.append(elem[1])
        word_sentences.append(temp_word)
        tag_sentences.append(temp_tag)
        temp_word = []
        temp_tag = []
    else: 
        temp_word.append(elem[0])
        temp_tag.append(elem[1])

print(word_sentences)
print(tag_sentences)

[['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.'], ['NNP', 'NNP', 'VBZ', 'NN', 'IN', 'NNP', 'NNP', ',', 'DT', 'NNP', 'VBG', 'NN', '.'], ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', 'CC', 'JJ', 'NN', 'IN', 'NNP', 'NNP', 'NNP', 'NNP', ',', 'VBD', 'VBN', 'DT', 'JJ', 'NN', 'IN', 'DT', 'JJ', 'JJ', 'NN', '.'], ['DT', 'NN', 'IN', 'NN', 'RB', 'VBN', 'TO', 'VB', 'NNP', 'NN', 'NNS', 'VBZ', 'VBN', 'DT', 'JJ', 'NN', 'IN', 'NN', 'NNS', 'IN', 'DT', 'NN', 'IN', 'NNS', 'VBN', 'TO', 'PRP', 'RBR', 'IN', 'CD', 'NNS', 'IN', ',', 'NNS', 'VBD', '.'], ['DT', 'NN', 'NN', ',', 'NN', ',', 'VBZ', 'RB', 'JJ', 'IN', 'PRP', 'VBZ', 'DT', 'NNS', ',', 'IN', 'RB', 'JJ', 'NNS', 'TO', 'PRP', 'VBG', 'NNS', 'WDT', 'VBP', 'RP', 'NNS', 'JJ', ',', 'NNS', 'VBD', '.'], ['NNP', 'NNP', ',', 'DT', 'NN', 'IN', 'JJ', 'JJ', 'NNP', 'NNP', 'WDT', 'VBZ', 'NNP', 'NNS', ',', 'VBD', 'VBG', 'NN', 'IN', 'PRP$', 'NN', 'NN', 'NNS', 'IN', 'CD', '.'], ['IN', 'JJ', 'NNS', 'VBD', 'VBN', 'RBR',

In [188]:
# Create a embedded sentence for the train data using embedding matrix
def create_embedded_sentence(word_sentences, embedding_matrix, word_to_idx):
    embedded_sentence = []
    for sentence in word_sentences:
        temp = []
        for word in sentence:
            temp.append(embedding_matrix[word_to_idx[word]])
        embedded_sentence.append(temp)
    return embedded_sentence

embedded_sentence = create_embedded_sentence(word_sentences, embedding_matrix, word_to_idx)
print(len(embedded_sentence))
print(embedded_sentence[0])
print(len(embedded_sentence[0]))

# Create a tag sentence for the train data
def create_tag_sentence(tag_sentences, tag_to_idx):
    tag_sentence = []
    for sentence in tag_sentences:
        temp = []
        for tag in sentence:
            temp.append(tag_to_idx[tag])
        tag_sentence.append(temp)
    return tag_sentence

tag_sentence = create_tag_sentence(tag_sentences, tag_to_idx)
print(len(tag_sentence))
print(tag_sentence[0])
print(len(tag_sentence[0]))

1959
[array([ 0.23568 ,  0.39638 , -0.60135 , -0.52681 ,  0.15878 ,  0.035317,
       -1.3855  , -0.050794, -0.59138 ,  0.28227 ,  0.75602 ,  0.17395 ,
       -1.4483  ,  0.18503 ,  0.33835 , -0.71197 ,  0.9338  , -1.1688  ,
       -0.064745, -0.63867 , -0.46409 ,  1.1839  , -0.43759 , -0.089778,
       -0.95805 ,  0.012933, -0.64701 , -0.28791 ,  0.071964,  1.4309  ,
        0.44505 , -0.27071 , -1.6473  ,  0.006484,  0.46397 , -0.21665 ,
        0.1401  ,  0.91752 ,  1.1278  ,  1.4435  ,  0.85627 ,  1.2505  ,
        0.39964 , -1.3863  , -0.331   ,  0.48466 , -1.4504  , -0.2049  ,
        0.77637 ,  0.2319  ], dtype=float32), array([-0.0280655 ,  0.00704549, -0.00467026, -0.02021534, -0.00995345,
        0.01220005, -0.02097502, -0.01256409, -0.02151987,  0.0070985 ,
       -0.02067018, -0.02456168, -0.00235166,  0.00471799,  0.02179842,
        0.00861878, -0.04122692,  0.03878938,  0.01753302,  0.01376668,
        0.0383067 ,  0.037013  ,  0.04192438, -0.00558445,  0.0081544 ,
    

# Task 3: Model definition

You are now tasked to define your neural POS tagger.

### Instructions

* **Baseline**: implement a Bidirectional LSTM with a Dense layer on top.
* You are **free** to experiment with hyper-parameters to define the baseline model.

* **Model 1**: add an additional LSTM layer to the Baseline model.
* **Model 2**: add an additional Dense layer to the Baseline model.

* **Do not mix Model 1 and Model 2**. Each model has its own instructions.

**Note**: if a document contains many tokens, you are **free** to split them into chunks or sentences to define your mini-batches.

In [None]:
!pip install torch

In [None]:
!pip install --upgrade torch torchvision
!pip install --upgrade typing-extensions

### Baseline





In [168]:
import torch
import torch.nn as nn

class Baseline(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Baseline, self).__init__()
        
        # Bidirectional LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        # Dense layer for classification
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # Multiply by 2 for bidirectional
        
    def forward(self, x):
        # Forward pass through Bidirectional LSTM layer
        out, _ = self.lstm(x)
        
        # Forward pass through Dense layer for classification
        out = self.fc(out)
        return out

In [189]:
input_size = 50 # GloVe embedding dimension
num_classes = len(tag_to_idx) # Number of tags
# Create an instance of the BidirectionalLSTM model
model = Baseline(input_size, 64, 1, num_classes)

# Print the model architecture
print(model)

Baseline(
  (lstm): LSTM(50, 64, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=128, out_features=45, bias=True)
)


In [190]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Convert the embedded sentence and tag sentence into tensors
embedded_sentence = torch.FloatTensor(embedded_sentence)
tag_sentence = torch.LongTensor(tag_sentence)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    # Forward pass and loss calculation
    outputs = model(embedded_sentence)
    loss = criterion(outputs.view(-1, num_classes), tag_sentence.view(-1))
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))


ValueError: expected sequence of length 18 at dim 1 (got 13)

### Model 1

In [5]:
class Model1(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Model1, self).__init__()
        
        # Bidirectional LSTM layers
        self.lstm1 = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_size * 2, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        # Dense layer for classification
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # Multiply by 2 for bidirectional
        
    def forward(self, x):
        # Forward pass through first Bidirectional LSTM layer
        out, _ = self.lstm1(x)
        
        # Forward pass through second Bidirectional LSTM layer
        out, _ = self.lstm2(out)
        
        # Get the output from the last time step (if you want to use the output from all time steps, modify this accordingly)
        out = out[:, -1, :]
        
        # Forward pass through Dense layer for classification
        out = self.fc(out)
        return out


In [None]:
# Create an instance of the BidirectionalLSTM model
model = Model1(input_size, 64, 1, num_classes)

# Print the model architecture
print(model)

### Model 2

In [6]:
class Model2(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Model2, self).__init__()
        
        # Bidirectional LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        # First Dense layer
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)  # Multiply by 2 for bidirectional
        
        # Second Dense layer for classification
        self.fc2 = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        # Forward pass through Bidirectional LSTM layer
        out, _ = self.lstm(x)
        
        # Get the output from the last time step (if you want to use the output from all time steps, modify this accordingly)
        out = out[:, -1, :]
        
        # Forward pass through the first Dense layer
        out = self.fc1(out)
        
        # ReLU activation function (you can use other activation functions as well)
        out = nn.functional.relu(out)
        
        # Forward pass through the second Dense layer for classification
        out = self.fc2(out)
        return out

In [None]:
# Create an instance of the BidirectionalLSTM model
model = Model2(input_size, 64, 1, num_classes)

# Print the model architecture
print(model)

# Task 4: Metrics

Before training the models, you are tasked to define the evaluation metrics for comparison.

### Instructions

* Evaluate your models using macro F1-score, compute over **all** tokens.
* **Concatenate** all tokens in a data split to compute the F1-score. (**Hint**: accumulate FP, TP, FN, TN iteratively) 
* **Do not consider punctuation and symbol classes** $\rightarrow$ [What is punctuation?](https://en.wikipedia.org/wiki/English_punctuation)

**Note**: What about OOV tokens?
   * All the tokens in the **training** set that are not in GloVe are **not** considered as OOV
   * For the remaining tokens (i.e., OOV in the validation and test sets), you have to assign them a **static** embedding.
   * You are **free** to define the static embedding using any strategy (e.g., random, neighbourhood, 

In [12]:
from typing import List, Dict

def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors, word_listing: List[str]):
    
    embedding_vocabulary = set(embedding_model.key_to_index.keys())
    oov = set(word_listing).difference(embedding_vocabulary)
    return list(oov)

In [13]:
oov_terms = check_OOV_terms(embedding_model, df_train.values[:,0])
oov_percentage = float(len(oov_terms)) * 100 / len(df_train.values[:,0])

print(oov_percentage)

4.768483715638269


In [14]:
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int,
                           word_to_idx: Dict[str, int],
                           vocab_size: int,
                           oov_terms: List[str]) -> np.ndarray:
    
    embedding_matrix = np.zeros((vocab_size, embedding_dimension), dtype=np.float32)
    for word, idx in tqdm(word_to_idx.items()):
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [15]:
embedding_dimension = 50
embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, len(word_to_idx), oov_terms)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

NameError: name 'word_to_idx' is not defined

# Task 5: Training and Evaluation

You are now tasked to train and evaluate the Baseline, Model 1, and Model 2.

### Instructions

* Train **all** models on the train set.
* Evaluate **all** models on the validation set.
* Compute metrics on the validation set.
* Pick **at least** three seeds for robust estimation.
* Pick the **best** performing model according to the observed validation set performance.