# Assignment 1

This Jupyter Notebook document is our implementation of Assignment 1. 


In [1]:
# !pip install pandas 
# !pip install numpy
# !pip install matplotlib



In [2]:
# !pip install wget



# Importing Libraries

    

In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import zipfile
import os

# Task 1: Data Loading and Splitting
* **Download** the corpus.
* **Encode** the corpus into a pandas.DataFrame object.
* **Split** it in training, validation, and test sets.

[Penn TreeBank corpus](https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip).

In [4]:
# Download the data from Penn TreeBank corpus 

address = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'
urllib.request.urlretrieve(address, 'dependency_treebank.zip')
# Unzip the data
with zipfile.ZipFile('dependency_treebank.zip', 'r') as zip_ref:
    zip_ref.extractall()


In [5]:
# Encode the corpus in a dataframe object
list_train = [] # documents 1-100
list_test = [] # documents 101-150
list_val = [] # documents 151-199
for filename in os.listdir('dependency_treebank/'):
    if filename.endswith('.dp'):
        with open('dependency_treebank/' + filename, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if int(filename[4:8]) <= 100:
                    list_train.append(line.split()[:-1])
                elif int(filename[4:8]) <= 150:
                    list_test.append(line.split()[:-1])
                elif int(filename[4:8]) <= 199:
                    list_val.append(line.split()[:-1])
         
print(len(list_train))
print(len(list_test))
print(len(list_val))

print(list_train[0:20])

# create a dataframe object
df_train = pd.DataFrame(list_train)
df_test = pd.DataFrame(list_test)
df_val = pd.DataFrame(list_val)

print(df_train.shape)
print(df_test.shape)
print(df_val.shape)

print(df_train.head(5))
print(df_test.head(5))
print(df_val.head(5))

print(df_train.values[:,0])

49219
32432
16148
[['Pierre', 'NNP'], ['Vinken', 'NNP'], [',', ','], ['61', 'CD'], ['years', 'NNS'], ['old', 'JJ'], [',', ','], ['will', 'MD'], ['join', 'VB'], ['the', 'DT'], ['board', 'NN'], ['as', 'IN'], ['a', 'DT'], ['nonexecutive', 'JJ'], ['director', 'NN'], ['Nov.', 'NNP'], ['29', 'CD'], ['.', '.'], [], ['Mr.', 'NNP']]
(49219, 2)
(32432, 2)
(16148, 2)
        0    1
0  Pierre  NNP
1  Vinken  NNP
2       ,    ,
3      61   CD
4   years  NNS
              0    1
0             A   DT
1  House-Senate  NNP
2    conference   NN
3      approved  VBD
4         major   JJ
           0    1
0  Intelogic  NNP
1      Trace  NNP
2       Inc.  NNP
3          ,    ,
4        San  NNP
['Pierre' 'Vinken' ',' ... 'has' 'faced' '.']


# Task 2: Text encoding

To train a neural POS tagger, you first need to encode text into numerical format.

### Instructions

* Embed words using **GloVe embeddings**.
* You are **free** to pick any embedding dimension.
* [Optional] You are free to experiment with text pre-processing: **make sure you do not delete any token!**

In [6]:
# !pip install gensim

In [7]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    download_path = ""
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
        
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model



In [8]:
# Glove -> 50, 100, 200, 300
embedding_model = load_embedding_model(embedding_dimension=50)

# Task 3: Model definition

You are now tasked to define your neural POS tagger.

### Instructions

* **Baseline**: implement a Bidirectional LSTM with a Dense layer on top.
* You are **free** to experiment with hyper-parameters to define the baseline model.

* **Model 1**: add an additional LSTM layer to the Baseline model.
* **Model 2**: add an additional Dense layer to the Baseline model.

* **Do not mix Model 1 and Model 2**. Each model has its own instructions.

**Note**: if a document contains many tokens, you are **free** to split them into chunks or sentences to define your mini-batches.

In [9]:
# !pip install torch



In [10]:
# !pip install --upgrade torch torchvision
# !pip install --upgrade typing-extensions

Requirement already up-to-date: torch in c:\users\39328\anaconda3\lib\site-packages (2.1.0)
Requirement already up-to-date: torchvision in c:\users\39328\anaconda3\lib\site-packages (0.16.0)
Requirement already up-to-date: typing-extensions in c:\users\39328\anaconda3\lib\site-packages (4.8.0)


In [11]:
import torch
import torch.nn as nn

class BidirectionalLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BidirectionalLSTM, self).__init__()
        
        # Bidirectional LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        # Dense layer for classification
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # Multiply by 2 for bidirectional
        
    def forward(self, x):
        # Forward pass through Bidirectional LSTM layer
        out, _ = self.lstm(x)
        
        # Get the output from the last time step (if you want to use the output from all time steps, modify this accordingly)
        out = out[:, -1, :]
        
        # Forward pass through Dense layer for classification
        out = self.fc(out)
        return out

In [12]:
from typing import List, Dict

def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors, word_listing: List[str]):
    
    embedding_vocabulary = set(embedding_model.key_to_index.keys())
    oov = set(word_listing).difference(embedding_vocabulary)
    return list(oov)

In [13]:
oov_terms = check_OOV_terms(embedding_model, df_train.values[:,0])
oov_percentage = float(len(oov_terms)) * 100 / len(df_train.values[:,0])

print(oov_percentage)

4.768483715638269


In [14]:
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int,
                           word_to_idx: Dict[str, int],
                           vocab_size: int,
                           oov_terms: List[str]) -> np.ndarray:
    
    embedding_matrix = np.zeros((vocab_size, embedding_dimension), dtype=np.float32)
    for word, idx in tqdm(word_to_idx.items()):
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [15]:
embedding_dimension = 50
embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, len(word_to_idx), oov_terms)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

NameError: name 'word_to_idx' is not defined