In [1]:
# !pip install pandas 
# !pip install numpy
# !pip install matplotlib
# !pip install tqdm



In [2]:
import os
import urllib.request
import zipfile
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
# Create documents dataframe: one DataFrame (columns: word, pos) per .dp file
treebank_dir = 'dependency_treebank/'
# os.listdir returns entries in arbitrary order; sort the file names so that the
# positional train/validation/test split is reproducible across machines.
dp_docs = sorted(file for file in os.listdir(treebank_dir) if file.endswith('.dp'))
dataframes = []

for file in tqdm(dp_docs):
    with open(os.path.join(treebank_dir, file), 'r') as f:
        lines = f.readlines()
    # Each line is split on tabs into word / pos / head.
    # NOTE(review): lines with fewer than three fields (presumably the blank
    # sentence separators) end up with a None pos -- confirm this is intended.
    data = [line.split('\t') for line in lines]
    doc_df = pd.DataFrame(data, columns=['word', 'pos', 'head'])
    # drop the last column (head); drop() returns a new frame, so later
    # in-place column assignments do not operate on a slice of doc_df
    dataframes.append(doc_df.drop(columns=['head']))

df = pd.DataFrame({'Dataframes': dataframes})
print(df.head(5))
print(df['Dataframes'][0][0:5])
print(df.shape)

  0%|          | 0/199 [00:00<?, ?it/s]

100%|██████████| 199/199 [00:00<00:00, 399.16it/s]


                                          Dataframes
0              word   pos
0         Pierre   NNP
...
1              word  pos
0        Rudolph  NNP
1 ...
2           word   pos
0           A    DT
1     ...
3               word  pos
0          Yields  NNS
...
4                 word   pos
0              J.P. ...
     word  pos
0  Pierre  NNP
1  Vinken  NNP
2       ,    ,
3      61   CD
4   years  NNS
(199, 1)


In [4]:
# Positional split of the document Series: first 100 documents for training,
# the next 50 for validation, everything from position 150 up to 200 for testing
documents = df['Dataframes']
train, val, test = documents[0:100], documents[100:150], documents[150:200]

for split in (train, val, test):
    print(split.shape)

(100,)
(50,)
(49,)


In [5]:
# Text preprocessing
def to_lower_case(df, docs=0):
    """Lower-case the ``word`` column of every document, in place.

    Args:
        df: iterable of per-document DataFrames (for example a pandas Series
            slice such as ``train``, ``val`` or ``test``), each holding a
            string ``word`` column.
        docs: kept only for backward compatibility and ignored. It used to be
            the first index label of the split, but because it was combined
            with ``len(df)`` as the loop bound, the loop range was empty for
            any split that did not start at 0, and such splits were silently
            left untouched.

    Returns:
        None. The documents are modified in place.
    """
    # Iterate over the documents themselves instead of index labels, so the
    # function works for every split regardless of where its labels start.
    for document in df:
        document['word'] = document['word'].str.lower()

# Lower-case the words of each split. The second argument is the position of
# the split's first document in the full document list (0, 100, 150).
to_lower_case(train, 0)
to_lower_case(val, 100)
to_lower_case(test, 150)
# Show the first five tokens of the first training document as a sanity check
print(train[0][0:5])

     word  pos
0  pierre  NNP
1  vinken  NNP
2       ,    ,
3      61   CD
4   years  NNS


In [6]:
# !pip install gensim



In [7]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """Load the ``glove-wiki-gigaword-<dimension>`` model through gensim's downloader.

    Args:
        embedding_dimension: size of the GloVe vectors to load.

    Returns:
        Whatever ``gloader.load`` returns for the requested model name.

    Raises:
        ValueError: re-raised from ``gloader.load`` after a hint about the
            supported dimensions has been printed.
    """
    model_name = f"glove-wiki-gigaword-{embedding_dimension}"

    try:
        return gloader.load(model_name)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Glove: 50, 100, 200, 300")
        raise e



In [8]:
# Supported GloVe dimensions: 50, 100, 200, 300
# The dimension chosen here must match the embedding_dimension used when the
# embedding matrix is built.
embedding_model = load_embedding_model(embedding_dimension=50)

In [9]:
from typing import List, Dict

def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors, word_listing: List[str]):
    """Return the out-of-vocabulary words of ``word_listing``.

    Args:
        embedding_model: object exposing a ``key_to_index`` mapping whose keys
            form the embedding vocabulary.
        word_listing: words to check; duplicates are collapsed.

    Returns:
        A list with each word of ``word_listing`` that is not a key of
        ``embedding_model.key_to_index``. The result comes from a set, so its
        order is unspecified.
    """
    known_words = set(embedding_model.key_to_index.keys())
    return list(set(word_listing) - known_words)

In [10]:
# Build the word2index and index2word dictionaries
def build_word_index(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                     word_listing: List[str]) -> Tuple[Dict[str, int], Dict[int, str]]:
    """Assign each word its position in ``word_listing`` and build both lookups.

    Args:
        embedding_model: not used by this function; kept so existing callers
            keep working.
        word_listing: words to index, in the order that defines their indices.

    Returns:
        A ``(word2index, index2word)`` pair. ``index2word`` maps every position
        to the word found there. ``word2index`` maps every word to its position;
        if a word occurs more than once, its last position wins.
    """
    index2word = dict(enumerate(word_listing))
    # Later duplicates overwrite earlier ones, matching a sequential fill.
    word2index = {word: index for index, word in index2word.items()}

    return word2index, index2word

# Build the embedding matrix
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors, 
                           word2index: Dict[str, int], 
                           embedding_dimension: int = 50) -> np.ndarray:
    """Stack the embedding vector of every indexed word into one matrix.

    Args:
        embedding_model: object whose ``get_vector(word)`` returns the vector of
            a word and raises ``KeyError`` for an unknown word.
        word2index: mapping from word to its row in the matrix.
        embedding_dimension: number of columns of the matrix.

    Returns:
        Array of shape ``(len(word2index), embedding_dimension)``. Rows of words
        the model does not know are left as zeros.
    """
    matrix_shape = (len(word2index), embedding_dimension)
    embedding_matrix = np.zeros(matrix_shape)

    for token, row in word2index.items():
        try:
            vector = embedding_model.get_vector(token)
        except KeyError:
            # Unknown token: keep its all-zero row
            continue
        embedding_matrix[row] = vector

    return embedding_matrix

# Index every key of the embedding model's vocabulary, then build the matrix whose
# row i holds the vector of the word with index i. embedding_dimension matches
# the dimension the model was loaded with (50).
word2index, index2word = build_word_index(embedding_model, list(embedding_model.key_to_index.keys()))
embedding_matrix = build_embedding_matrix(embedding_model, word2index, embedding_dimension=50)

In [11]:
# Build the POS2index and index2POS dictionaries
def build_POS_index(df):
    """Number the distinct POS tags in order of first appearance.

    Args:
        df: iterable of documents; each document's ``'pos'`` entry is an
            iterable of tags.

    Returns:
        A ``(pos2index, index2pos)`` pair of mutually inverse dictionaries.
        Indices start at 0 and follow the order in which tags are first seen.
    """
    pos2index = {}
    for document in df:
        for tag in document['pos']:
            # setdefault only inserts unseen tags; the next free index is the
            # current size of the dictionary
            pos2index.setdefault(tag, len(pos2index))

    index2pos = {index: tag for tag, index in pos2index.items()}
    return pos2index, index2pos

# The tag inventory is built from the training split only
pos2index, index2pos = build_POS_index(train)
print(pos2index)

{'NNP': 0, ',': 1, 'CD': 2, 'NNS': 3, 'JJ': 4, 'MD': 5, 'VB': 6, 'DT': 7, 'NN': 8, 'IN': 9, '.': 10, None: 11, 'VBZ': 12, 'VBG': 13, 'CC': 14, 'VBD': 15, 'VBN': 16, 'RB': 17, 'TO': 18, 'PRP': 19, 'RBR': 20, 'WDT': 21, 'VBP': 22, 'RP': 23, 'PRP$': 24, 'JJS': 25, 'POS': 26, '``': 27, 'EX': 28, "''": 29, 'WP': 30, ':': 31, 'JJR': 32, 'WRB': 33, '$': 34, 'NNPS': 35, 'WP$': 36, '-LRB-': 37, '-RRB-': 38, 'PDT': 39, 'RBS': 40, 'FW': 41, 'UH': 42, 'SYM': 43, 'LS': 44, '#': 45}


In [12]:
# Apply the word2index and POS2index dictionaries to the dataframes
def apply_word2index(df, word2index, unk_token='9999'):
    """Replace every word by its vocabulary index, in place.

    Args:
        df: iterable of per-document DataFrames (for example a pandas Series
            slice), each with a ``word`` column.
        word2index: mapping from word to integer index.
        unk_token: vocabulary entry whose index stands in for words missing
            from ``word2index``. Defaults to ``'9999'``, the placeholder this
            notebook has always used.

    Returns:
        The same ``df`` object, with the ``word`` column of each document
        overwritten.

    Raises:
        KeyError: if a word is missing from ``word2index`` and ``unk_token`` is
            missing as well.
    """
    # Iterate over the documents themselves: looking them up as df[0..len-1]
    # is label-based on a Series and fails for splits whose labels start above 0.
    for document in df:
        document['word'] = document['word'].apply(
            lambda token: word2index[token] if token in word2index else word2index[unk_token]
        )
    return df

def apply_POS2index(df, pos2index):
    """Replace every POS tag by its integer index, in place.

    Args:
        df: iterable of per-document DataFrames (for example a pandas Series
            slice), each with a ``pos`` column.
        pos2index: mapping from POS tag to integer index.

    Returns:
        The same ``df`` object, with the ``pos`` column of each document
        overwritten.

    Raises:
        KeyError: if a document contains a tag that is not in ``pos2index``.
    """
    # Iterate over the documents themselves: looking them up as df[0..len-1]
    # is label-based on a Series and fails for splits whose labels start above 0.
    for document in df:
        document['pos'] = document['pos'].apply(lambda tag: pos2index[tag])
    return df

# Both helpers overwrite the documents in place and return the object they were
# given, so train_indexed refers to the same Series as train.
train_indexed = apply_word2index(train, word2index)
train_indexed = apply_POS2index(train_indexed, pos2index)
print(train_indexed[0][0:5])

     word  pos
0    5029    0
1  173714    0
2       1    1
3    4978    2
4      82    3


In [13]:
# Apply embeddings matrix to the dataframes
def apply_embeddings(df, embedding_matrix):
    """Replace every word index by its row of the embedding matrix, in place.

    Args:
        df: iterable of per-document DataFrames (for example a pandas Series
            slice) whose ``word`` column already holds integer row indices.
        embedding_matrix: array indexed by those integers; one row per word.

    Returns:
        The same ``df`` object, with each ``word`` cell replaced by the
        corresponding matrix row.
    """
    # Iterate over the documents themselves: looking them up as df[0..len-1]
    # is label-based on a Series and fails for splits whose labels start above 0.
    for document in df:
        document['word'] = document['word'].apply(lambda row: embedding_matrix[row])
    return df

# The word column of train must already hold integer indices at this point.
# apply_embeddings returns the object it was given, so train_embedded refers
# to the same Series as train.
train_embedded = apply_embeddings(train, embedding_matrix)
print(train_embedded[0][0:5])

                                                word  pos
0  [0.2356799989938736, 0.3963800072669983, -0.60...    0
1  [0.08332999795675278, -0.2219800055027008, 0.3...    0
2  [0.013441000133752823, 0.23681999742984772, -0...    1
3  [-0.3661099970340729, 0.38075000047683716, 1.5...    2
4  [0.16962000727653503, 0.4343999922275543, -0.0...    3


### Instructions

* **Baseline**: implement a Bidirectional LSTM with a Dense layer on top.
* You are **free** to experiment with hyper-parameters to define the baseline model.

* **Model 1**: add an additional LSTM layer to the Baseline model.
* **Model 2**: add an additional Dense layer to the Baseline model.

* **Do not mix Model 1 and Model 2**. Each model has its own instructions.

**Note**: if a document contains many tokens, you are **free** to split them into chunks or sentences to define your mini-batches.

In [14]:
# !pip install torch
# !pip install --upgrade torch torchvision
# !pip install --upgrade typing-extensions

Requirement already up-to-date: torch in c:\users\39328\anaconda3\lib\site-packages (2.1.0)
Requirement already up-to-date: torchvision in c:\users\39328\anaconda3\lib\site-packages (0.16.0)
Requirement already up-to-date: typing-extensions in c:\users\39328\anaconda3\lib\site-packages (4.8.0)


In [15]:
# Baseline model definition
import torch

class BaselineModel(torch.nn.Module):
    """Baseline tagger: a bidirectional LSTM with a Dense (linear) layer on top.

    Args:
        input_size: number of features of each input token vector.
        hidden_size: hidden units per LSTM direction; the linear layer receives
            ``2 * hidden_size`` features because both directions are concatenated.
        output_size: number of output classes per token.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstmlayer = torch.nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.linear = torch.nn.Linear(hidden_size * 2, output_size)
        # Normalize over the last (class) axis. With batch_first=True a batched
        # output is (batch, seq, classes), so dim=1 would normalize across the
        # tokens of a sequence instead of across the classes of each token.
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, x):
        """Return per-token class probabilities.

        Args:
            x: float tensor whose last axis has ``input_size`` features; batched
                input is laid out as (batch, seq, input_size).

        Returns:
            Tensor shaped like ``x`` except that the last axis has
            ``output_size`` entries summing to 1.
        """
        # NOTE(review): if training uses torch.nn.CrossEntropyLoss, it expects
        # unnormalized scores rather than probabilities -- confirm against the
        # training loop, which is not part of this section.
        out, _ = self.lstmlayer(x)
        out = self.linear(out)
        out = self.softmax(out)
        return out