<a href="https://colab.research.google.com/github/EdoardoMaines/NLU-Project/blob/main/NLU_MyProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import collections
import re
import torch as d2l
import nltk
from nltk.corpus.reader import ConllCorpusReader
from nltk.lm.vocabulary import Vocabulary


In [17]:
#OPEN FILES
def readFile(path):
  """Read a text file and return its lines as a list of strings.

  Args:
    path: Location of the file to open (text mode, default encoding).

  Returns:
    One string per line, in file order. Line terminators are kept, so
    every entry except possibly the last ends with a newline.
  """
  # The context manager closes the file even if reading fails part-way.
  with open(path) as handle:
    return handle.readlines()

In [25]:
#SPLIT IN WORDS
def tokenize(lines):
  """Split each line into whitespace-separated word tokens.

  Args:
    lines: Iterable of strings, one per line of text.

  Returns:
    A list holding one token list per input line, in the same order.
    A line that is empty or only whitespace yields an empty list.
  """
  tokenized = []
  for text in lines:
    # split() with no argument splits on any run of whitespace and
    # drops the trailing newline.
    tokenized.append(text.split())
  return tokenized

In [29]:
#CLASS VOCABULARY
class Vocab:
  """Vocabulary mapping tokens to integer indices and back.

  Index 0 is always the unknown token ``'<unk>'``. Reserved tokens follow
  in the order given, then corpus tokens in order of decreasing frequency.

  Args:
    tokens: 1D list of tokens or 2D list of token lists; it is counted
      with ``count_corpus``. ``None`` is treated as an empty corpus.
    min_freq: Corpus tokens occurring fewer than this many times are
      left out of the vocabulary.
    reserved_tokens: Extra tokens placed right after ``'<unk>'``
      regardless of their corpus frequency. Any iterable is accepted;
      ``None`` means no reserved tokens.

  Attributes:
    idx_to_token: List of tokens; position is the token's index.
    token_to_idx: Dict mapping each token to its index.
  """

  def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
    if tokens is None:
      tokens = []
    if reserved_tokens is None:
      reserved_tokens = []

    # Sort according to frequencies, most frequent first. sorted() is
    # stable, so tokens with equal counts keep the counter's order.
    counter = count_corpus(tokens)
    self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                               reverse=True)

    # The index for the unknown token is 0
    self.idx_to_token = ['<unk>']
    self.token_to_idx = {'<unk>': 0}

    # Reserved tokens are added one by one rather than concatenated, so
    # a repeated entry (or an explicit '<unk>') cannot create a second
    # slot and leave token_to_idx disagreeing with idx_to_token / unk.
    for token in reserved_tokens:
      self._add(token)

    for token, freq in self._token_freqs:
      if freq < min_freq:
        # Frequencies are in descending order: everything after this
        # point is at least as rare, so stop here.
        break
      self._add(token)

  def _add(self, token):
    """Append `token` with the next free index unless it is already known."""
    if token not in self.token_to_idx:
      self.idx_to_token.append(token)
      self.token_to_idx[token] = len(self.idx_to_token) - 1

  def __len__(self):
    """Return the number of entries, including ``'<unk>'``."""
    return len(self.idx_to_token)

  def __getitem__(self, tokens):
    """Return the index of a token, or a list of indices for a list/tuple.

    Tokens missing from the vocabulary map to ``self.unk``. Nested
    lists/tuples are converted recursively.
    """
    if not isinstance(tokens, (list, tuple)):
      return self.token_to_idx.get(tokens, self.unk)
    return [self.__getitem__(token) for token in tokens]

  def to_tokens(self, indices):
    """Return the token for an index, or a list of tokens for a list/tuple.

    Raises:
      IndexError: If an index is outside the vocabulary.
    """
    if not isinstance(indices, (list, tuple)):
      return self.idx_to_token[indices]
    return [self.idx_to_token[index] for index in indices]

  @property
  def unk(self):  # Index for the unknown token
    return 0

  @property
  def token_freqs(self):  # (token, count) pairs, most frequent first
    return self._token_freqs


def count_corpus(tokens):
  """Count how often each token occurs.

  Args:
    tokens: Either a flat list of tokens or a list of token lists. The
      input is treated as nested when it is empty or when its first
      element is a ``list``; other element types (including tuples) are
      counted as tokens themselves.

  Returns:
    A ``collections.Counter`` mapping each token to its frequency.
  """
  nested = len(tokens) == 0 or isinstance(tokens[0], list)
  if not nested:
    return collections.Counter(tokens)

  # Flatten the list of token lists into one list of tokens before counting.
  flat = []
  for line in tokens:
    flat.extend(line)
  return collections.Counter(flat)

In [None]:
# Load the three PTB splits as lists of raw lines (paths under /content).
test_corpus = readFile('/content/ptb.test.txt')
train_corpus = readFile('/content/ptb.train.txt')
valid_corpus = readFile('/content/ptb.valid.txt')

# Tokenize the test split and preview its first 20 lines. Slicing, unlike
# indexing with range(20), does not raise IndexError on a shorter file.
token = tokenize(test_corpus)
for line_tokens in token[:20]:
  print(line_tokens)

In [30]:
# Build the vocabulary from the tokenized test split.
vocab = Vocab(tokens=token)
# Show the first ten (token, index) pairs. Note: in this dataset numbers
# are replaced by the placeholder token "N".
print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), ('the', 1), ('N', 2), ('of', 3), ('to', 4), ('a', 5), ('in', 6), ('and', 7), ("'s", 8), ('that', 9)]
