<a href="https://colab.research.google.com/github/Alan-alan-Lin/2021_NLP_Lab1/blob/main/309706019_%E6%9E%97%E6%98%B1%E7%87%8A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# For debugging
import pdb
# For checking progress
from tqdm import tqdm
# For loading data
import pandas as pd
# For tokenizaton
import nltk
from nltk import word_tokenize, sent_tokenize
nltk.download('punkt')
# For building n-gram model
from collections import Counter, namedtuple
import numpy as np
# For pos tagging
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

**Part 1. Data Preprocessing**

show the top-10 common words and their counts before/after preprocessing

**Functions and Classes**

Remove the punctuations

Lower the cases

In [None]:
def get_corpus():
  """ Reads and formats the corpus.

  Returns:
    corpus (list[str]):
      A list of sentences in the corpus.
  """
  df = pd.read_csv('https://raw.githubusercontent.com/yunzhusong/NLP109/main/lab1_data.csv')
  corpus = df.content.to_list()
  return corpus

In [None]:
def preprocess(documents):
  """ Preprocesses the corpus.
  
  Args:
    documents (list[str]):
      A list of sentences in the corpus.
  Returns:
    cleaned_documents (list[str]):
      A list of cleaned sentences in the corpus.
  """
  cleaned_documents = []
  punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~”'''
  for doc in documents:
    # Tokenizes the sentence
    sents = sent_tokenize(doc)
    for sent in sents:
      #pdb.set_trace() # delete this line for the final version
      # Removes the punctuations, hint: recursively remove in character level
      for c in punc: 
        sent=sent.replace(c,"") 
      # Lowers the case,  
      #[TODO]
      for i in range(len(sent)):
        sent[i].lower() 
      cleaned_documents.append(sent)

  #print(cleaned_documents[:5])
  return cleaned_documents

In [None]:
# Compute word frequency
def get_vocab(documents):
  """ Gets the vocabulary from the corpus.
  
  Args:
    documents (list[str]):
      A list of sentences in the corpus
  Returns:
    vocabulary (collections.Counter)
  """
  vocabulary = Counter()

  for doc in tqdm(documents):
    tokens = word_tokenize(doc)
    vocabulary.update(tokens)

  return vocabulary

**Executions**
1. Show the top-10 common words and their counts before/after preprocessing

In [None]:
# Read data
raw_documents = get_corpus()

# Build vocabulary
vocab = get_vocab(raw_documents).most_common(10)
print('\n Before preprocessing:', vocab)

# Build vocabulary after preprocessing
documents = preprocess(raw_documents)
vocab = get_vocab(documents).most_common(10)
print('\n After preprocesing:', vocab)

100%|██████████| 100000/100000 [00:20<00:00, 4771.88it/s]



 Before preprocessing: [('.', 85947), ('the', 49772), (',', 39728), ('to', 34407), ('!', 33580), ('a', 28765), ('is', 26339), ('?', 24057), ('and', 22890), ('of', 22542)]


100%|██████████| 175323/175323 [00:17<00:00, 9749.62it/s]


 After preprocesing: [('the', 49703), ('to', 34389), ('a', 28752), ('is', 25735), ('and', 22789), ('of', 22529), ('you', 21192), ('I', 15291), ('in', 15027), ('that', 14622)]





**Part 2. N-Gram Model and POS Tagging**

1.Build 2-gram / 4-gram model by processed dataset

2.Show the top-5 probable next words and their probability after initial token `<s>` by 2-gram model

3.Generate a sentence with 2-gram model and find the POS taggings

4.Generate a sentence with 4-gram model and find the POS taggings

**Functions and Classes**

In [None]:
class Ngram_model(object):
  """ Ngram model implementation.

  Attributes:
    n (int):
      The number of grams to be considered.
    model (dict):
      The ngram model.
  """
  def __init__(self, documents, N=2):
    self.n = N
    self.model = self.get_ngram_model(documents)

  def get_ngram_model(self, documents):
    N = self.n
    ngram_model = dict()
    full_grams = list()
    grams = list()
    #split_words = list()
    Word = namedtuple('Word', ['word', 'prob'])
    # for each sentence in documents
    for doc in documents:
      # Tokenizes to words
      token = nltk.word_tokenize(doc)
      # Append (N-1) start tokens '<s>' and an end token '<\s>'
      if N == 2:
        split_words = ['<s>'] + list(token) + ['<\s>']
        # Calculates numerator (construct list with full grams, i.e., N-grams)   計算分子
        [full_grams.append(tuple(split_words[i:i+N])) for i in range(len(split_words)-N+1)]
        # Calculate denominator (construct list with grams, i.e., (N-1)-grams)   計算分母
        [grams.append(tuple(split_words[i:i+N-1])) for i in range(len(split_words)-N+2)]
      elif N == 4:
        split_words = ['<s>'] + ['<s>'] + ['<s>'] + list(token) + ['<\s>'] 
        # Calculates numerator (construct list with full grams, i.e., N-grams)   計算分子
        [full_grams.append(tuple(split_words[i:i+N])) for i in range(len(split_words)-N+1)]
        # Calculate denominator (construct list with grams, i.e., (N-1)-grams)   計算分母
        [grams.append(tuple(split_words[i:i+N-1])) for i in range(len(split_words)-N+2)]
    # Count the occurence frequency of each gram
    # Take 2-gram model as example:
    #   full_grams -> list[('a', 'gram'),('other', 'gram'), ...]
    #   grams -> list[('a'), ('other'), ('gram'), ...]
    #   full_gram_counter -> dict{('a', 'gram'):frequency_1, ('other','gram'):frequency_2, ...}
    #   gram_counter -> dict{('a'):frequency_1, ('gram'):frequency_2, ...}
    full_gram_counter = Counter(full_grams)
    gram_counter = Counter(grams)

    # Build model
    # Take 2-gram model as example:
    #   { '<s>': [tuple(word='i', prob=0.6), tuple(word='the', prob=0.2), ...],
    #   'i': [tuple(word='am', prob=0.7), tuple(word='want', prob=0.1), ...],
    #    ... }
    for key in full_gram_counter:
      word = ''.join(key[:N-1])

      if word not in ngram_model:
        ngram_model.update({word: set()})

      # next_word_prob -> float
      next_word_prob = full_gram_counter[key] / gram_counter[key[:N-1]]
      w = Word(key[-1], next_word_prob)
      ngram_model[word].add(w)

    # Sort the result by frequency
    for word, ng in ngram_model.items():
      ngram_model[word] = sorted(ng, key=lambda x: x.prob, reverse=True)

    return ngram_model


  def predict_sent(self, text=None, max_len=30):
    """ Predicts a sentence with the ngram model.

    Args:
      text (string or list[string])
    Returns:
      A prediction string.
    """

    N = self.n
    backup_tokens = ['<s>']*(N-1)
    if not text:
      tokens = backup_tokens
      output = []

    elif type(text)==str:
      tokens = backup_tokens + text.split(' ')
      tokens = tokens[-(N-1):]
      if not self.check_existence(tokens):
        return 
      output = tokens

    elif type(text) == list:
      tokens = backup_tokens + text
      tokens = tokens[-(N-1):]
      if not self.check_existence(tokens):
        return
      output = tokens

    else:
      print('[Error] the input text must be string or list of string')
      return

    for i in range(max_len):
      possible_words = list(self.model[''.join(tokens)])
      probs = [word.prob for word in possible_words]
      words = [word.word for word in possible_words]
      next_word = np.random.choice(words, 1, p=probs)[0]
      tokens = tokens[1:] + [next_word]

      if next_word == '<\\s>':
        break

      output.append(next_word)
    return ' '.join(output)

  def predict_next(self, text=None, top=5):
    """ Predicts next word with the ngram model.

    Args:
      text (string or list[string])

    Returns:
      possible_next_words (list[namedtuple]):
        A list of top few possible next words.
    """

    N = self.n
    backup_tokens = ['<s>']*(N-1)
    if not text:
      tokens = backup_tokens

    elif type(text)==str:
      tokens = backup_tokens + text.split(' ')
      tokens = tokens[-(N-1):]
      if not self.check_existence(tokens):
        return 

    elif type(text) == list:
      tokens = backup_tokens + text
      tokens = tokens[-(N-1):]
      if not self.check_existence(tokens):
        return
    else:
      print('[Error] the input text must be string or list of string')

    possible_next_words = self.model[''.join(tokens)][:top]
    possible_next_words = [(word.word, word.prob) for word in possible_next_words]

    return possible_next_words

  def check_existence(self, tokens):
    if not ''.join(tokens) in self.model.keys():
      print('[Error] the input text {} not in the vocabulary'.format(tokens))
      return False
    else:
      return True

**Executions**
1. Build 2-gram/4-gram model by processed dataset

In [None]:
twogram = Ngram_model(documents, N=2)
fourgram = Ngram_model(documents, N=4)

2. Show the top-5 probable next words and their probability after initial token `<s>` by 2-gram model

In [None]:
output = twogram.predict_next(text='<s>', top=5)
print('Next word predictions of two gram model:', output)

Next word predictions of two gram model: [('I', 0.05075774427770458), ('<\\s>', 0.031182446113744346), ('The', 0.029613912607016762), ('You', 0.029477022410065994), ('They', 0.018040987206470342)]


3. Generate a sentence with 2-gram model and find the POS taggings

In [None]:
output = twogram.predict_sent(max_len=30)
print('Generation results of two gram model:', output)
nltk.pos_tag(word_tokenize(output))

Generation results of two gram model: Your DigiAssets DAXUPByJRmEPi9zDCFJoErtQt8NiJa27Li


[('Your', 'PRP$'),
 ('DigiAssets', 'NNS'),
 ('DAXUPByJRmEPi9zDCFJoErtQt8NiJa27Li', 'VBP')]

4. Generate a sentence with 4-gram model and find the POS taggings

In [None]:
output = fourgram.predict_sent(max_len=30)
print('Generation results of four gram model: ', output)
nltk.pos_tag(word_tokenize(output))

Generation results of four gram model:  You nailed it Joy


[('You', 'PRP'), ('nailed', 'VBD'), ('it', 'PRP'), ('Joy', 'NNP')]