In [1]:
#===============================================================================
#
#           FILE: my_classifier_1-2_ntb.py 
#         AUTHOR: Bianca Ciobanica
#          EMAIL: bianca.ciobanica@student.uclouvain.be
#
#           BUGS: 
#        VERSION: 3.10.6
#        CREATED: 20-10-2023 
#
#===============================================================================
#    DESCRIPTION:  
#    
#          USAGE: 
#===============================================================================

In [2]:
import pandas as pd
import os
import sys
import nltk
import itertools
from collections import Counter
from nltk.tokenize import WordPunctTokenizer
from nltk.lm import Vocabulary, MLE, NgramCounter, Laplace
from nltk.lm.util import log_base2
from nltk.util import ngrams
from nltk.lm.preprocessing import pad_both_ends, pad_sequence
from nltk.probability import FreqDist, ConditionalFreqDist, MLEProbDist, ConditionalProbDist

In [3]:
# Load the training corpus from the data directory.
# The relative path below is resolved against the current working directory;
# the commented-out lines are a script-style alternative (directory derived
# from __file__, file name taken from the first command-line argument).
#absolute_path = os.path.dirname(__file__)
#full_path = os.path.join(absolute_path, "data")

# filename = sys.argv[1] # give argument
df_corpus = pd.read_csv("data/train.csv")

In [4]:
def remove_html_tags(text):
  """Return `text` with every `<...>` span removed.

  A span runs from a `<` to the nearest following `>` (non-greedy). Because
  `.` does not match a newline, a span never crosses a line break. Every
  match is replaced by the empty string.
  """
  import re
  return re.sub(r'<.*?>', '', text)

In [5]:
# Strip HTML markup from each question body, then split it into tokens.
# NOTE: the 'Body' column is overwritten in place with its tag-free version.
# A single WordPunctTokenizer is built once and its bound `tokenize` method is
# applied to every row, instead of constructing a new tokenizer per row.
df_corpus['Body'] = df_corpus['Body'].apply(remove_html_tags)
df_corpus['Body_tokenized'] = df_corpus['Body'].apply(WordPunctTokenizer().tokenize)

In [6]:
# Flatten the per-document token lists into one corpus-wide list of tokens.
tokenized_corpus = list(itertools.chain.from_iterable(df_corpus['Body_tokenized']))

# Cutoff handed to the vocabulary as `unk_cutoff`.
unk_threshold = 3
vocab_data = Vocabulary(tokenized_corpus, unk_cutoff=unk_threshold)

# Raw frequency of every token in the flattened corpus.
fdist = FreqDist(tokenized_corpus)

In [7]:
# Number of entries in the vocabulary built from the training tokens.
vocab_len = len(vocab_data)
#print(vocab['<s>'])
print(vocab_len)

35549


In [8]:
#fdist.plot(20, cumulative=True)

In [9]:
# Look at the tail of the sorted vocabulary: the last ten entries, each paired
# with the count the vocabulary reports for it.
sorted_vocab = sorted(vocab_data)
last10_tokens = list(map(lambda word: (word, vocab_data[word]), sorted_vocab[-10:]))

print(last10_tokens)

[('~', 178), ('~$', 4), ('~*', 3), ('~/', 12), ('~/.', 20), ('~=', 5), ('~[', 113), ('~]#', 3), ('~~', 3), ('\x7f', 5)]


In [10]:
# Ten most frequent tokens of the training corpus with their raw counts.
print(fdist.most_common(10))

[('.', 166257), (',', 57372), (':', 52374), ('(', 52179), ('the', 45297), ('I', 40538), (';', 38359), ('to', 36449), ('=', 35217), ('-', 33518)]


In [11]:
# Out-of-vocabulary (OOV) rate: percentage of training tokens that are not
# members of the vocabulary.
total_tokens = len(tokenized_corpus)
words_in_vocab = sum(vocab_data[entry] for entry in vocab_data)
counts = total_tokens - words_in_vocab  # token occurrences not accounted for by vocabulary entries

#print((counts / len(tokenized_corpus) ) * 100)

# Each membership test yields a bool, so summing them counts the OOV tokens.
oov_counts = sum(token not in vocab_data for token in tokenized_corpus)
oov_proportion = (oov_counts / total_tokens) * 100
rounded_rate = round(oov_proportion, 3)

print("Oov rate : ", rounded_rate, "%")
#print(vocab_data['<UNK>'])

Oov rate :  3.242 %


In [12]:
# Padded bigrams of the training corpus, kept document by document.
# Tokens are used as-is here: nothing is replaced by an unknown-word label.
# print(df_corpus['Body_tokenized'][8198])

n_order = 2
# One list of bigram tuples per document; padding symbols are added at both ends.
all_bigrams = list(map(
    lambda tokens: list(ngrams(pad_both_ends(tokens, n=n_order), n_order)),
    df_corpus['Body_tokenized'],
))
padded_bigram_at_8198 = all_bigrams[8198]
print(padded_bigram_at_8198)

[('<s>', 'How'), ('How', 'to'), ('to', 'rename'), ('rename', 'a'), ('a', 'pane'), ('pane', 'in'), ('in', 'tmux'), ('tmux', '?'), ('?', '</s>')]


In [13]:
# Store a padded copy of every tokenized document (padding symbols at both
# ends), then flatten those copies into one token list for the vocabulary.
df_corpus['Body_tokenized_padded'] = df_corpus['Body_tokenized'].apply(
    lambda tokens: list(pad_both_ends(tokens, n=n_order))
)

tokenized_padded_corpus = list(
    itertools.chain.from_iterable(df_corpus['Body_tokenized_padded'])
)  # flattened corpus

In [14]:
# print(len(df_corpus['Body_tokenized_padded']))
# print(len(tokenized_corpus))
# Rebuild the vocabulary from the padded corpus so the padding symbols are
# counted too. This rebinds `vocab_data` for every cell that runs afterwards.
vocab_data = Vocabulary(tokenized_padded_corpus, unk_cutoff=unk_threshold)

In [15]:
# ~~~~~~~ get Most likely starter with MLE for bigram model ~~~~~~~
# Unsmoothed (maximum-likelihood) order-2 model, fitted on the padded bigrams.
padded_bigram_lm = MLE(order=2, vocabulary=vocab_data)
padded_bigram_lm.fit(all_bigrams)

In [16]:
# print("vocab_data length = ",len(padded_bigram_lm.vocab))
# print(padded_bigram_lm.vocab)
# print(padded_bigram_lm.counts)

In [17]:
# Build {token: probability} for the three tokens most likely to follow '<s>'.
# Every vocabulary entry is ranked by lm.score(token, ['<s>']); sorting on the
# negated score puts the highest probabilities first, and [:3] keeps the
# first three of that ranking. Probabilities are rounded to 2 decimals.
most_likely_3_starter = {
    starter: round(padded_bigram_lm.score(starter, ['<s>']), 2)
    for starter in sorted(
        padded_bigram_lm.vocab,
        key=lambda candidate: -padded_bigram_lm.score(candidate, ['<s>']),
    )[:3]
}
print("most likely 3 starters after <s> with MLE ")
print(most_likely_3_starter)

most likely 3 starters after <s> with MLE 
{'I': 0.51, 'i': 0.04, 'In': 0.02}


In [18]:
# ~~~~~~~ get Most likely starter with Laplace for bigram model ~~~~~~~
# Laplace-smoothed order-2 model over the same vocabulary and training bigrams.
laplace_smoothed_lm = Laplace(order=2, vocabulary=vocab_data)
laplace_smoothed_lm.fit(all_bigrams)

#print(laplace_smoothed_lm.counts)
# Show the model's vocabulary summary and its size.
print(laplace_smoothed_lm.vocab)
print(len(laplace_smoothed_lm.vocab))

<Vocabulary with cutoff=3 unk_label='<UNK>' and 35551 items>
35551


In [19]:
# Same ranking as for the MLE model, using the Laplace-smoothed scores:
# sort the vocabulary by descending probability of following '<s>' (negated
# score as key), keep the first three, and round to 2 decimals.
most_likely_3_starter_Laplace = {
    starter: round(laplace_smoothed_lm.score(starter, ['<s>']), 2)
    for starter in sorted(
        laplace_smoothed_lm.vocab,
        key=lambda candidate: -laplace_smoothed_lm.score(candidate, ['<s>']),
    )[:3]
}
print("most likely 3 starters after <s> with Laplace ")
print(most_likely_3_starter_Laplace)

most likely 3 starters after <s> with Laplace 
{'I': 0.14, 'i': 0.01, 'In': 0.01}


In [20]:
# ~~~~~ perplexity test on test set ~~~~~
test_corpus = pd.read_csv("data/test.csv")  # load test set
# Preprocess with the same steps as the training set: strip HTML, tokenize, pad.
# NOTE: the 'Body' column is overwritten in place with its tag-free version.
# A single WordPunctTokenizer is built once and reused for every row instead
# of constructing a new tokenizer per row.
test_corpus['Body'] = test_corpus['Body'].apply(remove_html_tags)
test_corpus['Body_tokenized'] = test_corpus['Body'].apply(WordPunctTokenizer().tokenize)
test_corpus['Body_tokenized_padded'] = test_corpus['Body_tokenized'].apply(
    lambda tokens: list(pad_both_ends(tokens, n=n_order)))

In [21]:
# n-gram tuples (order `n_order`) for each already-padded test document,
# one list per document.
test_corpus_bigrams = list(map(
    lambda padded_tokens: list(ngrams(padded_tokens, n_order)),
    test_corpus['Body_tokenized_padded'],
))

In [22]:
def test_perplexity(model, test_data):
    log_prob_res = 0.0
    m = 0
    for i in range(1, len(test_data)): # for each question
        for bigram in test_data[i]:    # for each token
            log_prob_res += model.logscore(bigram[1], [bigram[0]])
            #m += 1
        
    ll = log_prob_res / m #log likelihood
    perplexity = pow(2.0, -ll)
    
    return round(perplexity,3)
        
# Evaluate the Laplace-smoothed bigram model on the test-set bigrams and report it.
laplace_test_perplexity = test_perplexity(laplace_smoothed_lm, test_corpus_bigrams)
print("laplace smoothed bigram model perplexity : ", laplace_test_perplexity)

ZeroDivisionError: float division by zero