In [2]:
from google.colab import files
uploaded = files.upload()

Saving dontpatronizeme_pcl.tsv to dontpatronizeme_pcl.tsv


In [3]:
# import torch
# import torch.nn as nn
# from torchtext.data.utils import get_tokenizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# parse data from file
data_df = pd.read_csv('dontpatronizeme_pcl.tsv', sep='\t', header=None)
data_df.columns = ['id', 'identifier', 'label', 'country_code', 'text', 'target']
texts = data_df["text"].fillna("").tolist()



In [4]:
from collections import Counter, defaultdict
import math
import copy
import random
import operator

In [5]:
class NGramLM():
    def __init__(self, N):
        self.N = N
        self.vocab = set()
        self.data = []
        self.prob = {}
        self.counts = defaultdict(Counter)

    # For N = 1, the probability is stored in a dict   P = prob[next_word]
    # For N > 1, the probability is in a nested dict   P = prob[context][next_word]
    def train(self, vocab, data, smoothing_k=0):
        self.vocab = vocab
        self.data = data
        self.smoothing_k = smoothing_k

        if self.N == 1:
            self.counts = Counter(flatten(data))
            self.prob = self.get_prob(self.counts)
        else:
            self.vocab.add('<s>')
            counts = self.count_ngram()

            self.prob = {}
            for context, counter in counts.items():
                self.prob[context] = self.get_prob(counter)

    def count_ngram(self):
        counts = defaultdict(Counter)
        for sentence in self.data:
            sentence = (self.N - 1) * ['<s>'] + sentence
            for i in range(len(sentence)-self.N+1):
                context = sentence[i:i+self.N-1]
                context = " ".join(context)
                word = sentence[i+self.N-1]
                counts[context][word] += 1

        self.counts = counts
        return counts

    # normalize counts into probability(considering smoothing)
    def get_prob(self, counter):
        total = float(sum(counter.values()))
        k = self.smoothing_k

        prob = {}
        for word, count in counter.items():
            prob[word] = (count + k) / (total + len(self.vocab) * k)
        return prob

    def get_ngram_logprob(self, word, seq_len=1, context=""):
        if self.N == 1 and word in self.prob.keys():
            return math.log(self.prob[word]) / seq_len
        elif self.N > 1 and not self._is_unseen_ngram(context, word):
            return math.log(self.prob[context][word]) / seq_len
        else:
            # assign a small probability to the unseen ngram
            # to avoid log of zero and to penalise unseen word or context
            return math.log(1/len(self.vocab)) / seq_len

    def get_ngram_prob(self, word, context=""):
        if self.N == 1 and word in self.prob.keys():
            return self.prob[word]
        elif self.N > 1 and not self._is_unseen_ngram(context, word):
            return self.prob[context][word]
        elif word in self.vocab and self.smoothing_k > 0:
            # probability assigned by smoothing
            return self.smoothing_k / (sum(self.counts[context].values()) + self.smoothing_k*len(self.vocab))
        else:
            # unseen word or context
            return 0

    # In this method, the perplexity is measured at the sentence-level, averaging over all sentences.
    # Actually, it is also possible to calculate perplexity by merging all sentences into a long one.
    def perplexity(self, test_data):
        log_ppl = 0
        if self.N == 1:
            for sentence in test_data:
                for word in sentence:
                    log_ppl += self.get_ngram_logprob(word=word, seq_len=len(sentence))
        else:
            for sentence in test_data:
                for i in range(len(sentence)-self.N+1):
                    context = sentence[i:i+self.N-1]
                    context = " ".join(context)
                    word = sentence[i+self.N-1]
                    log_ppl += self.get_ngram_logprob(context=context, word=word, seq_len=len(sentence))

        log_ppl /= len(test_data)
        ppl = math.exp(-log_ppl)
        return ppl

    def _is_unseen_ngram(self, context, word):
        if context not in self.prob.keys() or word not in self.prob[context].keys():
            return True
        else:
            return False

    # generate the most probable k words
    def generate_next(self, context, k):
        context = (self.N-1) * '<s> ' + context
        context = context.split()
        ngram_context_list = context[-self.N+1:]
        ngram_context = " ".join(ngram_context_list)

        if ngram_context in self.prob.keys():
            candidates = self.prob[ngram_context]
            most_probable_words = sorted(candidates.items(), key=lambda kv: kv[1], reverse=True)
            for i in range(min(k, len(most_probable_words))):
                print(" ".join(context[self.N-1:])+" "+most_probable_words[i][0]+"\t P={}".format(most_probable_words[i][1]))
        else:
            print("Unseen context!")

    # generate the next n words with greedy search
    def generate_next_n(self, context, n):
        context = (self.N-1) * '<s> ' + context
        context = context.split()
        ngram_context_list = context[-self.N+1:]
        ngram_context = " ".join(ngram_context_list)

        for i in range(n):
            try:
                candidates = self.prob[ngram_context]
                most_likely_next = max(candidates.items(), key=operator.itemgetter(1))[0]
                context += [most_likely_next]
                ngram_context_list = ngram_context_list[1:] + [most_likely_next]
                ngram_context = " ".join(ngram_context_list)
            except:
                break
        print(" ".join(context[self.N-1:]))


In [6]:
class InterpolateNGramLM(NGramLM):

    def __init__(self, N):
        super(InterpolateNGramLM, self).__init__(N)
        self.ngram_lms = []
        self.lambdas = []

    def train(self, vocab, data, smoothing_k=0, lambdas=[]):
        assert len(lambdas) == self.N
        assert sum(lambdas) - 1 < 1e-9
        self.vocab = vocab
        self.lambdas = lambdas

        for i in range(self.N, 0, -1):
            lm = NGramLM(i)
            print("Training {}-gram language model".format(i))
            lm.train(vocab, data, smoothing_k)
            self.ngram_lms.append(lm)

    def get_ngram_logprob(self, word, seq_len, context):
        prob = 1e-9
        for i, (coef, lm) in enumerate(zip(self.lambdas, self.ngram_lms)):
            context_words = context.split()
            cutted_context = " ".join(context_words[-self.N + i + 1:])
            prob += coef * lm.get_ngram_prob(context=cutted_context, word=word)
        return math.log(prob) / seq_len

In [7]:
import numpy as np
flatten = lambda l: [item for sublist in l for item in sublist]
y = np.array(((data_df["target"] != 1) & (data_df["target"] != 0)).astype(int))

x_train, x_val, y_train, y_val = train_test_split(data_df["text"].fillna(""), y, test_size=0.2, random_state=42)
train_con = x_train[y_train==1]
train_norm = x_train[y_train!=1]
train_con_lists = []
train_norm_lists = []
for sentence in train_con:
  train_con_lists.append(sentence.split())
for sentence in train_norm:
  train_norm_lists.append(sentence.split())

con_corpus = flatten(train_con_lists)
con_vocab = set(con_corpus)
norm_corpus = flatten(train_norm_lists)
norm_vocab = set(norm_corpus)

In [8]:
ilm_con = InterpolateNGramLM(3)
ilm_con.train(con_vocab, train_con_lists, lambdas=[0.5, 0.1, 0.4])
ilm_norm = InterpolateNGramLM(3)
ilm_norm.train(norm_vocab, train_norm_lists, lambdas=[0.5,0.1,  0.4])

Training 3-gram language model
Training 2-gram language model
Training 1-gram language model
Training 3-gram language model
Training 2-gram language model
Training 1-gram language model


In [9]:
classif = np.zeros(y_train.shape)
prob = dict()
for i,line in enumerate(x_train):
  words = line.split()
  context = ""
  prob_con = 0
  prob_norm = 0

  for word in words:
    context +=  word + " "
    p_con = ilm_con.get_ngram_logprob(context=context, word=word, seq_len=1)
    prob_con += p_con
    p_norm = ilm_norm.get_ngram_logprob(context=context, word=word, seq_len=1)
    prob_norm += p_norm
    c = " ".join(context.split()[-3:])
    prob[c] = prob.get(context, 0) + p_con - p_norm
  classif[i] = 1 if prob_con * 0.97 > prob_norm else 0



In [14]:
logits = sorted(prob.items(), key=lambda x: x[1])
print(logits[:10])
logits[-10:]

[('established in 1979', -19.432297741059145), ('\'s " Wild', -17.322167987218876), ('" Wild Wild', -17.322167987218876), ('used stolen identification', -16.475312277884363), ('and player identification', -16.475312277884363), ('aids in identification', -16.475312277884363), ('and a total', -11.540781695916918), ('of the total', -11.540781695916918), ('. The total', -11.540781695916918), ('percent of total', -11.540781695916918)]


[('that employee volunteerism', 10.261865494631778),
 ('courage and dedication', 10.261865494631778),
 ('is so meager', 10.261865494631778),
 ('and a prayer', 10.772677142121784),
 ('to become prayer', 10.772677142121784),
 ('and public prayer', 10.772677142121784),
 ('feels only darkness', 10.954995204816223),
 ('dispel the darkness', 10.954995204816223),
 ('the coming darkness', 10.954995204816223),
 ('total hopeless darkness', 10.954995204816223)]

In [10]:
# print(np.sum(classif == y_test)/ len(y_test))
# print(np.sum(0 == y_test)/ len(y_test) )
print(classification_report(y_train, classif))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      7593
           1       0.74      1.00      0.85       782

    accuracy                           0.97      8375
   macro avg       0.87      0.98      0.92      8375
weighted avg       0.98      0.97      0.97      8375



In [35]:
 unigram_con = ilm_con.ngram_lms[-1].prob
 unigram_norm = ilm_norm.ngram_lms[-1].prob
 vocab = ilm_con.ngram_lms[-1].vocab
 print(vocab)
 diff = []
 for v in vocab:
  diff.append((v, abs(math.log(unigram_con.get(v, 1e-10)) - math.log(unigram_norm.get(v, 1e-5)))))

 sort_uni = sorted(diff, key=lambda x: x[1], reverse=True)
 sort_uni[:10]



{'marifat', 'Bundaberg', 'realizes', 'Campaign', 'convenience', 'MAN', 'winter', 'monstrosity', 'Said', 'equally', 'many', 'British', '16.7', 'democracy', 'Barbuda', 'followed', 'poems', 'Bar', 'RICE', 'ethnic', 'invaluable', 'legate', 'Sheepherding', 'face', 'voice', 'Thirty-eight', 'preying', 'manages', 'Dazed', 'local', 'portray', 'civil', 'was', 'Enterprise', 'district', 'international', 'street', 'Sattar', 'Practical', 'Temple', 'HK$2,800', 'Galway', 'inhumane', 'experience', 'mother', 'garden', 'Durban', 'thus', 'makes', 'Dewar', 'faring', 'plane', 'Stalin', 'Any', 'Br', 'Ahmadi', 'opening', 'Reuters', 'counseling', 'hunger', 'VICTORIA', "n't", 'Yemeni', 'DPKO', 'Medical', 'conditions', 'distinguish', 'bursting', 'picking', 'Lenten', 'tried', 'safety', 'silence', 'instance', 'worrying', 'Ltd.', 'resulting', 'hassle', ':', 'wants', 'trends', 'represented', 'Feminist', 'Danny', 'High', 'ignored', 'recalled', 'harsh', 'Wong', 'are', 'Bu', 'eradicate', 'stay', 'Southend', '280,000', 

[('<s>', 11.512925464970229),
 ('compassion', 3.946805256700893),
 ('diapers', 3.7644836999069398),
 ('hearts', 3.7644836999069398),
 ('Teresa', 3.7644836999069398),
 ('smiles', 3.541340148592729),
 ('FM', 3.541340148592729),
 ('philanthropic', 3.541340148592729),
 ('Blood', 3.541340148592729),
 ('mercy', 3.4078087559682064)]

In [None]:
 unigram_con = ilm_con.ngram_lms[-2].prob
 unigram_norm = ilm_norm.ngram_lms[-2].prob
 print(unigram_norm)

 vocab = ilm_con.ngram_lms[-2].vocab
 print(vocab)
 diff = []
 for v in vocab:
  diff.append((v, abs(math.log(unigram_con.get(v, 1e-10)) - math.log(unigram_norm.get(v, 1e-5)))))

 sort_uni = sorted(diff, key=lambda x: x[1], reverse=True)
 sort_uni[:10]



{'marifat', 'Bundaberg', 'realizes', 'Campaign', 'convenience', 'MAN', 'winter', 'monstrosity', 'Said', 'equally', 'many', 'British', '16.7', 'democracy', 'Barbuda', 'followed', 'poems', 'Bar', 'RICE', 'ethnic', 'invaluable', 'legate', 'Sheepherding', 'face', 'voice', 'Thirty-eight', 'preying', 'manages', 'Dazed', 'local', 'portray', 'civil', 'was', 'Enterprise', 'district', 'international', 'street', 'Sattar', 'Practical', 'Temple', 'HK$2,800', 'Galway', 'inhumane', 'experience', 'mother', 'garden', 'Durban', 'thus', 'makes', 'Dewar', 'faring', 'plane', 'Stalin', 'Any', 'Br', 'Ahmadi', 'opening', 'Reuters', 'counseling', 'hunger', 'VICTORIA', "n't", 'Yemeni', 'DPKO', 'Medical', 'conditions', 'distinguish', 'bursting', 'picking', 'Lenten', 'tried', 'safety', 'silence', 'instance', 'worrying', 'Ltd.', 'resulting', 'hassle', ':', 'wants', 'trends', 'represented', 'Feminist', 'Danny', 'High', 'ignored', 'recalled', 'harsh', 'Wong', 'are', 'Bu', 'eradicate', 'stay', 'Southend', '280,000', 

[('<s>', 11.512925464970229),
 ('compassion', 3.946805256700893),
 ('diapers', 3.7644836999069398),
 ('hearts', 3.7644836999069398),
 ('Teresa', 3.7644836999069398),
 ('smiles', 3.541340148592729),
 ('FM', 3.541340148592729),
 ('philanthropic', 3.541340148592729),
 ('Blood', 3.541340148592729),
 ('mercy', 3.4078087559682064)]