In [3]:
import pandas as pd
import nltk
import random
from collections import Counter, defaultdict
import numpy as np

Import some necessary packages

# TASK 1

In [4]:
# Load SUBTLEX-US. Columns are separated by runs of whitespace; `sep=r"\s+"` is the
# supported spelling of the deprecated `delim_whitespace=True`.
# NOTE(review): this is an absolute, machine-specific path; point it at your local copy.
data = pd.read_csv("C:/Users/georg/_CompLing/subtlexus2/SUBTLEXus.txt", header=0, sep=r"\s+")
types = list(data['Word'])                       # list of types = words from dataframe
freqs = list(data['FREQcount'])                  # frequency of every type, aligned with `types`

# Expand the frequency table into a token list: every type is repeated as many
# times as its frequency states, so len(tokens) == sum(freqs).
tokens = []
for word, freq in zip(types, freqs):
    tokens.extend([word] * int(freq))




In [5]:
# Preview the first ten rows of the loaded SUBTLEX table.
data.head(10)

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF,SUBTLCD,Lg10CD
0,the,1501908,8388,1339811,8388,29449.18,6.1766,100.0,3.9237
1,to,1156570,8383,1138435,8380,22677.84,6.0632,99.94,3.9235
2,a,1041179,8382,976941,8380,20415.27,6.0175,99.93,3.9234
3,you,2134713,8381,1595028,8376,41857.12,6.3293,99.92,3.9233
4,and,682780,8379,515365,8374,13387.84,5.8343,99.89,3.9232
5,it,963712,8377,685089,8370,18896.31,5.9839,99.87,3.9231
6,s,1057301,8377,1052788,8373,20731.39,6.0242,99.87,3.9231
7,of,590439,8375,573021,8372,11577.24,5.7712,99.85,3.923
8,for,351650,8374,332686,8370,6895.1,5.5461,99.83,3.923
9,I,2038529,8372,5147,350,39971.16,6.3093,99.81,3.9229


# TASK 2

In [6]:
# Sanity check: the token list has one entry per counted occurrence (first value),
# and `types` and `freqs` have the same length (second value).
print(len(tokens) == sum(freqs),'\n',len(types)==len(freqs))       

True 
 True


If both comparisons above evaluate to True, the list of tokens contains as many tokens as SUBTLEX (the sum of the frequencies of all types) and the list of types contains as many types as SUBTLEX (the number of words in the 'Word' column of SUBTLEX).

# TASK 3

In [7]:
class Corpus(object):
    """A list of words turned into padded character sequences for n-gram modelling.

    Attributes:
        ngram_size: the n-gram order the padding is computed for.
        corpus: the raw input items, stored as given.
        words: one list per input item, made of its characters wrapped in
            '#bos#' / '#eos#' symbols (see `read`).
    """

    def __init__(self, n, corpus):
        self.ngram_size = n
        self.corpus = corpus
        self.words = self.read()

    def read(self):
        """Convert every item of `self.corpus` into a padded list of characters.

        Strings are lower-cased before being split into characters. Items that
        are not strings are converted with `str()` and split as they are.
        Every sequence gets `ngram_size - 1` leading '#bos#' symbols (one for a
        unigram model) and a single trailing '#eos#' symbol.

        Returns:
            A list with one character list per corpus item, in corpus order.
            An empty corpus yields an empty list.
        """
        # number of #bos# symbols to put in front of every word
        pad = 1 if self.ngram_size == 1 else self.ngram_size - 1

        words = []
        for word in self.corpus:
            # The string / non-string decision is taken per item, inside the loop.
            if isinstance(word, str):
                letters = list(word.lower())
            else:
                letters = list(str(word))
            words.append(['#bos#'] * pad + letters + ['#eos#'])

        return words
    
        
    
    
    
    



I decided to implement Corpus and Language Model as two separate classes. The thought behind this was that this way I could use much of the code provided in Notebook 4 on language modeling.

In [8]:
class LM(object):
    """Character-level n-gram language model with add-k (Laplace) smoothing.

    The model is fitted with `update_counts` on a corpus object exposing
    `ngram_size` and `words` (a list of symbol lists padded with '#bos#' and
    '#eos#'). After fitting it can score a corpus (`perplexity`) and generate
    strings (`generate`, `generate_likeliest`).

    Attributes:
        k: smoothing constant added to every count.
        ngram_size: order of the model.
        smoother: name of the smoothing scheme.
        lambdas: uniform weights, one per order 1..n; not read by any method here.
        counts: set by `update_counts`; maps the n-gram size to its count table.
        vocab, vocab_size: set by `update_counts`; the distinct symbols of the corpus.
    """

    def __init__(self, n):
        self.k = 0.01
        self.ngram_size = n
        self.smoother = 'Laplace'
        self.lambdas = {i+1: 1/n for i in range(n)}

    def get_ngram(self, word, i, n):
        """Return the n-gram of `word` that ends at position `i`.

        For n == 1 the symbol itself is returned. Otherwise a
        `(history, target)` pair is returned, where `history` is a tuple of the
        n-1 symbols preceding position `i` and `target` is the symbol at `i`.
        """
        if n == 1:
            return word[i]
        ngram = word[i-(n-1):i+1]
        return (tuple(ngram[:-1]), ngram[-1])

    def update_counts(self, corpus, n):
        """Count the n-grams of size `n` in `corpus` and store them in `self.counts`.

        `self.counts[n]` becomes a Counter of symbols for n == 1, or a mapping
        history -> {target: count} for n > 1. `self.vocab` and
        `self.vocab_size` are rebuilt from every symbol found in the corpus.

        Raises:
            ValueError: if the corpus was padded for a different n-gram size
                than the one this model was created with.
        """
        if self.ngram_size != corpus.ngram_size:
            raise ValueError("The corpus was pre-processed considering an ngram size of {} while the "
                             "language model was created with an ngram size of {}. \n"
                             "Please choose the same ngram size for pre-processing the corpus and fitting "
                             "the model.".format(corpus.ngram_size, self.ngram_size))

        self.counts = defaultdict(dict)
        self.counts[n] = defaultdict(dict) if n > 1 else Counter()
        table = self.counts[n]

        for word in corpus.words:
            # start at n-1 so that every n-gram has a complete history
            for idx in range(n-1, len(word)):
                ngram = self.get_ngram(word, idx, n)
                if n == 1:
                    table[ngram] += 1
                else:
                    history, target = ngram
                    # try/except instead of a membership test: the exception is
                    # only raised the first time a transition is seen
                    try:
                        table[history][target] += 1
                    except KeyError:
                        table[history][target] = 1

        # first loop through the words in the corpus, then through each symbol of a word
        self.vocab = {letter for word in corpus.words for letter in word}
        self.vocab_size = len(self.vocab)

    def get_unigram_probability(self, ngram):
        """Return the add-k smoothed unigram probability of the symbol `ngram`.

        A symbol that was never counted gets `k / (total + vocab_size * k)`.
        """
        unigram_counts = self.counts[1]
        tot = sum(unigram_counts.values()) + (self.vocab_size*self.k)
        # .get() covers unseen symbols for both Counter and plain dict tables
        return (unigram_counts.get(ngram, 0) + self.k) / tot

    def get_laplace_ngram_probability(self, history, target):
        """Return the add-k smoothed probability of `target` following `history`.

        An unseen history yields the uniform value `1 / vocab_size`. The lookup
        uses `.get()` so that querying an unseen history does not insert an
        empty entry into the count table.
        """
        continuations = self.counts[self.ngram_size].get(history, {})
        ngram_tot = np.sum(list(continuations.values())) + (self.vocab_size*self.k)
        transition_count = continuations.get(target, 0) + self.k
        return transition_count/ngram_tot

    def perplexity(self, test_corpus):
        """Compute the perplexity of the fitted model over `test_corpus`.

        Every n-gram of every word in `test_corpus.words` is scored with the
        smoothed probability; the result is 2 raised to the negated mean log2
        probability.

        Raises:
            ValueError: if the test corpus contains no n-gram to score.
        """
        n = self.ngram_size
        probs = []
        for word in test_corpus.words:
            for idx in range(n-1, len(word)):
                ngram = self.get_ngram(word, idx, n)
                if n == 1:
                    probs.append(self.get_unigram_probability(ngram))
                else:
                    probs.append(self.get_laplace_ngram_probability(ngram[0], ngram[1]))

        if not probs:
            raise ValueError("Cannot compute perplexity: the test corpus contains no n-grams.")

        entropy = np.log2(probs)

        # valid probabilities have a log that is <= 0
        assert np.all(entropy <= 0)

        avg_entropy = -1 * (sum(entropy) / len(entropy))

        return pow(2.0, avg_entropy)

    def _next_symbol_distribution(self, word):
        """Return (symbols, probabilities) for the symbol following `word`.

        Probabilities are the unsmoothed relative counts: of all symbols for a
        unigram model, otherwise of the continuations of the last n-1 symbols.
        """
        n = self.ngram_size
        if n == 1:
            continuations = self.counts[n]
        else:
            # .get() so that generation never inserts entries into the count table
            continuations = self.counts[n].get(tuple(word[-(n-1):]), {})

        tot = sum(continuations.values())
        if tot == 0:
            raise ValueError("No continuation counts available; fit the model with "
                             "update_counts on a non-empty corpus first.")

        letters = list(continuations)
        probabilities = [continuations[letter]/tot for letter in letters]
        return letters, probabilities

    @staticmethod
    def _sample_symbol(letters, probabilities):
        """Draw one symbol at random according to `probabilities`."""
        return np.random.choice(letters, size=1, p=probabilities)[0]

    @staticmethod
    def _likeliest_symbol(letters, probabilities):
        """Return the symbol with the highest probability (the first one on ties)."""
        return letters[probabilities.index(max(probabilities))]

    def _generate(self, limit, pick):
        """Generate a string symbol by symbol, choosing each symbol with `pick`.

        Generation stops when '#eos#' is chosen or after `limit` symbols.
        """
        pad = 1 if self.ngram_size == 1 else self.ngram_size - 1
        word = ['#bos#'] * pad

        steps = 0
        while steps < limit:
            letters, probabilities = self._next_symbol_distribution(word)
            new = pick(letters, probabilities)
            if new == '#eos#':
                break
            word.append(new)
            steps += 1

        # Strip exactly the padding that was prepended. For a unigram model this
        # is one symbol, which slicing at n-1 (== 0) would leave in the output.
        # NOTE(review): a unigram model also counts '#bos#' as a symbol, so it
        # can still be chosen later in the string.
        return ''.join(word[pad:])

    def generate(self, limit):
        """Sample a string from the model.

        Args:
            limit: maximum number of symbols to generate.

        Returns:
            The generated symbols joined into one string, without the
            begin-of-sequence padding. Generation stops when an end-of-sequence
            symbol is sampled or when `limit` symbols have been generated.
        """
        return self._generate(limit, self._sample_symbol)

    def generate_likeliest(self, limit):
        """Generate a string by always taking the most probable next symbol.

        Args:
            limit: maximum number of symbols to generate.

        Returns:
            The generated symbols joined into one string, without the
            begin-of-sequence padding. Generation stops when the most probable
            symbol is the end-of-sequence symbol or when `limit` is reached.
        """
        return self._generate(limit, self._likeliest_symbol)

# Here I initiate all 4 models to be used 

The implementation of the 'Update Counts' method is quite memory heavy, so this block of code can take a while to run.

In [9]:
def _fit_char_lm(n, items):
    """Pad `items` for n-grams of size `n` and fit a language model of that order on them."""
    corpus = Corpus(n, items)                   #create corpus
    model = LM(n)                               #initiate model
    model.update_counts(corpus, n)              #update model counts using corpus
    return corpus, model


n = 3
#trigram models: token based, then type based
tri_token_corpus, tri_token_lm = _fit_char_lm(n, tokens)
tri_type_corpus, tri_type_lm = _fit_char_lm(n, types)

n = 4
#tetragram models: token based, then type based
tetra_token_corpus, tetra_token_lm = _fit_char_lm(n, tokens)
tetra_type_corpus, tetra_type_lm = _fit_char_lm(n, types)



# TASK 4

Create lists of all words of lengths 3, 8 and 13 in SUBTLEX

In [10]:

# Collect the SUBTLEX types of length 3, 8 and 13. Entries that are not strings
# (i.e. not read as text from the dataframe) are left out.
l3 = [word for word in types if isinstance(word, str) and len(word) == 3]
l8 = [word for word in types if isinstance(word, str) and len(word) == 8]
l13 = [word for word in types if isinstance(word, str) and len(word) == 13]
    

In [11]:
# Four independent, initially empty accumulators; each one later becomes a
# column of the results dataframe.
perplexities, words, n_gram_size, input_data = [], [], [], []
from itertools import cycle

Get the lowest perplexity words for each language model:

In [12]:
#trigram token based model - find lowest perplexity words
n, inp = 3, 'token'                                       #n-gram size and input type of this model
dictperp3, dictperp8, dictperp13 = {}, {}, {}             #word -> perplexity, one dictionary per word length

for i in l3:                                              #score every 3 letter word on its own:
    test_corpus = Corpus(n, [i])                          #a corpus holding just that word
    dictperp3[i] = perplexity3 = tri_token_lm.perplexity(test_corpus)

for j in l8:                                              #same for 8 letter words
    test_corpus = Corpus(n, [j])
    dictperp8[j] = perplexity8 = tri_token_lm.perplexity(test_corpus)

for k in l13:                                             #same for 13 letter words
    test_corpus = Corpus(n, [k])
    dictperp13[k] = perplexity13 = tri_token_lm.perplexity(test_corpus)

word_3 = min(dictperp3, key=dictperp3.get)                #word with the smallest perplexity ...
min_perp3 = dictperp3[word_3]                             #... and that perplexity

word_8 = min(dictperp8, key=dictperp8.get)
min_perp8 = dictperp8[word_8]

word_13 = min(dictperp13, key=dictperp13.get)
min_perp13 = dictperp13[word_13]

#record one result row per word length (3, 8, 13) for this model
perplexities.extend([min_perp3, min_perp8, min_perp13])
words.extend([word_3, word_8, word_13])
n_gram_size.extend([n] * 3)
input_data.extend([inp] * 3)

The process for the rest of the models is exactly the same, so I will not comment on it again.


In [13]:
#trigram type based model - find lowest perplexity words
n, inp = 3, 'type'                                        #n-gram size and input type of this model
dictperp3, dictperp8, dictperp13 = {}, {}, {}             #word -> perplexity, one dictionary per word length

for i in l3:                                              #score every word as a one-word corpus
    test_corpus = Corpus(n, [i])
    dictperp3[i] = perplexity3 = tri_type_lm.perplexity(test_corpus)

for j in l8:
    test_corpus = Corpus(n, [j])
    dictperp8[j] = perplexity8 = tri_type_lm.perplexity(test_corpus)

for k in l13:
    test_corpus = Corpus(n, [k])
    dictperp13[k] = perplexity13 = tri_type_lm.perplexity(test_corpus)

word_3 = min(dictperp3, key=dictperp3.get)                #word with the smallest perplexity ...
min_perp3 = dictperp3[word_3]                             #... and that perplexity
word_8 = min(dictperp8, key=dictperp8.get)
min_perp8 = dictperp8[word_8]
word_13 = min(dictperp13, key=dictperp13.get)
min_perp13 = dictperp13[word_13]

#record one result row per word length (3, 8, 13) for this model
perplexities.extend([min_perp3, min_perp8, min_perp13])
words.extend([word_3, word_8, word_13])
n_gram_size.extend([n] * 3)
input_data.extend([inp] * 3)

In [14]:
#tetragram token based model - find lowest perplexity words
n, inp = 4, 'token'                                       #n-gram size and input type of this model
dictperp3, dictperp8, dictperp13 = {}, {}, {}             #word -> perplexity, one dictionary per word length

for i in l3:                                              #score every word as a one-word corpus
    test_corpus = Corpus(n, [i])
    dictperp3[i] = perplexity3 = tetra_token_lm.perplexity(test_corpus)

for j in l8:
    test_corpus = Corpus(n, [j])
    dictperp8[j] = perplexity8 = tetra_token_lm.perplexity(test_corpus)

for k in l13:
    test_corpus = Corpus(n, [k])
    dictperp13[k] = perplexity13 = tetra_token_lm.perplexity(test_corpus)

word_3 = min(dictperp3, key=dictperp3.get)                #word with the smallest perplexity ...
min_perp3 = dictperp3[word_3]                             #... and that perplexity
word_8 = min(dictperp8, key=dictperp8.get)
min_perp8 = dictperp8[word_8]
word_13 = min(dictperp13, key=dictperp13.get)
min_perp13 = dictperp13[word_13]

#record one result row per word length (3, 8, 13) for this model
perplexities.extend([min_perp3, min_perp8, min_perp13])
words.extend([word_3, word_8, word_13])
n_gram_size.extend([n] * 3)
input_data.extend([inp] * 3)

In [15]:
#tetragram type based model - find lowest perplexity words
n, inp = 4, 'type'                                        #n-gram size and input type of this model
dictperp3, dictperp8, dictperp13 = {}, {}, {}             #word -> perplexity, one dictionary per word length

for i in l3:                                              #score every word as a one-word corpus
    test_corpus = Corpus(n, [i])
    dictperp3[i] = perplexity3 = tetra_type_lm.perplexity(test_corpus)

for j in l8:
    test_corpus = Corpus(n, [j])
    dictperp8[j] = perplexity8 = tetra_type_lm.perplexity(test_corpus)

for k in l13:
    test_corpus = Corpus(n, [k])
    dictperp13[k] = perplexity13 = tetra_type_lm.perplexity(test_corpus)

word_3 = min(dictperp3, key=dictperp3.get)                #word with the smallest perplexity ...
min_perp3 = dictperp3[word_3]                             #... and that perplexity
word_8 = min(dictperp8, key=dictperp8.get)
min_perp8 = dictperp8[word_8]
word_13 = min(dictperp13, key=dictperp13.get)
min_perp13 = dictperp13[word_13]

#record one result row per word length (3, 8, 13) for this model
perplexities.extend([min_perp3, min_perp8, min_perp13])
words.extend([word_3, word_8, word_13])
n_gram_size.extend([n] * 3)
input_data.extend([inp] * 3)


Insert all the lists into a dataframe and display that dataframe

In [16]:
# One row per (model, word length) result, built from the four parallel lists.
summary_columns = ["n_gram_size", "input_data", "perplexities", "words"]
summary_rows = list(zip(n_gram_size, input_data, perplexities, words))
df = pd.DataFrame(summary_rows, columns=summary_columns)
df

Unnamed: 0,n_gram_size,input_data,perplexities,words
0,3,token,2.451944,you
1,3,token,3.681957,anything
2,3,token,4.900838,motherfucking
3,3,type,3.239946,ing
4,3,type,4.053892,mentions
5,3,type,4.384162,fractionating
6,4,token,2.196918,you
7,4,token,2.524541,anything
8,4,token,3.249898,backgrounders
9,4,type,5.29052,man


Looking at the perplexities, it becomes evident that the least perplexing words are slightly less perplexing to the token-based models than to the type-based models. This is probably because token-based models have 'seen' more words and, as a result, have transition probabilities that model the English language slightly better.

Words ending in 'ing' seemed to give low perplexities in all models except for the type based tetragram model. This is quite logical, as 'ing' is a very common ending for verbs and nouns alike and verbs that end with 'ing' are very usual in dialogue, which is what SUBTLEX largely consists of. 

The word 'you' appearing as the 3 letter word with the lowest perplexity in token based models is not at all surprising, as it appears 2.1 million times in the SUBTLEX dataset(more than  4% of the tokens in SUBTLEX are thus 'you').

# Task 5

In [17]:
#model descriptions for the first two columns, in the same order as the models below
input_data = ['token', 'type', 'token', 'type']
n_gram_size = [3, 3, 4, 4]

#let every model build its likeliest word (at most 50 symbols)
likeliest_words = [
    model.generate_likeliest(50)
    for model in (tri_token_lm, tri_type_lm, tetra_token_lm, tetra_type_lm)
]

#combine the model info and the generated words into one dataframe
df = pd.DataFrame(
    list(zip(n_gram_size, input_data, likeliest_words)),
    columns=["n_gram_size", "input_data", "likeliest_word"],
)
df

Unnamed: 0,n_gram_size,input_data,likeliest_word
0,3,token,the
1,3,type,st
2,4,token,the
3,4,type,stant


Seeing the word 'the' as the likeliest string in the token based models is not very surprising, as out of the 49 million tokens in SUBTLEX, around 1.5 million are 'the'. While it may not be the most popular token ('you' appears 2.1 million times), 'the' is the first three letters in many other very frequently used words such as 'they', 'there', 'then', etc. 

The fact that the type-based models generated words beginning with the letter 's' is also expected, as the section of words starting with 's' is also the longest one in a dictionary. It is also logical that the trigram type-based model outputs just 'st': that particular combination of letters is extremely frequent at the ends of superlative forms of adjectives and, because the trigram model only looks at the previous two letters as 'history', it cannot tell that the current word consists of only 'st' and considers it likely to be the end of a word. This is 'fixed' in the tetragram-based model, as it uses 'BoS' and 'st' as history and thus continues to extend the word instead of ending it.

# Task 6

In [10]:
# Sampling settings. `generate` draws from numpy's global random state, so the
# state is seeded here to make the generated words reproducible between runs.
GENERATION_SEED = 42
N_GENERATED_WORDS = 10          # words to generate per model
MAX_GENERATED_LENGTH = 50       # maximum number of symbols per generated word
np.random.seed(GENERATION_SEED)

# one list of generated words per model
tri_token_words = []
tri_type_words = []
tetra_type_words = []
tetra_token_words = []

# The models are sampled in an interleaved, fixed order within each round, so
# for a given seed every model always receives the same random draws.
for _ in range(N_GENERATED_WORDS):
    tri_token_words.append(tri_token_lm.generate(MAX_GENERATED_LENGTH))
    tetra_token_words.append(tetra_token_lm.generate(MAX_GENERATED_LENGTH))
    tri_type_words.append(tri_type_lm.generate(MAX_GENERATED_LENGTH))
    tetra_type_words.append(tetra_type_lm.generate(MAX_GENERATED_LENGTH))




In [11]:
# Install and import the module used to compute Levenshtein distances.
# The install goes through `sys.executable -m pip`, i.e. the interpreter that runs this kernel.
import sys
!{sys.executable} -m pip install stringdist
import stringdist

# Smoke test: 'test' -> 'testing' needs three insertions, so the displayed value should be 3.
stringdist.levenshtein('test', 'testing')   #execute a test to see if the module works properly




3

In [20]:
def old20(word, lexicon, n_neighbours=20):
    """Return the OLD20 of `word` with respect to `lexicon`.

    OLD20 is the sum of the `n_neighbours` smallest Levenshtein distances
    between `word` and the entries of `lexicon`, divided by `n_neighbours`.

    Parameters
    ----------
    word : str
        The string to score.
    lexicon : iterable
        Candidate neighbours; entries that are not `str` are skipped.
    n_neighbours : int, default 20
        How many of the closest entries are averaged.

    Returns
    -------
    float
        Sum of the smallest distances divided by `n_neighbours` (the divisor is
        `n_neighbours` even when the lexicon holds fewer strings than that).
    """
    edit_distances = sorted(
        stringdist.levenshtein(word, entry)
        for entry in lexicon
        if isinstance(entry, str)
    )
    return sum(edit_distances[:n_neighbours]) / n_neighbours


# Filter the lexicon once instead of re-checking every entry for every word.
string_types = [type_ for type_ in types if isinstance(type_, str)]

# OLD20 of every generated word, stored model after model in this fixed order:
# trigram/token, trigram/type, tetragram/token, tetragram/type.
# Code that slices `distances` per model relies on this order.
distances = []
for generated_words in (tri_token_words, tri_type_words,
                        tetra_token_words, tetra_type_words):
    for word in generated_words:
        distances.append(old20(word, string_types))

In [22]:
# Mean of the per-word values in `distances` for each model; every model owns
# one consecutive block of the list (the last model takes everything from
# index 30 onwards).
# NOTE: despite the `_perp` suffix these variables are averages of `distances`,
# not perplexities.
tri_token_perp, tri_type_perp, tetra_token_perp, tetra_type_perp = (
    np.mean(distances[first:last])
    for first, last in ((None, 10), (10, 20), (20, 30), (30, None))
)

In [23]:
# Per-model averages, in the same order as the label lists below.
# Stored under its own name so that the per-word `distances` list is not
# overwritten and the cell that averages it stays re-runnable.
mean_old20 = [tri_token_perp, tri_type_perp, tetra_token_perp, tetra_type_perp]

n_gram_size = [3, 3, 4, 4]                              # n-gram order of each model
input_data = ['token', 'type', 'token', 'type']         # training data of each model

# One row per model: n-gram size, training data, average OLD20.
df = pd.DataFrame(list(zip(n_gram_size, input_data, mean_old20)),
                  columns=["n_gram_size", "input_data", "old_20"])
df

Unnamed: 0,n_gram_size,input_data,old_20
0,3,token,1.135
1,3,type,2.92
2,4,token,1.205
3,4,type,1.905



Token-based models tend to generate strings with denser orthographic neighbourhoods. They tend to generate shorter words that, more often than not, are actual words of the English language. These shorter words have lower OLD20 values because changing only one or two letters of a short word often yields another short word, whereas for longer words more changes (letter deletions or replacements) are needed to obtain a new word.

Token-based models tend to generate shorter strings because the list of tokens contains many instances of shorter, frequently used words with more general meanings, whereas the list of types contains each of these words just once, which is exactly as often as longer words with very specific definitions. This accurately depicts what we know from orthotactics: words that are used more frequently are shorter and carry a more general meaning, while longer words tend to be more specific in meaning and less frequently used in natural language.


# Task 7

In [59]:
# Ten concepts, one list per language; position i in every list is the same concept.
english_words = ['mouth', 'dream', 'sun', 'apple', 'bridge', 'mirror', 'sky', 'fish', 'rooster', 'son']
basque_words = ['aho', 'amets', 'eguzki', 'sagar', 'zubi', 'mirail', 'zeru', 'arrain', 'oilar', 'seme']
czech_words = ['pusa', 'sen', 'slunce', 'jablko', 'most', 'zrcadlo', 'nebe', 'ryba', 'kohout', 'syn']
dutch_words = ['mond', 'droom', 'zon', 'appel', 'brug', 'spiegel', 'hemel', 'vis', 'haan', 'zoon']
finnish_words = ['suu', 'uni', 'aurinko', 'omena', 'silta', 'peili', 'taivas', 'kala', 'kukko', 'poika']
italian_words = ['bocca', 'sogno', 'sole', 'mela', 'ponte', 'specchio', 'cielo', 'pesce', 'gallo', 'figlio']

# All non-English words, language after language.
non_english_words = [
    *basque_words,
    *czech_words,
    *dutch_words,
    *finnish_words,
    *italian_words,
]

# Every word: English first (indexes 0-9), then the other languages in blocks of ten.
words = english_words + non_english_words

# Language names, in the same order as the blocks of ten in `words`.
languages = ['english', 'basque', 'czech', 'dutch', 'finnish', 'italian']

In [60]:
# One perplexity per non-English word and per model, index-aligned with `non_english_words`.
tri_token_perplexities, tri_type_perplexities = [], []
tetra_token_perplexities, tetra_type_perplexities = [], []

for word in non_english_words:
    # Each word becomes a one-word corpus; trigram and tetragram models each
    # get a Corpus built with their own n.
    tri_corpus, tetra_corpus = Corpus(3, [word]), Corpus(4, [word])

    # Trigram models.
    tri_token_perplexities.append(tri_token_lm.perplexity(tri_corpus))
    tri_type_perplexities.append(tri_type_lm.perplexity(tri_corpus))

    # Tetragram models.
    tetra_token_perplexities.append(tetra_token_lm.perplexity(tetra_corpus))
    tetra_type_perplexities.append(tetra_type_lm.perplexity(tetra_corpus))
   

In [94]:
# Empty accumulators, filled by the following cells (one entry per model).
min_perplexities, max_perplexities = [], []                 # extreme perplexity values
words_min_perplexities, words_max_perplexities = [], []     # words that produced them
min_concepts, max_concepts = [], []                         # English concept of those words
language = []                                               # language of those words (min entries first, then max)

# Model labels, in the order in which the models are processed.
n_gram_size = ['3', '3', '4', '4']
input_data = ['token', 'type', 'token', 'type']


In [95]:
# append lowest perplexity from each model to one list and append the corresponding word to a different list

# For every model (trigram/token, trigram/type, tetragram/token, tetragram/type)
# record its lowest perplexity and the non-English word that produced it.
for model_perplexities in (tri_token_perplexities, tri_type_perplexities,
                           tetra_token_perplexities, tetra_type_perplexities):
    lowest = min(model_perplexities)
    min_perplexities.append(lowest)
    # The perplexity lists are index-aligned with `non_english_words`.
    words_min_perplexities.append(non_english_words[model_perplexities.index(lowest)])

In [96]:
# Translate every minimum-perplexity word back to its English concept and its language.
# `words` holds blocks of ten words per language (English first, at indexes 0-9), so
#   index // 10 -> position of the language in `languages`
#   index %  10 -> position of the concept inside the English block
# divmod also handles indexes that are exact multiples of 10 (the first word of a
# language), which a `while idx > 10` countdown would leave at 10 instead of 0.
for word in words_min_perplexities:
    lang_counter, idx = divmod(words.index(word), 10)
    min_concepts.append(words[idx])             # English word for the concept
    language.append(languages[lang_counter])    # language the word belongs to

In [101]:
# append highest perplexity from each model to one list and append the corresponding word to a different list

# For every model (trigram/token, trigram/type, tetragram/token, tetragram/type)
# record its highest perplexity and the non-English word that produced it.
for model_perplexities in (tri_token_perplexities, tri_type_perplexities,
                           tetra_token_perplexities, tetra_type_perplexities):
    highest = max(model_perplexities)
    max_perplexities.append(highest)
    # The perplexity lists are index-aligned with `non_english_words`.
    words_max_perplexities.append(non_english_words[model_perplexities.index(highest)])

In [102]:
# Translate every maximum-perplexity word back to its English concept and its language.
# `words` holds blocks of ten words per language (English first, at indexes 0-9), so
#   index // 10 -> position of the language in `languages`
#   index %  10 -> position of the concept inside the English block
# divmod also handles indexes that are exact multiples of 10 (the first word of a
# language), which a `while idx > 10` countdown would leave at 10 instead of 0.
for word in words_max_perplexities:
    lang_counter, idx = divmod(words.index(word), 10)
    max_concepts.append(words[idx])             # English word for the concept
    language.append(languages[lang_counter])    # language the word belongs to

In [103]:
# One row per model: the most perplexing non-English word, its concept and language.
# `language` stores the minimum-perplexity entries first, so the maximum-perplexity
# entries start at index 4.
max_columns = ["N-Gram Size", "Input Data", "Concepts", "Language", "Max Perplexity Word", "Max Perplexity"]
max_rows = list(zip(n_gram_size, input_data, max_concepts, language[4:],
                    words_max_perplexities, max_perplexities))
df = pd.DataFrame(max_rows, columns=max_columns)
df

Unnamed: 0,N-Gram Size,Input Data,Concepts,Language,Max Perplexity Word,Max Perplexity
0,3,token,mirror,czech,zrcadlo,786.150174
1,3,type,mirror,czech,zrcadlo,121.213535
2,4,token,sky,italian,cielo,2709.020584
3,4,type,sun,basque,eguzki,265.476661


The Czech word 'zrcadlo' is most likely the word that perplexes the trigram models the most because it contains two trigrams that appear either never or very infrequently in English: 'zrc', which is found in no English word, and 'dlo', which appears only in some compound words such as 'landlord' or 'padlock'.
As for the tetragram-based models, I am not sure why 'cielo' perplexes the token-based model as much as it does (which is a lot). Neither of the tetragrams 'ciel' and 'ielo' appears often in English, but it seems quite odd to me that 'cielo' draws a higher perplexity from the model than 'zrcadlo', for instance. The high perplexity for 'eguzki' is understandable, as the transition from 'z' to 'k' is extremely rare in English (it pretty much only appears in the word 'blitzkrieg'), and the sequence 'eguz' is also highly irregular in English.

In [99]:
# One row per model: the least perplexing non-English word, its concept and language.
# The first four entries of `language` belong to the minimum-perplexity words.
min_columns = ["N-Gram Size", "Input Data", "Concepts", "Language", "Min Perplexity Word", "Min Perplexity"]
min_rows = list(zip(n_gram_size, input_data, min_concepts, language[:4],
                    words_min_perplexities, min_perplexities))
df = pd.DataFrame(min_rows, columns=min_columns)
df

Unnamed: 0,N-Gram Size,Input Data,Concepts,Language,Min Perplexity Word,Min Perplexity
0,3,token,bridge,czech,most,7.158576
1,3,type,sun,italian,sole,7.775624
2,4,token,bridge,czech,most,5.094633
3,4,type,dream,dutch,droom,6.673025


'Most' and 'sole' are also words in English, so their perplexities are understandably quite low for models trained on an English corpus. 'Droom', although not an English word, is quite close to several English words such as 'room' or 'drool'. Furthermore, it does not contain any transitions between letters, or combinations of letters, that are infrequent in English, which results in a low perplexity.

In [33]:
# Score every word in `words` (English included) with all four models.
# NOTE: this rebinds the four *_perplexities lists so that they are index-aligned
# with `words`; code that expects them to be aligned with `non_english_words`
# must run before this cell.
tri_token_perplexities = []
tetra_token_perplexities = []
tri_type_perplexities = []
tetra_type_perplexities = []

for word in words:
    # Trigram and tetragram models each get a one-word Corpus built with their own n.
    tri_corpus = Corpus(3, [word])
    tetra_corpus = Corpus(4, [word])
    tri_token_perplexities.append(tri_token_lm.perplexity(tri_corpus))
    tri_type_perplexities.append(tri_type_lm.perplexity(tri_corpus))
    tetra_token_perplexities.append(tetra_token_lm.perplexity(tetra_corpus))
    tetra_type_perplexities.append(tetra_type_lm.perplexity(tetra_corpus))

# `words` is laid out in consecutive blocks of ten words per language
# (english, basque, czech, dutch, finnish, italian), so the mean of each block
# is the average perplexity of one language under one model.
words_per_language = 10

language_perplexities = []   # average perplexity per (model, language)
n_gramsize = []              # n-gram order of the model, one label per average
inputdata = []               # training data of the model, one label per average

for n, source, model_perplexities in (
    (3, 'token', tri_token_perplexities),
    (3, 'type', tri_type_perplexities),
    (4, 'token', tetra_token_perplexities),
    (4, 'type', tetra_type_perplexities),
):
    for start in range(0, len(model_perplexities), words_per_language):
        block = model_perplexities[start:start + words_per_language]
        language_perplexities.append(np.mean(block))
        # Labels are appended together with the value so the three lists stay aligned.
        n_gramsize.append(n)
        inputdata.append(source)

In [107]:
# One row per (model, language) pair; the six language names are repeated once per model.
average_columns = ["n_gram_size", "input_data", "language", "average_perplexity"]
average_rows = list(zip(n_gramsize, inputdata, languages * 4, language_perplexities))
df = pd.DataFrame(average_rows, columns=average_columns)

df

Unnamed: 0,n_gram_size,input_data,language,average_perplexity
0,3,token,english,13.152731
1,3,token,basque,134.651118
2,3,token,czech,140.82967
3,3,token,dutch,47.456496
4,3,token,finnish,160.789513
5,3,token,italian,28.159439
6,3,type,english,10.275433
7,3,type,basque,34.700293
8,3,type,czech,32.750262
9,3,type,dutch,17.384658


All models seem to 'agree' that Basque, Czech and Finnish are more different from English than Dutch and Italian are. This lines up well with what we know of the origins and families of these languages: English and Dutch are both West Germanic languages, and Italian and English are somewhat similar in that both have many words of Latin origin. The rest are, however, quite distant from English: Finnish belongs to the Finno-Ugric language family, Czech is Slavic, and Basque is unrelated to any other known language in the world.

The differences in perplexities are bigger for the token-based models because the frequency distribution of tokens represents English as it is realistically used more accurately than the even frequency distribution of types (1 occurrence per word).