In [1]:
import re
from  collections import defaultdict

In [2]:
def open_and_clean_file_data(filename):
    sentances = []
    with open(filename,'r') as fs:
        for line in fs:
            line = line.strip().lower()
            for key, val in dict({'“': '"', '”': '"', '’': "'", '--': ','}).items():
                line = line.replace(key,val)
            line = ' '.join([word for word in re.findall("([a-z\-]+)", line)])
            if line:
                line+=" [END]"
                sentances.append(line)
    return sentances

In [3]:
data = open_and_clean_file_data('./data_raw.txt')

In [4]:
def get_tokenized_sentences(sentences):
    for sentence in sentences:
        yield sentence.split()

In [5]:
class UnigramCounter:
    
    def __init__(self, sentences):
        self.sentence_generator = get_tokenized_sentences(sentences)
        self.count()
        
    
    def count(self):
        self.token_count = 0
        self.counts = defaultdict(int)
        self.countOfCounts = defaultdict(int)
        
        "Actual count"
        for sentence in self.sentence_generator:
            self.token_count += len(sentence)
            for unigram in sentence: self.counts[unigram] += 1
        
        "Count of counts"
        for count in self.counts: self.countOfCounts[self.counts[count]] += 1
    
    
    
    def view_normalised_counts(self,k = 0 ,which_type='simple'):
        """"
        Normalised/ smoothing count
        
        -in case of simple unigram model    
            (actual count + k) * total_words) / (total_words + k * distict_unigrams)
    
       -in case of good turing
            ((that_unigram + 1) * number of unigrams having having count 1 extra count )/ number of unigrams having same count   
        """
        normalised_counts = {}
        if which_type.lower() == "good turing":
            for key, count in self.counts.items():
                count_num = (self.counts[key] + 1) * self.countOfCounts[count + 1]
                count_deno = self.countOfCounts[self.counts[key]]
                normalised_counts[key] = count_num/ count_deno
  
        elif which_type.lower() == 'simple':
            for count in self.counts: normalised_counts[count] = ((self.counts[count] + k) * self.token_count) / (self.token_count + k * len(self.counts))
        else:
            print("!!!!!!!!!!!!!!!!!!!!!!!ERROR!!!!!!!!!!!!!!!!!")
        return normalised_counts    


In [6]:
class UnigramModel:
    def __init__(self, train_counter):
        self.counter = train_counter
        self.counts = train_counter.counts.copy()
        self.counts['[UNK]'] = 0
        self.distinct_words = set(self.counts.keys())

    def train(self, k=0, which_type="simple") :
        self.probs = {}
        
        for unigram, unigram_count in self.counts.items():
            if which_type == "simple":
                prob_nom = unigram_count + k
                prob_denom = self.counter.token_count + k * len(self.distinct_words)
            elif which_type == 'good turing':
                prob_nom = (unigram_count + 1) * self.counter.countOfCounts[unigram_count+1]
                prob_denom = self.counter.token_count
            else:
                print("!!!!!!!!ERROR!!!!!!!!")
            
            self.probs[unigram] = prob_nom / prob_denom
        

    def evaluate(self, evaluation_counter):
        probablity = 1
        test_counts = evaluation_counter.counts
        for unigram, test_count in test_counts.items():
            if unigram not in self.distinct_words:
                unigram = '[UNK]'
            train_prob = self.probs[unigram]
            print(f"{unigram} -> {self.probs[unigram]} probability")
            probablity = probablity*train_prob
            

        return probablity

# Total words and unique words

In [7]:
counter = UnigramCounter(data)
print(f"there are {counter.token_count} total words")
print(f"there are {len(counter.counts)} unique words")

there are 993 total words
there are 336 unique words


## No smoothing count

In [8]:
counter.view_normalised_counts()

{'abstract': 1.0,
 '[END]': 35.0,
 'fitness': 1.0,
 'functions': 1.0,
 'based': 1.0,
 'on': 11.0,
 'test': 4.0,
 'cases': 2.0,
 'are': 8.0,
 'very': 1.0,
 'common': 3.0,
 'in': 14.0,
 'genetic': 1.0,
 'programming': 1.0,
 'gp': 10.0,
 'this': 16.0,
 'process': 2.0,
 'can': 10.0,
 'be': 11.0,
 'assimilated': 2.0,
 'to': 28.0,
 'a': 28.0,
 'learning': 22.0,
 'task': 1.0,
 'with': 7.0,
 'the': 84.0,
 'inference': 1.0,
 'of': 40.0,
 'models': 3.0,
 'from': 5.0,
 'limited': 2.0,
 'number': 2.0,
 'samples': 1.0,
 'paper': 4.0,
 'is': 24.0,
 'an': 5.0,
 'investigation': 1.0,
 'two': 4.0,
 'methods': 2.0,
 'improve': 1.0,
 'generalization': 5.0,
 'gp-based': 4.0,
 'selection': 1.0,
 'best-of-run': 3.0,
 'individuals': 3.0,
 'using': 5.0,
 'three': 1.0,
 'data': 17.0,
 'sets': 7.0,
 'methodology': 5.0,
 'and': 21.0,
 'application': 3.0,
 'parsimony': 2.0,
 'pressure': 2.0,
 'order': 1.0,
 'reduce': 2.0,
 'complexity': 8.0,
 'solutions': 9.0,
 'results': 6.0,
 'binary': 3.0,
 'classification': 3

## No smoothing probability

In [9]:
model_no_smoothing = UnigramModel(counter)
model_no_smoothing.train()
model_no_smoothing.probs

{'abstract': 0.0010070493454179255,
 '[END]': 0.035246727089627394,
 'fitness': 0.0010070493454179255,
 'functions': 0.0010070493454179255,
 'based': 0.0010070493454179255,
 'on': 0.011077542799597181,
 'test': 0.004028197381671702,
 'cases': 0.002014098690835851,
 'are': 0.008056394763343404,
 'very': 0.0010070493454179255,
 'common': 0.0030211480362537764,
 'in': 0.014098690835850957,
 'genetic': 0.0010070493454179255,
 'programming': 0.0010070493454179255,
 'gp': 0.010070493454179255,
 'this': 0.016112789526686808,
 'process': 0.002014098690835851,
 'can': 0.010070493454179255,
 'be': 0.011077542799597181,
 'assimilated': 0.002014098690835851,
 'to': 0.028197381671701913,
 'a': 0.028197381671701913,
 'learning': 0.022155085599194362,
 'task': 0.0010070493454179255,
 'with': 0.007049345417925478,
 'the': 0.08459214501510574,
 'inference': 0.0010070493454179255,
 'of': 0.04028197381671702,
 'models': 0.0030211480362537764,
 'from': 0.005035246727089627,
 'limited': 0.00201409869083585

## Add 1 count

In [10]:
counter.view_normalised_counts(k=1)

{'abstract': 1.4943566591422122,
 '[END]': 26.89841986455982,
 'fitness': 1.4943566591422122,
 'functions': 1.4943566591422122,
 'based': 1.4943566591422122,
 'on': 8.966139954853274,
 'test': 3.7358916478555306,
 'cases': 2.2415349887133185,
 'are': 6.724604966139955,
 'very': 1.4943566591422122,
 'common': 2.9887133182844243,
 'in': 11.207674943566591,
 'genetic': 1.4943566591422122,
 'programming': 1.4943566591422122,
 'gp': 8.218961625282168,
 'this': 12.702031602708804,
 'process': 2.2415349887133185,
 'can': 8.218961625282168,
 'be': 8.966139954853274,
 'assimilated': 2.2415349887133185,
 'to': 21.668171557562076,
 'a': 21.668171557562076,
 'learning': 17.18510158013544,
 'task': 1.4943566591422122,
 'with': 5.977426636568849,
 'the': 63.51015801354402,
 'inference': 1.4943566591422122,
 'of': 30.63431151241535,
 'models': 2.9887133182844243,
 'from': 4.483069977426637,
 'limited': 2.2415349887133185,
 'number': 2.2415349887133185,
 'samples': 1.4943566591422122,
 'paper': 3.7358

## Add 1 smoothing probability

In [11]:
model_add_one = UnigramModel(counter)
model_add_one.train(k=1)
model_add_one.probs

{'abstract': 0.0015037593984962407,
 '[END]': 0.02706766917293233,
 'fitness': 0.0015037593984962407,
 'functions': 0.0015037593984962407,
 'based': 0.0015037593984962407,
 'on': 0.009022556390977444,
 'test': 0.0037593984962406013,
 'cases': 0.002255639097744361,
 'are': 0.006766917293233083,
 'very': 0.0015037593984962407,
 'common': 0.0030075187969924814,
 'in': 0.011278195488721804,
 'genetic': 0.0015037593984962407,
 'programming': 0.0015037593984962407,
 'gp': 0.008270676691729323,
 'this': 0.012781954887218045,
 'process': 0.002255639097744361,
 'can': 0.008270676691729323,
 'be': 0.009022556390977444,
 'assimilated': 0.002255639097744361,
 'to': 0.02180451127819549,
 'a': 0.02180451127819549,
 'learning': 0.017293233082706767,
 'task': 0.0015037593984962407,
 'with': 0.006015037593984963,
 'the': 0.06390977443609022,
 'inference': 0.0015037593984962407,
 'of': 0.030827067669172932,
 'models': 0.0030075187969924814,
 'from': 0.004511278195488722,
 'limited': 0.002255639097744361

## Good Turing unigram smoothed counts

In [12]:
counter.view_normalised_counts(which_type = 'good turing')

{'abstract': 0.5658536585365853,
 '[END]': 0.0,
 'fitness': 0.5658536585365853,
 'functions': 0.5658536585365853,
 'based': 0.5658536585365853,
 'on': 0.0,
 'test': 5.625,
 'cases': 1.2413793103448276,
 'are': 9.0,
 'very': 0.5658536585365853,
 'common': 1.3333333333333333,
 'in': 0.0,
 'genetic': 0.5658536585365853,
 'programming': 0.5658536585365853,
 'gp': 5.5,
 'this': 34.0,
 'process': 1.2413793103448276,
 'can': 5.5,
 'be': 0.0,
 'assimilated': 1.2413793103448276,
 'to': 0.0,
 'a': 0.0,
 'learning': 0.0,
 'task': 0.5658536585365853,
 'with': 12.0,
 'the': 0.0,
 'inference': 0.5658536585365853,
 'of': 0.0,
 'models': 1.3333333333333333,
 'from': 3.3333333333333335,
 'limited': 1.2413793103448276,
 'number': 1.2413793103448276,
 'samples': 0.5658536585365853,
 'paper': 5.625,
 'is': 0.0,
 'an': 3.3333333333333335,
 'investigation': 0.5658536585365853,
 'two': 5.625,
 'methods': 1.2413793103448276,
 'improve': 0.5658536585365853,
 'generalization': 3.3333333333333335,
 'gp-based': 5

In [13]:
model_good_turing = UnigramModel(counter)
model_good_turing.train(which_type = 'good turing')
model_good_turing.probs

{'abstract': 0.11681772406847936,
 '[END]': 0.0,
 'fitness': 0.11681772406847936,
 'functions': 0.11681772406847936,
 'based': 0.11681772406847936,
 'on': 0.0,
 'test': 0.045317220543806644,
 'cases': 0.07250755287009064,
 'are': 0.027190332326283987,
 'very': 0.11681772406847936,
 'common': 0.032225579053373615,
 'in': 0.0,
 'genetic': 0.11681772406847936,
 'programming': 0.11681772406847936,
 'gp': 0.022155085599194362,
 'this': 0.03423967774420947,
 'process': 0.07250755287009064,
 'can': 0.022155085599194362,
 'be': 0.0,
 'assimilated': 0.07250755287009064,
 'to': 0.0,
 'a': 0.0,
 'learning': 0.0,
 'task': 0.11681772406847936,
 'with': 0.02416918429003021,
 'the': 0.0,
 'inference': 0.11681772406847936,
 'of': 0.0,
 'models': 0.032225579053373615,
 'from': 0.030211480362537766,
 'limited': 0.07250755287009064,
 'number': 0.07250755287009064,
 'samples': 0.11681772406847936,
 'paper': 0.045317220543806644,
 'is': 0.0,
 'an': 0.030211480362537766,
 'investigation': 0.1168177240684793

In [14]:
test_data = open_and_clean_file_data('./test_raw.txt') 

In [15]:
test_data

['the goal of this project is to translate the wonderful resource [END]']

In [16]:
evaluationCounter = UnigramCounter(test_data)
print(evaluationCounter.token_count)
print(evaluationCounter.counts)

12
defaultdict(<class 'int'>, {'the': 2, 'goal': 1, 'of': 1, 'this': 1, 'project': 1, 'is': 1, 'to': 1, 'translate': 1, 'wonderful': 1, 'resource': 1, '[END]': 1})


In [17]:
model_no_smoothing.evaluate(evaluationCounter)

the -> 0.08459214501510574 probability
[UNK] -> 0.0 probability
of -> 0.04028197381671702 probability
this -> 0.016112789526686808 probability
[UNK] -> 0.0 probability
is -> 0.02416918429003021 probability
to -> 0.028197381671701913 probability
[UNK] -> 0.0 probability
[UNK] -> 0.0 probability
[UNK] -> 0.0 probability
[END] -> 0.035246727089627394 probability


0.0

In [18]:
model_add_one.evaluate(evaluationCounter)

the -> 0.06390977443609022 probability
[UNK] -> 0.0007518796992481203 probability
of -> 0.030827067669172932 probability
this -> 0.012781954887218045 probability
[UNK] -> 0.0007518796992481203 probability
is -> 0.018796992481203006 probability
to -> 0.02180451127819549 probability
[UNK] -> 0.0007518796992481203 probability
[UNK] -> 0.0007518796992481203 probability
[UNK] -> 0.0007518796992481203 probability
[END] -> 0.02706766917293233 probability


6.713116136417485e-26

In [19]:
model_good_turing.evaluate(evaluationCounter)

the -> 0.0 probability
[UNK] -> 0.20644511581067473 probability
of -> 0.0 probability
this -> 0.03423967774420947 probability
[UNK] -> 0.20644511581067473 probability
is -> 0.0 probability
to -> 0.0 probability
[UNK] -> 0.20644511581067473 probability
[UNK] -> 0.20644511581067473 probability
[UNK] -> 0.20644511581067473 probability
[END] -> 0.0 probability


0.0