<h4> Datasets and Resources </h4> 

* WikiText-2 (raw/unprocessed), Train, Dev, Test

In [None]:
import numpy as np
import pandas as pd
import re

In [132]:
class MyLanguageModel:
    def __init__(self) -> None:
        self.tokens = []
        self.word_frequency = {}
        self.bigrams = {}
        self.predictions = []

    def tokenize(self,data):
        '''
        Data preprocessing
        '''
        words = re.findall(r'\b[A-Za-z]+\b', data.lower())
        return words

    def train(self, text):
        '''
        Train the model by:
        - Tokenization
        - Getting the frequency of each word
        - Getting the frequency of each pair (Bigrams)
        '''
        # Tokenization
        words = self.tokenize(text)
        self.tokens = words

        # Word frequency
        for i in self.tokens:
            if i in self.word_frequency.keys():
                self.word_frequency[i] += 1
            else:
                self.word_frequency.update({i:1})

        # Bigram creation
        for i in range(len(words)-1):
            pair = (words[i],words[i+1])
            if pair not in self.bigrams:
                self.bigrams.update({pair:1})
            else:
                self.bigrams[pair] += 1
            
        # Sorting the dictionary by value (frequency) so that when i predict the next word it should find the searched word with the highest frequency
        self.bigrams = sorted(self.bigrams.items(), key=lambda x: x[1], reverse=True)
        self.bigrams = dict(self.bigrams)

        print('Model Trained Successfully!')

    def predict_next_word(self,start_word:str):
        '''
        The prediction of the most frequent next word
        '''
        max_freq = 0
        next_word = None
        for bigr, freq in self.bigrams.items():
            if bigr[0] == start_word and freq > max_freq:
                max_freq = freq
                next_word = bigr[1]

        return next_word
    
    def predict(self,epochs,start_word:str):
        '''
        Prediction of the whole sequence with 'epochs' length
        '''
        if ' ' in start_word:
            test_text = self.tokenize(start_word)
            start_word = test_text[0]
            
        if epochs > 0:
            nextW = self.predict_next_word(start_word)
            self.predictions.append(nextW)
            print(nextW)
            return self.predict(epochs=epochs-1,start_word=nextW)
        else:
            print(f'\n The prediction is done')

    def error(self):
        '''
        Error between the testing text and our predictions
        A better estimator is perplexity
        '''
        if self.tokens == []:
            print('Please train the model, then make some predictions to be able to get the error!')

        if self.predictions != []:
            tr_len = len(self.tokens)
            min_len = min(len(self.tokens),len(self.predictions))
            counter = 0
            for i in range(min_len):
                if self.tokens[i] == self.predictions[i]:
                    counter += 1

            error = counter / tr_len
            return error
        else:
            print('You need to predict first!')
            

In [123]:
with open('PATH','r', encoding='utf-8') as train_file:
    train_data = train_file.read()

with open('PATH','r', encoding='utf-8') as test_file:
    test_data = test_file.read()

with open('PATH','r', encoding='utf-8') as valid_file:
    valid_data = valid_file.read()

In [124]:
# Create the model and train the data

lmO = MyLanguageModel()
lmO.train(train_data)

Model Trained Successfully!


In [126]:
# Predictions

lmO.predict(50,test_data)

le
soir
jeunesse
was
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the
first
time
the

 The prediction is done


In [127]:
# Check how good our predictions are. [Hint: they are much better at around 1.0 and really bad at around 0.0

lmO.error()

5.953600022861824e-07