In [1]:
from lxml import etree
import re
from tqdm.notebook import tqdm
from itertools import combinations
import os

In [2]:
class Word(object):
    """A single annotated corpus token.

    A word carries four fields, always in this order: ``kelma``, ``tip``,
    ``mamma`` and ``gherq``.
    NOTE(review): the names look like Maltese for word / type (POS tag) /
    lemma / root -- confirm against the corpus documentation.

    Instances are hashable and compare by value over all four fields, so they
    can be used as set members and as parts of n-gram tuple keys.
    """

    def __init__(self, kelma, tip, mamma, gherq):
        self.kelma = kelma
        self.tip = tip
        self.mamma = mamma
        self.gherq = gherq

    def _as_tuple(self):
        """Return the four fields in their canonical order."""
        return (self.kelma, self.tip, self.mamma, self.gherq)

    def __str__(self):
        """Multi-line, human readable description of all four fields."""
        # str.format (rather than '+') also copes with non-string field values.
        return 'Kelma: {}\n, tip: {}\n, mamma: {}\n, gherq: {}'.format(*self._as_tuple())

    def __repr__(self):
        """Short representation: just the surface form."""
        return self.kelma

    def __iter__(self):
        """Yield kelma, tip, mamma, gherq (in that order)."""
        # Iterate over the declared fields explicitly instead of __dict__, so
        # that attributes attached later can never leak into the iteration.
        for field in self._as_tuple():
            yield field

    def __getitem__(self, index):
        """Return the field at ``index`` (0=kelma, 1=tip, 2=mamma, 3=gherq).

        Raises:
            IndexError: if ``index`` is 4 or larger (or below -4).
        """
        if index < 4:
            return self._as_tuple()[index]
        raise IndexError('Max index is 3')

    def __eq__(self, word):
        """Field-wise equality; anything that is not a ``Word`` is unequal."""
        if not isinstance(word, Word):
            # Let Python try the reflected comparison and fall back to False,
            # instead of crashing with AttributeError on e.g. ``word == None``.
            return NotImplemented
        return self._as_tuple() == word._as_tuple()

    def __ne__(self, word):
        """Inverse of ``__eq__`` (propagating ``NotImplemented``)."""
        outcome = self.__eq__(word)
        if outcome is NotImplemented:
            return outcome
        return not outcome

    def __hash__(self):
        """Hash consistent with ``__eq__``: derived from the four fields."""
        return hash(self._as_tuple())

In [3]:
class Sentence(object):
    """An ordered, mutable sequence of ``Word`` objects.

    Supports iteration, ``len()``, indexing/slicing (delegated to the
    underlying list) and value equality. Because ``__eq__`` is defined and
    ``__hash__`` is not, instances are unhashable.
    """

    def __init__(self, words=None):
        """Create a sentence.

        Args:
            words: optional list of words. The list is stored as-is (not
                copied); when omitted a fresh empty list is used.
        """
        if words is None:
            words = []
        self.words = words

    def __str__(self):
        """Surface forms (``kelma``) of all words, each followed by a space."""
        return ''.join(word.kelma + " " for word in self.words)

    def __repr__(self):
        return self.__str__()

    def __iter__(self):
        """Iterate over the words in order."""
        return iter(self.words)

    def __getitem__(self, index):
        """Return the word at ``index``; slices return a plain list of words."""
        return self.words[index]

    def append(self, word: 'Word'):
        """Add ``word`` at the end of the sentence."""
        self.words.append(word)

    def insert(self, index, word: 'Word'):
        """Insert ``word`` before position ``index``."""
        self.words.insert(index, word)

    def __eq__(self, sentence):
        """Two sentences are equal when they hold equal words in equal order."""
        if not isinstance(sentence, Sentence):
            return NotImplemented
        # List equality checks the length first and then compares pairwise,
        # which is what the previous tuple-unpacking loop was meant to do.
        return self.words == sentence.words

    def __ne__(self, sentence):
        """Inverse of ``__eq__`` (propagating ``NotImplemented``)."""
        outcome = self.__eq__(sentence)
        if outcome is NotImplemented:
            return outcome
        return not outcome

    def __len__(self):
        """Number of words in the sentence."""
        return len(self.words)


In [4]:
class Corpus(object):
    """In-memory corpus of tagged sentences with cached n-gram counts/models.

    The corpus is loaded from a directory of XML files and kept as a list of
    ``Sentence`` objects, each wrapped in ``<s>`` / ``</s>`` marker words.

    Attributes:
        words: set of the distinct ``Word`` objects found in the corpus.
        _ngrams: cache of n-gram tables, keyed by ``n``::

            {n: {"gram": {ngram_tuple: {"count": c}},
                 "flags": {"limit": ..., "til": ..., "model": ...}}}

        _models: cache of probability tables, keyed by ``n``::

            {n: {n: {ngram_tuple: {"probability": p}}, "type": model_name}}
    """

    # Model names accepted by NGram()/Model(). "unk" is reserved but has no
    # implementation yet.
    _SUPPORTED_MODELS = ("vanilla", "laplace", "unk")

    def __init__(self, directory="Corpus/"):
        """Load the corpus and pre-compute the vanilla unigram table and model.

        Args:
            directory: folder containing the corpus XML files.
        """
        self._corpus = self._CorpusAsListOfSentences(directory)
        self.words = {w for s in self._corpus for w in s}
        self._vanilla = None
        self._laplace = None
        self._unk = None

        unigram_counts = self._count_ngrams(
            tqdm(self._corpus, desc='Counting x counts'), 1)
        # Total number of tokens; the unigram MLE is count / total.
        total = sum(unigram_counts.values())

        self._ngrams = {1: {"gram": {x: {"count": c}
                                     for x, c in unigram_counts.items()},
                            "flags": {"limit": None, "til": True,
                                      "model": "vanilla"}}}
        # Same shape as the value returned by Model(), so that the cache
        # lookup in Model() works for n == 1 as well.
        self._models = {1: {1: {x: {"probability": c / total}
                                for x, c in unigram_counts.items()},
                            "type": "vanilla"}}

    def __len__(self):
        """Number of sentences in the corpus."""
        return len(self._corpus)

    def __iter__(self):
        """Iterate over the sentences of the corpus."""
        for sentence in self._corpus:
            yield sentence

    def __getitem__(self, index):
        """Return the sentence (or list of sentences for a slice) at ``index``."""
        return self._corpus[index]

    # The loading steps are separate functions so that each intermediate
    # representation can be freed once its function scope is left.
    @staticmethod
    def _readCorpus(root="Corpus/"):
        """Read every regular file in ``root`` and return the contents as strings.

        Files are read in sorted name order so that sentence indices (and
        therefore the ``limit`` argument of ``NGram``) are reproducible.

        Raises:
            OSError: if ``root`` is missing or not readable.
        """
        if not os.access(root, os.R_OK):
            raise OSError("Check root!! Cannot read corpus directory: {!r}".format(root))

        xml_data = []
        for name in tqdm(sorted(os.listdir(root)), desc='Reading Files'):
            path = os.path.join(root, name)
            if not os.path.isfile(path):
                # Skip sub-directories such as .ipynb_checkpoints.
                continue
            with open(path, 'r', encoding='utf8') as handle:
                xml_data.append(handle.read())
        return xml_data

    @staticmethod
    def _ParseAsXML(root="Corpus/"):
        """Parse every corpus file into an lxml element tree root.

        The parser runs with ``recover=True``, i.e. it tries to continue past
        malformed markup instead of raising.
        """
        parser = etree.XMLParser(recover=True)
        roots = []
        for xml in tqdm(Corpus._readCorpus(root), desc='Parsing XML'):
            parsed = etree.fromstring(xml, parser=parser)
            # NOTE(review): with recover=True the result is presumably None
            # when nothing could be salvaged -- such files are skipped.
            if parsed is not None:
                roots.append(parsed)
        return roots

    # TODO(write-up): mention the garbage-collection effort and the pandas /
    # numpy try-outs, and why a plain list of sentences was kept.
    @staticmethod
    def _CorpusAsListOfSentences(root="Corpus/"):
        """Build the list of ``Sentence`` objects from the XML files in ``root``.

        The code walks root -> child ("paragraph") -> grandchild ("sentence")
        and reads the grandchild's text as one token per line with at least
        four tab-separated fields (kelma, tip, mamma, gherq). Blank lines are
        ignored. Every non-empty sentence is wrapped in ``<s>`` / ``</s>``.

        Raises:
            IndexError: if a non-blank token line has fewer than four fields.
        """
        sentences = []
        for xml_root in tqdm(Corpus._ParseAsXML(root), desc='XML File'):
            for paragraph in tqdm(xml_root, desc='Paragraph'):
                for element in paragraph:
                    text = element.text
                    if not text:
                        # Element without text content: nothing to tokenise.
                        continue
                    sentence = Sentence()
                    for line in text.split('\n'):
                        if not line.strip():
                            continue
                        fields = line.split('\t')
                        sentence.append(Word(fields[0],
                                             fields[1],
                                             fields[2],
                                             fields[3]))
                    if len(sentence) > 0:
                        sentence.insert(0, Word("<s>", "Bidu", "null", "null"))
                        sentence.append(Word("</s>", "Tmiem", "null", "null"))
                        sentences.append(sentence)
        return sentences

    @staticmethod
    def _count_ngrams(sentences, n):
        """Count every contiguous window of ``n`` items in each sentence.

        Args:
            sentences: iterable of indexable, sized sequences.
            n: window size (>= 1).

        Returns:
            dict mapping each n-gram (a tuple of ``n`` items) to its raw count.
        """
        counts = {}
        for s in sentences:
            # ``end`` is exclusive, so it must run up to len(s) inclusive or
            # the final n-gram of every sentence would be dropped.
            for end in range(n, len(s) + 1):
                key = tuple(s[end - n:end])
                counts[key] = counts.get(key, 0) + 1
        return counts

    def NGram(self, n=2, limit=None, til=True, replace=False, model="vanilla"):
        """Return the n-gram count table for the requested configuration.

        Args:
            n: n-gram order, >= 1.
            limit: optional sentence index into the sentences that have at
                least ``n`` words. ``None`` means the whole corpus.
            til: with a ``limit``, ``True`` uses sentences ``0..limit``
                inclusive and anything else uses only sentence ``limit``.
            replace: store the result in the cache even if a table for ``n``
                already exists. Forced to ``True`` when nothing is cached.
            model: ``"vanilla"`` for raw counts, ``"laplace"`` for add-one
                counts (every observed n-gram count is increased by one).

        Returns:
            ``{"gram": {ngram_tuple: {"count": c}}, "flags": {...}}``. A cached
            table is returned unchanged when its flags match the request.

        Raises:
            ValueError: for ``n < 1``, an unknown ``model``, or ``til`` not
                ``True`` without a ``limit``.
            NotImplementedError: for ``model="unk"``.
            IndexError: if ``til`` is not ``True`` and ``limit`` is out of range.
        """
        if n < 1:
            raise ValueError("Unigrams and up are supported, otherwise no.")
        if model not in self._SUPPORTED_MODELS:
            raise ValueError("Only 'vanilla'/'laplace'/'unk' models are supported.")
        if model == "unk":
            # Failing loudly beats returning vanilla counts labelled "unk".
            raise NotImplementedError("The 'unk' model is not implemented yet.")
        if limit is None and til is not True:
            raise ValueError("Cannot access index: None\nDefine limit to use til")

        flags = {"limit": limit, "til": til, "model": model}
        cached = self._ngrams.get(n)
        if cached is None:
            replace = True
        elif cached["flags"] == flags:
            return cached

        # Sentences shorter than n cannot contribute any n-gram.
        local_corpus = [s for s in self._corpus if len(s) >= n]
        if limit is not None:
            if til is True:
                local_corpus = local_corpus[:limit + 1]
            else:
                local_corpus = [local_corpus[limit]]

        raw_counts = self._count_ngrams(
            tqdm(local_corpus, desc='Counting x counts'), n)

        # Add-one smoothing shifts each observed count by exactly one.
        smoothing = 1 if model == "laplace" else 0
        result = {"gram": {x: {"count": c + smoothing}
                           for x, c in raw_counts.items()},
                  "flags": flags}

        if replace:
            self._ngrams[n] = result
        return result

    def Model(self, n, model, replace):
        """Return conditional probabilities for all observed n-grams.

        vanilla: ``P(w_n | w_1..w_{n-1}) = C(w_1..w_n) / C(w_1..w_{n-1})``;
        for ``n == 1`` the denominator is the total token count ``N``.
        laplace: ``(C(w_1..w_n) + 1) / (C(w_1..w_{n-1}) + V)`` where ``V`` is
        ``len(self.words)``.

        Args:
            n: n-gram order, >= 1.
            model: ``"vanilla"`` or ``"laplace"``.
            replace: overwrite an existing cached model for ``n``. Forced to
                ``True`` when nothing is cached for ``n``.

        Returns:
            ``{n: {ngram_tuple: {"probability": p}}, "type": model}``. A cached
            model of the same type is returned unchanged.

        Raises:
            ValueError / NotImplementedError: as raised by ``NGram``.
        """
        cached = self._models.get(n)
        if cached is None:
            replace = True
        elif cached["type"] == model:
            return cached

        # Numerators already include the +1 for the laplace model.
        numerators = self.NGram(n, model=model)['gram']
        vocabulary = len(self.words) if model == "laplace" else 0

        if n == 1:
            # There is no history for unigrams: normalise by the token count.
            unigrams = self.NGram(1, model="vanilla")['gram']
            total = sum(entry["count"] for entry in unigrams.values())
            denominators = {x: total for x in numerators}
        else:
            # The history of an n-gram is its first n-1 items (the prefix).
            history = self.NGram(n - 1, model="vanilla")['gram']
            denominators = {x: history[x[:-1]]["count"] for x in numerators}

        probabilities = {
            x: {"probability": entry["count"] / (denominators[x] + vocabulary)}
            for x, entry in numerators.items()
        }

        result = {n: probabilities,
                  "type": model}

        if replace:
            self._models[n] = result
        return result


#http://pages.di.unipi.it/pibiri/papers/NGrams18.pdf

In [5]:
# Load the corpus from the "Corpus/" folder (the constructor's default).
corpus = Corpus(directory="Corpus/")

Reading Files:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing XML:   0%|          | 0/5 [00:00<?, ?it/s]

XML File:   0%|          | 0/5 [00:00<?, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Counting x counts:   0%|          | 0/3408 [00:00<?, ?it/s]

In [6]:
# Bigram table with NGram's default limit/til/replace/model arguments.
bigram = corpus.NGram(n=2)

Counting x counts:   0%|          | 0/3408 [00:00<?, ?it/s]

In [7]:
# Trigram table, same default arguments as the bigram call.
trigram = corpus.NGram(n=3)

Counting x counts:   0%|          | 0/3408 [00:00<?, ?it/s]

In [9]:
# Scratch fragment kept from the model code:
#   "probability": x_counts[x] / previous[x[1:]]["count"]
# Distinct Word objects across every sentence of the corpus.
words = {token for sentence in corpus for token in sentence}
# TODO: def vanilla(self, n)
