In [13]:
from lxml import etree
import re
from tqdm.notebook import tqdm
from itertools import combinations
import os



class Word(object):
    def __init__(self, kelma, tip, mamma, gherq):
        self.kelma = kelma
        self.tip = tip
        self.mamma = mamma
        self.gherq = gherq

    def __str__(self):
        return  self.kelma

    def __repr__(self): # remove
        return self.kelma

    def __iter__(self):
        for each in self.__dict__.values():
            yield each

    def __getitem__(self, index):
        if index < 4:
            return list(self.__dict__.values())[index]
        else:
            raise IndexError('Max index is 3')

    def __eq__(self, word):
        return  self.kelma == word.kelma and \
                self.tip == word.tip and \
                self.mamma == word.mamma and \
                self.gherq == word.gherq

    def __ne__(self, word):
        return not self == word

    def __hash__(self):
        return hash((self.kelma, self.tip, self.mamma, self.gherq))



# Define Sentence class with some functionality
class Sentence(object):
    def __init__(self, words=None):
        if words is None:
            words = []
        self.words = words

    def __str__(self):
        s = ''
        for word in self.words:
            s += word.kelma + " "
        return s

    def __repr__(self):
        return self.__str__()

    def __iter__(self):
        for word in self.words:
            yield word

    def __getitem__(self, index):
        return self.words[index]

    def append(self, word: Word):
        self.words.append(word)

    def insert(self,index, word: Word):
        self.words.insert(index, word)

    def __eq__(self, sentence):
        for w1, w2 in self, sentence:
            if w1 != w2:
                return False
        return True

    def __ne__(self, sentence):
        return not self != sentence

    def __len__(self):
        return len(self.words)




class Corpus(object):
    def __init__(self, corpus=None, directory="Corpus/"):
        if corpus is None:
            self._corpus = self._CorpusAsListOfSentences(directory)
        else:
            self._corpus = corpus
        self._ngrams = {}
        self._ngrams[1] = self.NGram(n=1)
        self._models = {}
        self._models[1] = self.Model(n=1)

    def __len__(self):
        return len(self._corpus)

    def __iter__(self):
        for word in self._corpus:
            yield word

    def __getitem__(self, index):
        return self._corpus[index]

    # Create functions to free memory once function scope is left
    @staticmethod
    def _readCorpus(root="Corpus/"):
        if not os.access(root, os.R_OK):
            print("Check root!!")

        xml_data = []

        for file in tqdm(os.listdir(root), desc='Reading Files'):
            xml_data.append(open(os.path.join(root, file), 'r', encoding='utf8').read())  # Read file
        return xml_data

    @staticmethod
    def _ParseAsXML(root="Corpus/"):
        parser = etree.XMLParser(recover=True)
        roots = []
        xml_data = Corpus._readCorpus(root)
        for xml in tqdm(xml_data, desc='Parsing XML'):
            roots.append(etree.fromstring(xml, parser=parser))
        return roots

    # Mention garbage collection effort and try outs for pd and numpy and why i settled with list of lists (pd matrix, list of lists no)
    @staticmethod
    def _CorpusAsListOfSentences(root="Corpus/"):
        roots = Corpus._ParseAsXML(root)
        sentences = []
        for root in tqdm(roots, desc='XML File'):
            for i, p in tqdm(enumerate(root), desc='Paragraph'):
                for k, s in enumerate(p):
                    unfiltered_sentence = re.split(r'\n', s.text.lstrip('\n'))
                    sentence = Sentence()
                    for unfiltered_word in unfiltered_sentence:
                        if unfiltered_word is not "":
                            filtered_word = unfiltered_word.split('\t')
                            sentence.append(Word(   filtered_word[0],
                                                    filtered_word[1],
                                                    filtered_word[2],
                                                    filtered_word[3]))
                    if sentence is not []:
                        sentence.insert(0, Word("<s>", "Bidu", "null", "null"))
                        sentences.append(sentence)
                        sentence.append(Word("</s>", "Tmiem", "null", "null"))
        return sentences

    def X_Counts(self, n, laplace=0):
        x_counts = {}
        for s in tqdm(self, desc='Counting x counts'):
            for i in range(len(s) + 1):
                if i < n - 1 or i == 0:
                    continue
                count = []
                for x in range(n,0,-1):
                    count.append(s[i - x])
                count = tuple(count)

                if count in x_counts:
                    x_counts[count] += (1 + laplace)
                else:
                    x_counts[count] = (1 + laplace)
        return x_counts

    def NGram(self, n=2, model="vanilla", replace=False):
        if n < 1:
            raise Exception("Unigrams and up are supported, otherwise no.")

        if model != "vanilla" and \
            model != "laplace" and \
            model != "unk":
            raise Exception("Only 'vanilla'/'laplace'/'unk' models are supported.")

        if not replace:
            if n in self._ngrams:
                if  self._ngrams[n]["flags"]["model"] == model:
                    return self._ngrams[n]
            if n not in self._ngrams:
                replace = True


        laplace = 0

        if model == "laplace":
            laplace = 1
        x_counts = self.X_Counts(n=n, laplace=laplace)

        if model == "unk":
            _count = self.X_Counts(1)
            tc = []
            for s in self:
                ts = []
                for w in s:
                    if _count[tuple([w])] < 3:
                        ts.append(Word("UNK", "UNK", "null", "null"))
                    else:
                        ts.append(w)
                tc.append(ts)

            temp = Corpus(corpus=tc)
            x_counts = temp.X_Counts(n=n, laplace=laplace)


        gram = {}
        for x in x_counts:
            gram[x] = {"count":x_counts[x]}

        result =    {   "gram":gram,
                        "flags":
                        {
                            "model": model
                        }
                    }

        if replace:
            self._ngrams[n] = result
            return self._ngrams[n]
        else:
            return result

    def Model(self, n=2, model="vanilla", replace=False):
        if n in self._models:
            if  self._models[n]["type"] == "model":
                return self._models[n]
        x_counts = self.NGram(n,model=model,replace=replace)['gram']
        if n not in self._models:
            replace = True


        probabilities = {}
        if n is not 1:
            previous = self.NGram(n-1,model=model,replace=replace)['gram']
            for x in x_counts:
                probabilities[x] = {"probability":x_counts[x]["count"] / previous[x[1:]]["count"]}
        else:
            N = len([w for s in self for w in s])
            for x in x_counts:
                probabilities[x] = {"probability":x_counts[x]["count"] / N}

        result = {"probabilities": probabilities, "type": model}

        if replace:
            self._models[n] = result
            return self._models[n]
        else:
            return result


    def GetProbability(self, forX, givenY:tuple, model):
        sequence = tuple(forX) + givenY
        print(sequence)
        print(list(model['probabilities'].keys())[0][0].type)
        print(tuple('a,b') in model['probabilities'])
        if sequence in model['probabilities']:
            return model['probabilities'][sequence]["probability"]
        else:
            return 0

#http://pages.di.unipi.it/pibiri/papers/NGrams18.pdf

corpus = Corpus(directory="Test Corpus/")
#
#
#
# bigram = corpus.NGram()
#
#
#
# trigram = corpus.NGram(3)




vanilla = corpus.Model(model="vanilla")

#
#
# laplace = corpus.Model(model="laplace")
#
#
#
# unk = corpus.Model(model="unk")
y = tuple(["b"])
print(corpus.GetProbability("a",y,vanilla))

Reading Files:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing XML:   0%|          | 0/1 [00:00<?, ?it/s]

XML File:   0%|          | 0/1 [00:00<?, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Counting x counts:   0%|          | 0/1 [00:00<?, ?it/s]

Counting x counts:   0%|          | 0/1 [00:00<?, ?it/s]

Counting x counts:   0%|          | 0/1 [00:00<?, ?it/s]

('a', 'b')


AttributeError: 'Word' object has no attribute 'type'