In [1]:
from lxml import etree
import re
from tqdm.notebook import tqdm
import torch

In [2]:
# Wrap each loading step in a function so that its memory can be freed once the function scope is left
def _readCorpus(corpus="Corpus/", files=None):
    """Read the raw corpus files into memory.

    Args:
        corpus: Directory prefix. It is concatenated directly with each file
            name, so it must end with a path separator.
        files: File names to read, in order. Defaults to the five
            ``malti03.parl.N.txt`` files.

    Returns:
        list[str]: The complete text of each file, in the order of ``files``.

    Raises:
        OSError: If a file cannot be opened or read.
    """
    if files is None:
        files = ["malti03.parl.1.txt",
                 "malti03.parl.2.txt",
                 "malti03.parl.3.txt",
                 "malti03.parl.4.txt",
                 "malti03.parl.5.txt"]
    xml_data = []
    for file in tqdm(files, desc='Reading Files'):
        # The context manager closes the handle even if read() raises, so no
        # file descriptor is left open.
        with open(corpus + file, 'r', encoding='utf8') as handle:
            xml_data.append(handle.read())
    return xml_data

In [3]:
def _ParseAsXML():
    """Parse every corpus file into an lxml root element.

    Returns:
        list: One root element per string returned by ``_readCorpus()``,
        in the same order.
    """
    # recover=True asks lxml to try to parse through broken markup.
    lenient_parser = etree.XMLParser(recover=True)
    documents = _readCorpus()
    return [etree.fromstring(document, parser=lenient_parser)
            for document in tqdm(documents, desc='Parsing XML')]

In [4]:
# Define a word class with some functionality
class Word:
    """One corpus token together with its three annotation fields.

    Instances behave like an immutable-by-convention 4-tuple: they can be
    iterated, indexed with 0..3, compared for equality and hashed, always in
    the order ``kelma, tip, mamma, gherq``.

    Attributes (names are Maltese; the glosses are presumed, confirm them
    against the corpus documentation):
        kelma: The token text.
        tip: The token's type / part-of-speech tag.
        mamma: The token's lemma.
        gherq: The token's root.
    """

    def __init__(self, kelma, tip, mamma, gherq):
        self.kelma = kelma
        self.tip = tip
        self.mamma = mamma
        self.gherq = gherq

    def _fields(self):
        """Return the four fields as a tuple in their canonical order."""
        # Single source of truth for iteration, indexing, equality and
        # hashing, so they cannot drift apart if attributes are added later.
        return (self.kelma, self.tip, self.mamma, self.gherq)

    def __str__(self):
        # f-strings also accept non-string field values, which plain string
        # concatenation would reject with a TypeError.
        return (f'Kelma: {self.kelma}\n'
                f'tip: {self.tip}\n'
                f'mamma: {self.mamma}\n'
                f'gherq: {self.gherq}')

    def __repr__(self):
        return self.__str__()

    def __iter__(self):
        yield from self._fields()

    def __getitem__(self, index):
        """Return field number ``index`` (0..3; negative indices count from the end).

        Raises:
            IndexError: If ``index`` is 4 or larger, or below -4.
        """
        if index < 4:
            return self._fields()[index]
        raise IndexError('Max index is 3')

    def __eq__(self, word):
        # Returning NotImplemented lets Python fall back to its default
        # comparison, so ``word == 0`` is simply False instead of raising
        # AttributeError.
        if not isinstance(word, Word):
            return NotImplemented
        return self._fields() == word._fields()

    def __ne__(self, word):
        result = self.__eq__(word)
        return result if result is NotImplemented else not result

    def __hash__(self):
        # Consistent with __eq__: equal words have equal hashes.
        return hash(self._fields())

In [5]:
# Define Sentence class with some functionality
class Sentence:
    """An ordered, mutable sequence of ``Word`` objects.

    Supports iteration, indexing, ``len()``, ``append``/``insert`` and
    element-wise equality. Defining ``__eq__`` makes instances unhashable,
    which is appropriate because they are mutable.

    Attributes:
        words: The underlying list of words.
    """

    def __init__(self, words=None):
        # The default is None rather than [] so that sentences never share
        # one list object.
        if words is None:
            words = []
        self.words = words

    def __str__(self):
        # Every kelma is followed by a single space (including the last one).
        return ''.join(word.kelma + " " for word in self.words)

    def __repr__(self):
        return self.__str__()

    def __iter__(self):
        return iter(self.words)

    def __getitem__(self, index):
        return self.words[index]

    def append(self, word: "Word"):
        """Add ``word`` at the end of the sentence."""
        self.words.append(word)

    def insert(self, index, word: "Word"):
        """Insert ``word`` before position ``index``."""
        self.words.insert(index, word)

    def __eq__(self, sentence):
        """Return True when both sentences hold equal words in the same order."""
        if not isinstance(sentence, Sentence):
            return NotImplemented
        # The length check is required because zip() stops at the shorter
        # operand and would otherwise treat a prefix as equal.
        if len(self.words) != len(sentence.words):
            return False
        return all(mine == theirs
                   for mine, theirs in zip(self.words, sentence.words))

    def __ne__(self, sentence):
        # Derived from __eq__; calling ``self != sentence`` here would
        # re-enter this method and recurse without end.
        result = self.__eq__(sentence)
        return result if result is NotImplemented else not result

    def __len__(self):
        return len(self.words)

In [6]:
# TODO: mention the garbage-collection effort and the try-outs with pandas and NumPy, and why I settled on a list of lists (pd matrix, list of lists no)
def CorpusAsListOfLists():
    """Load the whole corpus as a flat list of ``Sentence`` objects.

    Every child of an XML root is treated as a paragraph and every child of a
    paragraph as a sentence. The sentence text holds one token per line, each
    line made of four tab-separated fields that are passed to ``Word`` in
    order. Every non-empty sentence is wrapped in a leading ``<s>`` and a
    trailing ``</s>`` marker word; sentences without tokens are skipped.

    Returns:
        list[Sentence]: All sentences of all files, in document order.

    Raises:
        IndexError: If a non-empty token line has fewer than four fields.
    """
    roots = _ParseAsXML()
    sentences = []
    for root in tqdm(roots, desc='XML File'):
        for paragraph in tqdm(root, desc='Paragraph'):
            for node in paragraph:
                # An element without text has ``.text is None``.
                raw_lines = (node.text or '').lstrip('\n').split('\n')
                sentence = Sentence()
                for raw_line in raw_lines:
                    # Compare by value: an identity test against a string
                    # literal is not a reliable emptiness check.
                    if raw_line != "":
                        fields = raw_line.split('\t')
                        sentence.append(Word(fields[0],
                                             fields[1],
                                             fields[2],
                                             fields[3]))
                # ``sentence is not []`` was always true; test the length.
                if len(sentence) > 0:
                    sentence.insert(0, Word("<s>", "Bidu", "null", "null"))
                    # The end marker goes last, not in front of <s>.
                    sentence.append(Word("</s>", "Tmiem", "null", "null"))
                    sentences.append(sentence)
    return sentences

In [7]:
# sen = Sentence([Word("lewwel", "2", "3", "4"), Word("lewwel", "2", "3", "4")])
# print(sen)
# sen2 = Sentence([Word("Monkey", "2", "3", "4"), Word("Monkey", "2", "3", "4"),Word("lewwel", "2", "3", "4"), Word("lewwel", "2", "3", "4")])
# print(sen2)
# sen3 = Sentence([Word("Monkey", "2", "3", "4"), Word("lewwel", "2", "3", "4"), Word("Monkey", "2", "3", "4"), Word("lewwel", "2", "3", "4")])
# print(sen3)
# test = [sen, sen2, sen3]



In [8]:
# prepare [sentences] into cols x rows encoded tensor
def _CorpusToEncodedTensor(corpus, cols, n=2, limit=None, til=True):
    """Encode sentences as rows of word hashes, zero-padded up to ``cols``.

    Args:
        corpus: Iterable of sentences; each must expose a ``words`` sequence
            and be iterable over its words.
        cols: Target row width. Rows shorter than this are right-padded with
            0; longer rows are returned unchanged.
        n: Sentences with fewer than ``n`` words are dropped first.
        limit: Optional index into the filtered sentence list; see ``til``.
        til: When ``True`` and ``limit`` is given, keep sentences ``0..limit``
            inclusive; when not ``True``, keep only sentence ``limit``.

    Returns:
        list[list[int]]: One row of hashes per selected sentence.

    Raises:
        Exception: If ``til`` is not ``True`` and ``limit`` is ``None``.
    """
    candidates = [sent for sent in corpus if len(sent.words) >= n]

    if limit is None:
        if til is not True:
            raise Exception("Cannot access index: None")
    elif til is True:
        candidates = candidates[:(limit + 1)]
    else:
        candidates = [candidates[limit]]

    rows = []
    for sent in candidates:
        row = [word.__hash__() for word in sent]
        # A negative pad length multiplies to an empty list, so rows that are
        # already wider than cols are left as they are.
        row.extend([0] * (cols - len(row)))
        rows.append(row)

    return rows

In [11]:
def NGram(corpus, n=2, limit=None, til=True):
    local_corpus = [s for s in corpus if len(s.words) >= n]

    if limit is None and til is not True:
        raise Exception("Cannot access index: None")
    elif limit is not None and til is True:
        local_corpus = local_corpus[:(limit+1)]
    elif limit is not None and til is not True:
        local_corpus = [local_corpus[limit]]

    frequencies = {}
    #encoded = torch.tensor(_CorpusToEncodedTensor(local_corpus, cols))
    words = [w for s in local_corpus for w in s]
    uniquewords = set(words)#{w.__hash__():w for w in words}
    _ngrams = []
    n = 2

    for i in uniquewords:
        for j in uniquewords:
            _ngrams.append(tuple(i, j))#torch.tensor([i, j]))

    for _ngram in _ngrams:
        x_count = 0
        for r in local_corpus:#encoded:
            for i in range(len(r) - 1):
                if r[i] == _ngram[0] and r[i+1] == _ngram[1]:
                    x_count+=1

        starting = []
        for i in range(cols):
            possibility = [0] * cols
            possibility[i] = _ngram[0]
            starting.append(possibility)

        intermediate = []
        remaining = n - 1
        runs = []
        for j in range(remaining):
            for i in range(len(starting)):
                cpy = starting[i].copy()
                for k in range(cols):
                    if cpy[k] != 0:
                        continue
                    cpy[k] = _ngram[j+1]
                    intermediate.append(cpy)
                    cpy = starting[i].copy()

            runs.append(len(intermediate))

            starting = intermediate.copy()

        if n == 1:
            final = starting
        elif n == 2:
            final = intermediate
        else:
            final = intermediate[runs[-2]:]

        y_count = 0
        for r in final:
            for i in range(len(r) - 1):
                if r[i] == _ngram[0] and r[i+1] == _ngram[1]:
                    y_count+=1

        frequency = x_count / y_count

        nice_label = []
        for i in range(n):
            nice_label.append(uniquewords[_ngram[i].tolist()].kelma)

        frequencies[tuple(nice_label)] = frequency
    return frequencies

In [12]:
%%timeit
corpus = CorpusAsListOfLists()
cols = 0
for sentence in corpus:
    if len(sentence) > cols:
        cols = len(sentence)
ngram = NGram(corpus, cols)
for n in ngram:
    print(str(n) + " :  " + str(ngram[n]))

Reading Files:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing XML:   0%|          | 0/5 [00:00<?, ?it/s]

XML File:   0%|          | 0/5 [00:00<?, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

TypeError: tuple expected at most 1 arguments, got 2

In [None]:
# Toy corpus for experimenting with n-gram counting: every inner list stands
# for one sentence of string tokens.
n = 1
lc = [
    ['1'],
    ['2', '3', '4', '5', '6'],
    ['7', '8', '9'],
    ['10', '11', '12', '13'],
]
# Keep only the sentences that have at least n tokens.
local_corpus = [sentence for sentence in lc if len(sentence) >= n]

In [None]:
# Flatten the toy corpus into one token list and collect the distinct tokens.
words = [w for s in local_corpus for w in s]
uniquewords = set(words)

# Candidate n-grams are not generated yet, so the list starts out empty
# (earlier idea: list(combinations(uniquewords, n))).
_ngrams = []
# Preview the first entry; indexing the empty list directly would raise
# IndexError, so None is printed while there is nothing to show.
print(_ngrams[0] if _ngrams else None)

In [None]:
# Count every n-gram (tuple of n consecutive tokens) of the toy corpus.
x_counts = {}
for s in local_corpus:
    # i is the index of the first token of the window. The last window ends
    # on the final token, and no index is ever negative, so a window cannot
    # wrap around to the end of the sentence. A sentence shorter than n
    # produces an empty range and therefore no n-grams.
    for i in range(len(s) - n + 1):
        count = tuple(s[i:i + n])
        x_counts[count] = x_counts.get(count, 0) + 1

In [None]:
# Show the counts gathered so far.
print(x_counts)
# Register them under order 1. "x" and "y" reference the same dict object, so
# a change made through one key is visible through the other.
_ngrams = {1: {"x": x_counts, "y": x_counts}}

In [None]:
# Show the "y" table stored for order 1.
print(_ngrams[1]["y"])


In [None]:
# Count every n-gram of the toy corpus for n = 2 (bigrams).
n = 2
x_counts = {}
for s in local_corpus:
    # i is the index of the first token of the window. The last window ends
    # on the final token, and no index is ever negative, so a window cannot
    # pair the last token of a sentence with its first one. A sentence
    # shorter than n produces an empty range and therefore no n-grams.
    for i in range(len(s) - n + 1):
        count = tuple(s[i:i + n])
        x_counts[count] = x_counts.get(count, 0) + 1
print(x_counts)

In [None]:
# Rebuild the registry with an extra "flags" entry next to the count tables.
# NOTE(review): when the cells run top to bottom, x_counts was last rebound by
# the n = 2 counting cell, so key 1 would hold those counts here. Confirm this
# is intended.
_ngrams = {1: {"x": x_counts, "y": x_counts,
                "flags":
                    {
                    "limit": None, "til": True, "frequency": False
                    }
                }
           }
print(_ngrams[1]["flags"])

In [None]:
# Register the tables for the current order n: "x" is the current x_counts and
# "y" reuses the "y" table of order n - 1, which must already be present in
# _ngrams (otherwise the lookup raises KeyError).
_ngrams[n] = {"x": x_counts, "y": _ngrams[n-1]["y"]}
print(_ngrams[n])

In [None]:
# <s> I am Sam </s>
# <s> Sam I am </s>
# <s> I do not like green eggs and ham </s>
#
# sen = Sentence([Word("<s>", "", "", ""), Word("I", "", "", ""), Word("am", "", "", ""), Word("Sam", "", "", ""), Word("</s>", "", "", "")])
# sen2 = Sentence([Word("<s>", "", "", ""), Word("Sam", "", "", ""),Word("I", "", "", ""), Word("am", "", "", ""), Word("</s>", "", "", "")])
# sen3 = Sentence([Word("<s>", "", "", ""), Word("I", "", "", ""), Word("do", "", "", ""), Word("not", "", "", ""), Word("like", "", "", ""), Word("green", "", "", ""), Word("eggs", "", "", ""), Word("and", "", "", ""), Word("ham", "", "", ""), Word("</s>", "", "", "")])
#
# test = [sen, sen2, sen3]

In [None]:
# x_counts = {}
# for s in tqdm(test, desc='Counting x counts'):
#     for i in range(len(s)):
#         count = tuple([s[i]])
#
#         if count in x_counts:
#             x_counts[count] += 1
#         else:
#             x_counts[count] = 1
#
# _ngrams = {1: {"gram": x_counts, "y": x_counts,
#                     "flags":
#                         {
#                         "limit": None, "til": True
#                         }
#                     }
#                 }

In [None]:
# n = 2
# previous = _ngrams[1]["gram"]
# # x_counts = {}
# for s in tqdm(test, desc='Counting x counts'):
#     for i in range(len(s)):
#         if i < n:
#             continue
#         count = []
#         for x in range(n,0,-1):
#             count.append(s[i - x])
#         count = tuple(count)
#
#         if count in x_counts:
#             x_counts[count] += 1
#         else:
#             x_counts[count] = 1
#
# gram = {}
# for x in x_counts:
#     gram[x] = {"count":x_counts[x] , "probability":x_counts[x] / previous[x[1:]]}
#
# _ngrams[n]  = {"gram": gram,
#                         "flags":
#                             {
#                             "limit": None, "til": True
#                             }
#                         }