In [1]:

from lxml import etree
import re
from tqdm.notebook import tqdm
import torch
from torch import nn

In [2]:
# Create functions to free memory once function scope is left
def _readCorpus(corpus="Corpus/", files=None):
    """Read the raw text of every corpus file into memory.

    Args:
        corpus: Prefix prepended verbatim to each file name (plain string
            concatenation, so it must already end with a path separator).
        files: File names to read. When None, the five
            ``malti03.parl.N.txt`` files are used.

    Returns:
        list[str]: The complete contents of each file, in the order given.

    Raises:
        OSError: If a file cannot be opened or read.
    """
    if files is None:
        files = ["malti03.parl.1.txt",
                 "malti03.parl.2.txt",
                 "malti03.parl.3.txt",
                 "malti03.parl.4.txt",
                 "malti03.parl.5.txt"]
    xml_data = []
    for file in tqdm(files, desc='Reading Files'):
        # A context manager closes the handle as soon as the file is read,
        # instead of leaving it open until the garbage collector gets to it.
        with open(corpus + file, 'r', encoding='utf8') as handle:
            xml_data.append(handle.read())
    return xml_data

In [3]:
def _ParseAsXML():
    """Parse each corpus file returned by ``_readCorpus`` with lxml.

    One ``XMLParser`` created with ``recover=True`` is shared by all files.

    Returns:
        list: One ``etree.fromstring`` result per corpus file, in the order
        the files were read.
    """
    lenient_parser = etree.XMLParser(recover=True)
    documents = _readCorpus()
    return [
        etree.fromstring(document, parser=lenient_parser)
        for document in tqdm(documents, desc='Parsing XML')
    ]

In [4]:
# Define a word class with some functionality
class Word:
    """A single corpus token with its four annotation fields.

    Attributes are stored exactly as passed in:
        kelma: first field of a corpus line (used as the display form by
            ``Sentence.__str__``).
        tip, mamma, gherq: second, third and fourth fields of a corpus line.

    Instances compare equal when all four fields are equal, and hash
    consistently with that equality.
    """

    def __init__(self, kelma, tip, mamma, gherq):
        self.kelma = kelma
        self.tip = tip
        self.mamma = mamma
        self.gherq = gherq

    def _fields(self):
        # Explicit field tuple: iteration and indexing no longer depend on
        # whatever happens to be stored in the instance __dict__.
        return (self.kelma, self.tip, self.mamma, self.gherq)

    def __str__(self):
        return (f'Kelma: {self.kelma}\n'
                f'tip: {self.tip}\n'
                f'mamma: {self.mamma}\n'
                f'gherq: {self.gherq}')

    def __repr__(self):
        return self.__str__()

    def __iter__(self):
        """Yield kelma, tip, mamma, gherq in that order."""
        yield from self._fields()

    def __getitem__(self, index):
        """Return the field at ``index`` (0=kelma, 1=tip, 2=mamma, 3=gherq).

        Raises:
            IndexError: If ``index`` is 4 or greater, or too negative.
        """
        if index < 4:
            return self._fields()[index]
        else:
            raise IndexError('Max index is 3')

    def __eq__(self, word):
        # Returning NotImplemented for foreign types lets `word == None` or
        # `word in [1, "x"]` evaluate to False instead of raising
        # AttributeError on the missing `.kelma` attribute.
        if not isinstance(word, Word):
            return NotImplemented
        return self._fields() == word._fields()

    def __ne__(self, word):
        outcome = self.__eq__(word)
        return outcome if outcome is NotImplemented else not outcome

    def __hash__(self):
        return hash((self.kelma, self.tip, self.mamma, self.gherq))

In [5]:
# Define Sentence class with some functionality
class Sentence:
    """An ordered, mutable sequence of Word objects.

    The list passed to the constructor is stored by reference, not copied.
    Because the class defines ``__eq__`` without ``__hash__``, instances are
    unhashable.
    """

    def __init__(self, words=None):
        if words is None:
            words = []
        self.words = words

    def __str__(self):
        """Return each word's ``kelma`` followed by a single space."""
        return ''.join(word.kelma + " " for word in self.words)

    def __repr__(self):
        return self.__str__()

    def __iter__(self):
        yield from self.words

    def __getitem__(self, index):
        return self.words[index]

    # The annotations are string forward references so the class body does
    # not need `Word` to be defined at class-creation time.
    def append(self, word: "Word"):
        self.words.append(word)

    def insert(self, index, word: "Word"):
        self.words.insert(index, word)

    def __eq__(self, sentence):
        """Two sentences are equal when they hold equal words in the same order.

        The previous loop `for w1, w2 in self, sentence` iterated over the
        two-element tuple (self, sentence) and tried to unpack each sentence
        into two words, so it never compared the sentences pairwise.
        """
        if not isinstance(sentence, Sentence):
            return NotImplemented
        # List equality checks the lengths and then compares element by element.
        return self.words == sentence.words

    def __ne__(self, sentence):
        # The previous `not self != sentence` called __ne__ from inside
        # __ne__ and recursed until RecursionError.
        outcome = self.__eq__(sentence)
        return outcome if outcome is NotImplemented else not outcome

    def __len__(self):
        return len(self.words)

In [6]:
# TODO: mention the garbage-collection effort and the try-outs with pandas and NumPy, and why I settled on a list of lists (pd matrix, list of lists no)
def CorpusAsListOfLists():
    """Build a list of Sentence objects from the parsed corpus.

    Walks root -> child ("paragraph") -> grandchild ("sentence") of every tree
    returned by ``_ParseAsXML``. The text of each sentence element is split
    into lines, and every non-empty line is split on tabs; its first four
    fields become a ``Word``. Each non-empty sentence is prefixed with the
    start marker ``Word("<s>", "Bidu", "null", "null")``.

    Sentence elements that yield no words are skipped. The previous guard,
    ``sentence is not []``, compared identity against a fresh list and was
    therefore always True, so such elements produced sentences containing
    only the start marker.

    Returns:
        list[Sentence]: All non-empty sentences, in document order.

    Raises:
        IndexError: If a non-empty line has fewer than four tab-separated
            fields.
    """
    roots = _ParseAsXML()
    sentences = []
    for root in tqdm(roots, desc='XML File'):
        for p in tqdm(root, desc='Paragraph'):
            for s in p:
                # An element without text has `.text` set to None.
                raw_text = s.text or ""
                sentence = Sentence()
                for line in raw_text.lstrip('\n').split('\n'):
                    # Compare by value; `is not ""` tested object identity.
                    if line != "":
                        fields = line.split('\t')
                        sentence.append(Word(fields[0],
                                             fields[1],
                                             fields[2],
                                             fields[3]))
                if len(sentence) > 0:
                    sentence.insert(0, Word("<s>", "Bidu", "null", "null"))
                    sentences.append(sentence)
    return sentences

In [7]:
# corp = CorpusAsListOfLists()
# rows = len(corp)
# print(rows)
# cols = 0
# for sentence in corp:
#     if len(sentence) > cols:
#         cols = len(sentence)
# print(cols)

In [33]:
# sen = Sentence([Word("lewwel", "2", "3", "4"), Word("lewwel", "2", "3", "4"), Word("Tielet", "2", "3", "4"), Word("Raba", "2", "3", "4")])
# sen2 = Sentence([Word("Hames", "2", "3", "4"), Word("Sitt", "2", "3", "4"), Word("Tielet", "2", "3", "4"), Word("Seba", "2", "3", "4")])
# sen3 = Sentence([Word("lewwel", "2", "3", "4"), Word("lewwel", "2", "3", "4"), Word("Sitt", "2", "3", "4"), Word("Tielet", "2", "3", "4")])
# Three toy sentences over two distinct words; every Word carries the same
# placeholder tip/mamma/gherq values ("2", "3", "4").
sen = Sentence([Word(kelma, "2", "3", "4")
                for kelma in ("lewwel", "lewwel")])
sen2 = Sentence([Word(kelma, "2", "3", "4")
                 for kelma in ("Monkey", "Monkey", "lewwel", "lewwel")])
sen3 = Sentence([Word(kelma, "2", "3", "4")
                 for kelma in ("Monkey", "lewwel", "Monkey", "lewwel")])

test = [sen, sen2, sen3]

# cols = length of the longest toy sentence (used as the padding width below).
cols = 0
for sentence in test:
    cols = max(cols, len(sentence))


In [8]:
# Prepare [sentences] as rows x cols nested lists of word hashes (one zero-padded row per sentence)
def _CorpusToEncodedTensor(corpus, cols, n=2, limit=None, til=True):
    local_corpus = [s for s in corpus if len(s.words) >= n]

    if limit is None and til is not True:
        raise Exception("Cannot access index: None")
    elif limit is None and til is True:
        limit = len(local_corpus)
    elif limit is not None and til is True:
        local_corpus = local_corpus[:(limit+1)]
    elif limit is not None and til is not True:
        local_corpus = [local_corpus[limit]]

    encoded = []
    for sentence in local_corpus:
        e = [w.__hash__() for w in sentence]
        difference = cols - len(e)
        e += [0] * difference
        encoded.append(e)

    return encoded

In [49]:
def ConvNGram(corpus, n=2, limit=None, til=True, cols=None):
    """Encode the selected sentences as a padded tensor and print it.

    Work in progress: the n-gram frequency counting is still commented out
    below, so the function currently returns None.

    Args:
        corpus: Iterable of sentence-like objects exposing ``.words``.
        n: N-gram size. Sentences with fewer than ``n`` words are dropped.
        limit: Index into the filtered sentences. With ``til`` True, keeps
            sentences ``0..limit`` inclusive. With ``til`` False, keeps only
            the sentence at ``limit``. None keeps everything.
        til: See ``limit``.
        cols: Padding width of the tensor. When None it is the length of the
            longest selected sentence, so the function no longer depends on
            a notebook-level ``cols`` variable having been set by another
            cell.

    Raises:
        ValueError: If ``limit`` is None while ``til`` is not True.
        IndexError: If ``til`` is not True and ``limit`` is out of range.
    """
    local_corpus = [s for s in corpus if len(s.words) >= n]

    if limit is None:
        if til is not True:
            raise ValueError("Cannot access index: None")
    elif til is True:
        local_corpus = local_corpus[:limit + 1]
    else:
        local_corpus = [local_corpus[limit]]

    if cols is None:
        cols = max((len(s.words) for s in local_corpus), default=0)

    # Forward `n`: the encoder filters by length again, and its default of 2
    # would silently drop one-word sentences when n == 1.
    encoded = torch.tensor(_CorpusToEncodedTensor(local_corpus, cols, n=n))
    print(encoded)
    # Draft of the frequency counting, kept for reference:
    # frequencies = {}
    # empty = torch.tensor([0] * n)
    # sentence_done = False
    # found = False
    #
    # for sentence in encoded:
    #     for i in range((len(sentence)-n)):
    #         seq = sentence[i:i+n]
    #         seq_hash = seq.__hash__()
    #         if seq.equal(empty):
    #             sentence_done = True
    #             break
    #
    #         if seq_hash not in frequencies.keys():
    #             frequencies[seq_hash] = 0
    #
    #         frequencies[seq_hash] = frequencies[seq_hash] + 1
    #
    #     if sentence_done:
    #         sentence_done = False
    #         continue
    #
    # return frequencies

# ConvNGram has no active return statement (its frequency counting is still
# commented out), so `ngram` is None here; the call only prints the encoded
# tensor. The loop below stays disabled until a dict is returned.
ngram = ConvNGram(test)
# for key in ngram:
#     print(str(key) +" "+ str(ngram.get(key)))

tensor([[-8984703981310397998, -8984703981310397998,                    0,
                            0],
        [ 7365980133500933082,  7365980133500933082,  7365980133500933082,
         -8984703981310397998],
        [ 7365980133500933082, -8984703981310397998, -8984703981310397998,
          7365980133500933082]])


In [34]:

# Encode the toy sentences: one row per sentence, each entry a Word hash,
# rows right-padded with 0 up to `cols`.
# NOTE(review): the values printed here differ from those printed by the
# ConvNGram cell for the same words -- presumably the cells were run in
# different kernel sessions (string hashes are salted per process); confirm.
encoded = torch.tensor(_CorpusToEncodedTensor(test, cols))
#encoded = torch.tensor(test)
print(encoded)

tensor([[2026861111953651343, 2026861111953651343,                   0,
                           0],
        [6536053230923925812, 6536053230923925812, 2026861111953651343,
         2026861111953651343],
        [6536053230923925812, 2026861111953651343, 6536053230923925812,
         2026861111953651343]])


In [41]:
# Probe row: the hash value that appears twice in the first row of the
# `encoded` output above, followed by zero padding.
# NOTE(review): the literal is copied from one run's output; it will
# presumably not match after a kernel restart -- confirm.
testgram = torch.tensor([2026861111953651343, 2026861111953651343, 0, 0])
testgrams = []
# n = 2
# cols-2+1 iterations: one circularly shifted copy of the probe per shift
# amount y = 0 .. cols-2.
for y in range(cols-2+1):
    testgrams.append(testgram.roll(y))

# NOTE(review): the output saved under this cell is a TypeError about Word
# having no len(), which nothing in this cell appears able to raise -- looks
# like stale output from an earlier version; re-run to refresh.
print(testgrams)

TypeError: object of type 'Word' has no len()

In [36]:
# results = []
# for gram in testgrams:
#     results.append(encoded / gram)


In [37]:
# Count how often the bigram (testgram[0], testgram[1]) occurs at adjacent
# positions anywhere in the rows of `encoded`.
count = 0
for r in encoded:
    # len(r) - 1 start positions, so r[i+1] never runs past the row end.
    for i in range(len(r) - 1):
        if r[i] == testgram[0] and r[i+1] == testgram[1]:
            count+=1

print(count)

2


In [38]:
# Enumerate rows of width `cols` in which each element of `words` occupies its
# own column and every other column is 0. Positions are not required to be
# adjacent or in order.
words =  [2026861111953651343, 2026861111953651343]
# NOTE(review): this rebinds the notebook-level `cols` that earlier cells
# computed from the toy sentences -- confirm that is intended.
cols = 4
n = len(words)
# Seed rows: one per column, holding words[0] in that column and 0 elsewhere.
starting = []
for i in range(cols):
    possibility = [0] * cols
    possibility[i] = words[0]
    starting.append(possibility)

# `intermediate` is never cleared, so it accumulates the rows of every pass;
# `runs` records its cumulative length after each pass.
intermediate = []
remaining = n - 1
runs = []
for j in range(remaining):
    for i in range(len(starting)):
        cpy = starting[i].copy()
        for k in range(cols):
            # 0 marks a free column; occupied columns are skipped.
            # NOTE(review): a word whose value is 0 would be treated as free.
            if cpy[k] != 0:
                continue
            cpy[k] = words[j+1]
            intermediate.append(cpy)
            # Fresh copy, so each appended row differs from its seed in
            # exactly one newly filled column.
            cpy = starting[i].copy()

    runs.append(len(intermediate))

    # The next pass extends every row accumulated so far.
    starting = intermediate.copy()

if n == 1:
    final = starting
elif n == 2:
    final = intermediate
else:
    # Keep only the rows appended during the last pass: runs[-2] is the
    # length of `intermediate` before that pass started.
    final = intermediate[runs[-2]:]
#print(final)

In [40]:
# Count adjacent occurrences of the bigram (testgram[0], testgram[1]) across
# the generated rows in `final`.
count = 0
for r in final:
    for i in range(len(r) - 1):
        if r[i] == testgram[0] and r[i+1] == testgram[1]:
            count+=1

# NOTE(review): the halving appears to compensate for duplicates -- with two
# identical words, each placement is generated twice (once per ordering);
# confirm before using this with distinct words.
print(count//2)


3
