In [1]:
from lxml import etree
import re
from tqdm.notebook import tqdm
import torch
from torch import nn

In [2]:
# Create functions to free memory once function scope is left
def _readCorpus(corpus="Corpus/", files=None ):
    if files is None:
        files = ["malti03.parl.1.txt",
                 "malti03.parl.2.txt",
                 "malti03.parl.3.txt",
                 "malti03.parl.4.txt",
                 "malti03.parl.5.txt"]
    xml_data = []
    for file in tqdm(files, desc='Reading Files'):
        xml_data.append(open(corpus+file, 'r', encoding='utf8').read())  # Read file
    return xml_data

In [3]:
def _ParseAsXML():
    parser = etree.XMLParser(recover=True)
    roots = []
    xml_data = _readCorpus()
    for xml in tqdm(xml_data, desc='Parsing XML'):
        roots.append(etree.fromstring(xml, parser=parser))
    return roots

In [4]:
# Define a word class with some functionality
class Word:
    def __init__(self, kelma, tip, mamma, gherq):
        self.kelma = kelma
        self.tip = tip
        self.mamma = mamma
        self.gherq = gherq

    def __str__(self):
        return  'Kelma: ' + self.kelma + '\n' + \
                'tip: ' + self.tip + '\n' + \
                'mamma: ' + self.mamma + '\n' + \
                'gherq: ' + self.gherq

    def __repr__(self):
        return self.__str__()

    def __iter__(self):
        for each in self.__dict__.values():
            yield each

    def __getitem__(self, index):
        if index < 4:
            return list(self.__dict__.values())[index]
        else:
            raise IndexError('Max index is 3')

    def __eq__(self, word):
        return  self.kelma == word.kelma and \
                self.tip == word.tip and \
                self.mamma == word.mamma and \
                self.gherq == word.gherq

    def __ne__(self, word):
        return not self == word

    def __hash__(self):
        return hash((self.kelma, self.tip, self.mamma, self.gherq))

In [5]:
# Define Sentence class with some functionality
class Sentence:
    def __init__(self, words=None):
        if words is None:
            words = []
        self.words = words

    def __str__(self):
        s = ''
        for word in self.words:
            s += word.kelma + " "
        return s

    def __repr__(self):
        return self.__str__()

    def __iter__(self):
        for word in self.words:
            yield word

    def __getitem__(self, index):
        return self.words[index]

    def append(self, word: Word):
        self.words.append(word)

    def insert(self,index, word: Word):
        self.words.insert(index, word)

    def __eq__(self, sentence):
        for w1, w2 in self, sentence:
            if w1 != w2:
                return False
        return True

    def __ne__(self, sentence):
        return not self != sentence

    def __len__(self):
        return len(self.words)

In [6]:
# Mention garbage collection effort and try outs for pd and numpy and why i settled with list of lists (pd matrix, list of lists no)
def CorpusAsListOfLists():
    roots = _ParseAsXML()
    sentences = []
    for root in tqdm(roots, desc='XML File'):
        for i, p in tqdm(enumerate(root), desc='Paragraph'):
            for k, s in enumerate(p):
                unfiltered_sentence = re.split(r'\n', s.text.lstrip('\n'))
                sentence = Sentence()
                for unfiltered_word in unfiltered_sentence:
                    if unfiltered_word is not "":
                        filtered_word = unfiltered_word.split('\t')
                        sentence.append(Word(   filtered_word[0],
                                                filtered_word[1],
                                                filtered_word[2],
                                                filtered_word[3]))
                if sentence is not []:
                    sentence.insert(0, Word("<s>", "Bidu", "null", "null"))
                    sentence.append( Word("</s>", "Tmiem", "null", "null"))
                    sentences.append(sentence)
    return sentences

In [8]:
def _EncodeWords(corpus):
    return {w.__hash__():w for s in corpus for w in s}

def SequenceFrequency(corpus, n=2, limit=None, til=True):
    # remove sentences in corpus with length < n
    local_corpus = [s for s in corpus if len(s.words) >= n]

    if limit is None and til is not True:
        raise Exception("Cannot access index: None")
    elif limit is None and til is True:
        limit = len(local_corpus)
    elif limit is not None and til is True:
        local_corpus = local_corpus[:(limit+1)]
    elif limit is not None and til is not True:
        local_corpus = [local_corpus[limit]]

    encoded_words = _EncodeWords(local_corpus)

    # Start by getting frequency of (n - 1) sized sequence is first sequence in a given sentence
    model = {}
    for sentence_index in range(len(local_corpus)):
        if (sentence_index + n) % n != 0:
            continue



def ngram(corpus, n=2, limit=None, til=True):
    # remove sentences in corpus with length < n
    local_corpus = [s for s in corpus if len(s.words) >= n]

    if limit is None and til is not True:
        raise Exception("Cannot access index: None")
    elif limit is None and til is True:
        limit = len(local_corpus)
    elif limit is not None and til is True:
        local_corpus = local_corpus[:(limit+1)]
    elif limit is not None and til is not True:
        local_corpus = [local_corpus[limit]]

    print(local_corpus)

# sen = Sentence([Word("lewwel", "2", "3", "4"), Word("Tieni", "2", "3", "4"), Word("Tielet", "2", "3", "4"), Word("Raba", "2", "3", "4")])
# sen2 = Sentence([Word("Hames", "2", "3", "4"), Word("Sitt", "2", "3", "4"), Word("Tielet", "2", "3", "4"), Word("Seba", "2", "3", "4")])
# test = [sen, sen2]
# #print(sen[2] == sen2[2] )
# print(SequenceFrequency(test))
# # print(test)

In [9]:
print(torch.cuda.is_available())

True
tensor([[ 0.8028,  1.3280,  0.7041],
        [-0.4863, -0.4136, -0.1424]])
tensor([[ 0.8028,  1.3280,  0.7041],
        [-0.4863, -0.4136, -0.1424]], device='cuda:0')
1


In [10]:
corp = CorpusAsListOfLists()
rows = len(corp)
print(rows)
cols = 0
for sentence in corp:
    if len(sentence) > cols:
        cols = len(sentence)
print(cols)

# shape of tensore will be these two values

Reading Files:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing XML:   0%|          | 0/5 [00:00<?, ?it/s]

XML File:   0%|          | 0/5 [00:00<?, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

Paragraph: 0it [00:00, ?it/s]

In [15]:
test = [[1,2,3,4],
        [5,6,7,8],
        [9,10,11,12],]

testing = torch.tensor(test)
print(testing)
print(testing.size())

ValueError: expected sequence of length 4 at dim 1 (got 3)

In [14]:
# prepare [sentences] into cols x rows encoded tensor
def _CorpusToEncodedTensor(corpus, cols, n=2, limit=None, til=True):
    local_corpus = [s for s in corpus if len(s.words) >= n]

    if limit is None and til is not True:
        raise Exception("Cannot access index: None")
    elif limit is None and til is True:
        limit = len(local_corpus)
    elif limit is not None and til is True:
        local_corpus = local_corpus[:(limit+1)]
    elif limit is not None and til is not True:
        local_corpus = [local_corpus[limit]]

    encoded = []
    for sentence in local_corpus:
        e = [w.__hash__() for w in sentence]
        difference = cols - len(e)
        e += [0] * difference
        encoded.append(e)

    return encoded

sen = Sentence([Word("lewwel", "2", "3", "4"), Word("lewwel", "2", "3", "4"), Word("Tielet", "2", "3", "4"), Word("Raba", "2", "3", "4")])
sen2 = Sentence([Word("Hames", "2", "3", "4"), Word("Sitt", "2", "3", "4"), Word("Tielet", "2", "3", "4"), Word("Seba", "2", "3", "4")])
sen3 = Sentence([Word("lewwel", "2", "3", "4"), Word("lewwel", "2", "3", "4"), Word("Sitt", "2", "3", "4"), Word("Tielet", "2", "3", "4")])
test = [sen, sen2, sen3]
cols = 7

In [17]:
def ConvNGram(corpus, n=2, limit=None, til=True):
    local_corpus = [s for s in corpus if len(s.words) >= n]

    if limit is None and til is not True:
        raise Exception("Cannot access index: None")
    elif limit is None and til is True:
        limit = len(local_corpus)
    elif limit is not None and til is True:
        local_corpus = local_corpus[:(limit+1)]
    elif limit is not None and til is not True:
        local_corpus = [local_corpus[limit]]

    encoded = torch.tensor(_CorpusToEncodedTensor(local_corpus, cols))
    frequencies = {}
    empty = torch.tensor([0] * n)
    sentence_done = False
    for sentence in encoded:
        for i in range((len(sentence)-n)):
            seq = sentence[i:i+n]
            if seq.equal(empty):
                sentence_done = True
                continue
            if seq not in frequencies.keys():
                frequencies[seq] = 0
            frequencies[seq] = frequencies.get(seq) + 1
        if sentence_done:
            sentence_done = False
            continue
    return frequencies

ngram = ConvNGram(test)
for key in ngram:
    print(str(key) +" "+ str(ngram.get(key)))

tensor([-2503414856370251983, -2503414856370251983]) 1
tensor([-2503414856370251983,  8566423479208599055]) 1
tensor([8566423479208599055, 8145379322995423848]) 1
tensor([8145379322995423848,                   0]) 1
tensor([ 4495160836449559054, -7602474522058741849]) 1
tensor([-7602474522058741849,  8566423479208599055]) 1
tensor([8566423479208599055, 8113697844016283039]) 1
tensor([8113697844016283039,                   0]) 1
tensor([-2503414856370251983, -2503414856370251983]) 1
tensor([-2503414856370251983, -7602474522058741849]) 1
tensor([-7602474522058741849,  8566423479208599055]) 1
tensor([8566423479208599055,                   0]) 1
