In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import defaultdict
from collections import Counter
from src.classes.database.offer import MongoOffer
from tqdm import tqdm
from enum import Enum
import json

tokenizer = nltk.RegexpTokenizer(r'\w+')

In [7]:
token_list = [tokenizer.tokenize(x.selling) for x in MongoOffer.objects()]
tagged = nltk.pos_tag(token_list[0])

tag = Enum('tag', ['BRAND'])

In [25]:
def get_all_words(token_list: list[list]) -> list:
    words = []
    for sentence in token_list:
        for word in sentence:
            words.append(word)
    return sorted(set(words))

def next_word_occurrence(key: str, look_ahead: int, tokens: list[list]) -> dict:
    counter = defaultdict(int)
    for sentence in tokens:
        for x in range(0, len(sentence)):
            if key == sentence[x]: #if we find the key in a sentence
                if (x+look_ahead) < len(sentence):
                    counter[sentence[x+look_ahead]] += 1
    return counter

def p_next_word(given: str, looking: str, look_ahead: int, tokens: list[list]) -> float:
    nwo: dict = next_word_occurrence(given, look_ahead, tokens)
    return nwo[looking] / sum(nwo.values()) if nwo[looking] != 0 else 0

# Old method, slow and not used anymore
def calculate_all_probabilities(tokens: list[list], look_ahead) -> dict:
    all_words = get_all_words(tokens)

    #Generating a list of all words
    word_canvas = defaultdict()
    for word in all_words:
        word_canvas[word] = None

    p_word = word_canvas.copy()
    for word in tqdm(p_word):
        p_word[word] = [defaultdict(float) for x in range(0,look_ahead)]
        for x in range(0,look_ahead):
            for sub_word in all_words:
                p = p_next_word(word, sub_word, x+1, tokens)
                if p > 0:
                    p_word[word][x][sub_word] = p_next_word(word, sub_word, x+1, tokens)
                #print(f"{word} | {str(x)} | {sub_word}: {p_word[word][x][sub_word]}")
    return p_word
#data = calculate_all_probabilities(token_list, 1)

## HMaxtrix
Used for storing all word occurrences in a 3 dimensional matrix

### Data we can get from matrix
 1. p(start, find) = probability that 'find' is a n-order forward word for 'start' | single cell value / sum of 'start' row
 2. p(start, find) = probability that 'find' is a n-order backward word for 'start' | single cell value / sum of 'start' column
 3. p(row_word) = probability that 'row word' will be a n-order forward word | sum of row / entire table sum
 4. p(col_word) = probability that 'col word' will be a n-order backward word | sum of column / entire table sum
 5. p(word, order) = probability that 'word' will be n-order word compared to the other orders| single cell value at order / all order values of that word tallied (through the table)
 6.  p(word-row, order) = probability that 'word' will be n-order forward word | all of word-row order sum / all order values of that word-row
 7. p(col-row, order) = probability that 'word' will be n-order backward word | all of word-col order sum / all order values of that word-col

In [8]:
class HMatrix:
    def __init__(self) -> None:
        self.labels = None
        self.reverse_labels = None
        self.order = 0
        self.matrix = None

    def create_matrix(self, tokens: list[list], order: int):
        #Setup
        self.labels = get_all_words(token_list)
        self.reverse_labels = {self.labels[x]: x for x in range(0, len(self.labels))}
        self.order = order
        self.matrix = np.zeros((order, len(self.labels), len(self.labels)))

        #Iteration
        for sentence in tqdm(tokens):
            for x in range(0, len(sentence)):
                word = sentence[x]
                print(f"Word: {word} ({str(self.reverse_labels[word])})")
                for y in range(0, self.order):
                    if x+y+1 < len(sentence):
                        print(f"Lookahead {str(y+1)}: {sentence[x+y+1]} ({str(self.reverse_labels[sentence[x+y+1]])})")
                        self.matrix[y][self.reverse_labels[word]][self.reverse_labels[sentence[x+y+1]]] += 1

In [26]:
test = HMatrix()
test.create_matrix(token_list, 5)

100%|██████████| 1194/1194 [00:00<00:00, 15114.56it/s]

Word: logitech (1187)
Lookahead 1: g (919)
Lookahead 2: pro (1469)
Lookahead 3: x (1952)
Lookahead 4: superlight (1736)
Word: g (919)
Lookahead 1: pro (1469)
Lookahead 2: x (1952)
Lookahead 3: superlight (1736)
Word: pro (1469)
Lookahead 1: x (1952)
Lookahead 2: superlight (1736)
Word: x (1952)
Lookahead 1: superlight (1736)
Word: superlight (1736)
Word: nintendo (1325)
Lookahead 1: switch (1748)
Lookahead 2: lite (1179)
Lookahead 3: gray (986)
Lookahead 4: with (1941)
Lookahead 5: original (1379)
Word: switch (1748)
Lookahead 1: lite (1179)
Lookahead 2: gray (986)
Lookahead 3: with (1941)
Lookahead 4: original (1379)
Lookahead 5: box (606)
Word: lite (1179)
Lookahead 1: gray (986)
Lookahead 2: with (1941)
Lookahead 3: original (1379)
Lookahead 4: box (606)
Lookahead 5: extras (854)
Word: gray (986)
Lookahead 1: with (1941)
Lookahead 2: original (1379)
Lookahead 3: box (606)
Lookahead 4: extras (854)
Lookahead 5: anbernic (515)
Word: with (1941)
Lookahead 1: original (1379)
Lookahead 2




In [31]:
test.matrix[0][1169]

array([0., 0., 0., ..., 0., 0., 0.])

In [29]:
test.reverse_labels['lian']

1169

In [14]:
test.labels[725]

'cpus'

In [17]:
token_list[7]

['parts', 'psu', 'and', 'motherboard', 'of', 'my', 'own', 'xps', '8940']

In [21]:
test.matrix[0, :, 423][1994]

1.0