In [2]:
import numpy as np
import nltk
from collections import defaultdict
from collections import Counter
from src.classes.database.offer import MongoOffer
from tqdm import tqdm
from enum import Enum
import re
import json

tokenizer = nltk.RegexpTokenizer(r'\w+')

In [3]:
token_list = [tokenizer.tokenize(x.selling) for x in MongoOffer.objects()]
tagged = nltk.pos_tag(token_list[0])

direction = Enum('direction', ['FORWARD', 'BACKWARD', 'SELF'])

In [4]:
def get_all_words(token_list: list[list]) -> list:
    words = []
    for sentence in token_list:
        for word in sentence:
            words.append(word)
    return sorted(set(words))

def next_word_occurrence(key: str, look_ahead: int, tokens: list[list]) -> dict:
    counter = defaultdict(int)
    for sentence in tokens:
        for x in range(0, len(sentence)):
            if key == sentence[x]: #if we find the key in a sentence
                if (x+look_ahead) < len(sentence):
                    counter[sentence[x+look_ahead]] += 1
    return counter

def p_next_word(given: str, looking: str, look_ahead: int, tokens: list[list]) -> float:
    nwo: dict = next_word_occurrence(given, look_ahead, tokens)
    return nwo[looking] / sum(nwo.values()) if nwo[looking] != 0 else 0

# Old method, slow and not used anymore
def calculate_all_probabilities(tokens: list[list], look_ahead) -> dict:
    all_words = get_all_words(tokens)

    #Generating a list of all words
    word_canvas = defaultdict()
    for word in all_words:
        word_canvas[word] = None

    p_word = word_canvas.copy()
    for word in tqdm(p_word):
        p_word[word] = [defaultdict(float) for x in range(0,look_ahead)]
        for x in range(0,look_ahead):
            for sub_word in all_words:
                p = p_next_word(word, sub_word, x+1, tokens)
                if p > 0:
                    p_word[word][x][sub_word] = p_next_word(word, sub_word, x+1, tokens)
                #print(f"{word} | {str(x)} | {sub_word}: {p_word[word][x][sub_word]}")
    return p_word
#data = calculate_all_probabilities(token_list, 1)

## HMaxtrix
Used for storing all word occurrences in a 3 dimensional matrix

### Data we can get from matrix
 1. p(start, find) = probability that 'find' is a n-order forward word for 'start' | single cell value / sum of 'start' row
 2. p(start, find) = probability that 'find' is a n-order backward word for 'start' | single cell value / sum of 'start' column
 3. p(row_word) = probability that 'row word' will be a n-order forward word | sum of row / entire table sum
 4. p(col_word) = probability that 'col word' will be a n-order backward word | sum of column / entire table sum
 5. p(word, order) = probability that 'word' will be n-order word compared to the other orders| single cell value at order / all order values of that word tallied (through the table)
 6.  p(word-row, order) = probability that 'word' will be n-order forward word | all of word-row order sum / all order values of that word-row
 7. p(col-row, order) = probability that 'word' will be n-order backward word | all of word-col order sum / all order values of that word-col

In [5]:
class HMatrix:
    def __init__(self) -> None:
        self.labels = None
        self.reverse_labels = None
        self.order = 0
        self.matrix = None

    def p_row_word(self, order: int, row_word: str, word: str) -> float:
        if row_word not in self.reverse_labels or word not in self.reverse_labels:
            return 0
        row_label = self.reverse_labels[row_word]
        word_label = self.reverse_labels[word]

        dividend = self.matrix[order, row_label, :][word_label]
        divisor = sum(self.matrix[order, row_label, :])

        return dividend/divisor if divisor > 0 else 0

    def p_col_word(self, order: int, col_word: str, word: str) -> float:
        if col_word not in self.reverse_labels or word not in self.reverse_labels:
            return 0
        col_label = self.reverse_labels[col_word]
        word_label = self.reverse_labels[word]

        dividend = self.matrix[order, :, col_label][word_label]
        divisor = sum(self.matrix[order, :, col_label])

        return dividend/divisor if divisor > 0 else 0


    def create_matrix(self, tokens: list[list], order: int):
        #Setup
        self.labels = get_all_words(token_list)
        self.reverse_labels = {self.labels[x]: x for x in range(0, len(self.labels))}
        self.order = order
        self.matrix = np.zeros((order, len(self.labels), len(self.labels)))

        #Iteration
        for sentence in tqdm(tokens):
            for x in range(0, len(sentence)):
                word = sentence[x]
                print(f"Word: {word} ({str(self.reverse_labels[word])})")
                for y in range(0, self.order):
                    if x+y+1 < len(sentence):
                        print(f"Lookahead {str(y+1)}: {sentence[x+y+1]} ({str(self.reverse_labels[sentence[x+y+1]])})")
                        self.matrix[y][self.reverse_labels[word]][self.reverse_labels[sentence[x+y+1]]] += 1

In [6]:
matrix = HMatrix()
matrix.create_matrix(token_list, 8)

 66%|██████▌   | 2314/3494 [00:00<00:00, 11539.44it/s]

Word: logitech (2136)
Lookahead 1: g (1644)
Lookahead 2: pro (2635)
Lookahead 3: x (3513)
Lookahead 4: superlight (3136)
Word: g (1644)
Lookahead 1: pro (2635)
Lookahead 2: x (3513)
Lookahead 3: superlight (3136)
Word: pro (2635)
Lookahead 1: x (3513)
Lookahead 2: superlight (3136)
Word: x (3513)
Lookahead 1: superlight (3136)
Word: superlight (3136)
Word: nintendo (2377)
Lookahead 1: switch (3153)
Lookahead 2: lite (2123)
Lookahead 3: gray (1770)
Lookahead 4: with (3497)
Lookahead 5: original (2465)
Lookahead 6: box (1101)
Lookahead 7: extras (1535)
Lookahead 8: anbernic (914)
Word: switch (3153)
Lookahead 1: lite (2123)
Lookahead 2: gray (1770)
Lookahead 3: with (3497)
Lookahead 4: original (2465)
Lookahead 5: box (1101)
Lookahead 6: extras (1535)
Lookahead 7: anbernic (914)
Lookahead 8: rg351p (2791)
Word: lite (2123)
Lookahead 1: gray (1770)
Lookahead 2: with (3497)
Lookahead 3: original (2465)
Lookahead 4: box (1101)
Lookahead 5: extras (1535)
Lookahead 6: anbernic (914)
Lookahead

100%|██████████| 3494/3494 [00:00<00:00, 10513.74it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




## Sentence Class

Input a sentence, it will then use the HMatrix to see which path results the most likely outcome for products

In [178]:
class Sentence:
    def __init__(self, matrix: HMatrix, tokenizer, raw_sentence: str, verbose=False):
        self.verbose = verbose
        self.raw_sentence = tokenizer.tokenize(raw_sentence.lower())
        self.sentence = []

        # Creating all the blank word classes
        for x, word in enumerate(self.raw_sentence):
            self.sentence.append(Word(word, x))

        # Creating all the forward connections
        for x, word in enumerate(self.raw_sentence):
            if verbose:
                print(f"--- {word} ---")
            for neighbor in self.sentence:
                mapped_pos = self.map_word_pos_to_order(neighbor.position-x)

                if mapped_pos[1] is not direction.SELF and mapped_pos[0] < matrix.order: #Looking at itself and order is within scope
                    p_value = 0

                    if mapped_pos[1] is direction.FORWARD and self.verbose:
                        p_value = matrix.p_row_word(mapped_pos[0], word, neighbor.key)
                    else:
                        p_value = matrix.p_col_word(mapped_pos[0], word, neighbor.key)

                    self.sentence[x].neighbors.append(
                            {'word': neighbor.key,
                             'ref': neighbor,
                             'p': p_value,
                             'direction': mapped_pos[1].name
                            })

                    if verbose:
                        print(f"{'  '*mapped_pos[0]}"
                              f"{word} "
                              f"-{str(mapped_pos[0])}-> "
                              f"{neighbor.key} "
                              f"({str(p_value)})")



    def map_word_pos_to_order(self, position) -> tuple:
        #Forward word positions
        if position > 0:
            return position-1, direction.FORWARD

        # Backwards word positions
        if position < 0:
            return abs(position)-1, direction.BACKWARD

        #Pos looking at self
        if position == 0:
            return -1, direction.SELF



class Word:
    def __init__(self, key: str, position: int):
        self.key = key
        self.position = position
        self.neighbors = []

    # This method will find this word's neighbor by taking the words position + an input position, returns None if out of bounds.
    def get_neighbor(self, pos):
        # Bound Checking
        if pos == 0:
            return self

        if pos+self.position-1 > len(self.neighbors)-1 or pos+self.position < 0:
            return None

        return self.neighbors[pos+self.position-1] if pos > 0 else self.neighbors[pos+self.position]

    def __str__(self):
        return f"Word: {self.key} | Position: {str(self.position)}"


In [179]:
test = Sentence(matrix, tokenizer, "logitech g pro x superlight intel i7 8700k", verbose=True)

--- logitech ---
logitech -0-> g (0.21568627450980393)
  logitech -1-> pro (0.22916666666666666)
    logitech -2-> x (0.15217391304347827)
      logitech -3-> superlight (0.09090909090909091)
        logitech -4-> intel (0.0)
          logitech -5-> i7 (0.0)
            logitech -6-> 8700k (0.0)
--- g ---
g -0-> logitech (0.13253012048192772)
g -0-> pro (0.11224489795918367)
  g -1-> x (0.08860759493670886)
    g -2-> superlight (0.04054054054054054)
      g -3-> intel (0.0)
        g -4-> i7 (0.0)
          g -5-> 8700k (0.0)
--- pro ---
  pro -1-> logitech (0.03559870550161812)
pro -0-> g (0.029490616621983913)
pro -0-> x (0.02556818181818182)
  pro -1-> superlight (0.009433962264150943)
    pro -2-> intel (0.010452961672473868)
      pro -3-> i7 (0.0)
        pro -4-> 8700k (0.0)
--- x ---
    x -2-> logitech (0.08641975308641975)
  x -1-> g (0.07446808510638298)
x -0-> pro (0.08490566037735849)
x -0-> superlight (0.031914893617021274)
  x -1-> intel (0.0)
    x -2-> i7 (0.0)
      

In [195]:
def seperate_products(start_word: Word):
    products = []



# A method to look ahead only 1 word until the next word probability is 0 todo where I left off, this data is breaking, I must make get_neighbor return the same thing, touple?
def traverse_to_first_order_end(word: Word):
    counter = word.position
    neighbor = word.get_neighbor(1)

    while neighbor is not None and neighbor['p'] > 0:
        print(f"Visited: {neighbor['word']}")
        neighbor = neighbor.get_neighbor(1)

In [196]:
traverse_to_first_order_end(test.sentence[0])

Visited: g


AttributeError: 'dict' object has no attribute 'get_neighbor'

In [182]:
test.sentence[0].neighbors

[{'word': 'g',
  'ref': <__main__.Word at 0x1f83e4f9e10>,
  'p': 0.21568627450980393,
  'direction': 'FORWARD'},
 {'word': 'pro',
  'ref': <__main__.Word at 0x1f83e4fb810>,
  'p': 0.22916666666666666,
  'direction': 'FORWARD'},
 {'word': 'x',
  'ref': <__main__.Word at 0x1f83e4f93d0>,
  'p': 0.15217391304347827,
  'direction': 'FORWARD'},
 {'word': 'superlight',
  'ref': <__main__.Word at 0x1f83e4fa550>,
  'p': 0.09090909090909091,
  'direction': 'FORWARD'},
 {'word': 'intel',
  'ref': <__main__.Word at 0x1f83e4fb910>,
  'p': 0.0,
  'direction': 'FORWARD'},
 {'word': 'i7',
  'ref': <__main__.Word at 0x1f83e4fad10>,
  'p': 0.0,
  'direction': 'FORWARD'},
 {'word': '8700k',
  'ref': <__main__.Word at 0x1f83e4f9c50>,
  'p': 0.0,
  'direction': 'FORWARD'}]

In [183]:
"logitech g pro x superlight intel i7 8700k"

'logitech g pro x superlight intel i7 8700k'