In [114]:
import pandas as pd
import numpy as np
import nltk
from collections import defaultdict
from collections import Counter
from src.classes.database.offer import MongoOffer
from tqdm import tqdm
from enum import Enum
import json

tokenizer = nltk.RegexpTokenizer(r'\w+')

In [105]:
token_list = [tokenizer.tokenize(x.selling) for x in MongoOffer.objects()]
tagged = nltk.pos_tag(token_list[0])

tag = Enum('tag', ['BRAND'])

In [106]:
def get_all_words(token_list: list[list]) -> list:
    words = []
    for sentence in token_list:
        for word in sentence:
            words.append(word)
    return sorted(set(words))

def next_word_occurrence(key: str, look_ahead: int, tokens: list[list]) -> dict:
    counter = defaultdict(int)
    for sentence in tokens:
        for x in range(0, len(sentence)):
            if key == sentence[x]: #if we find the key in a sentence
                if (x+look_ahead) < len(sentence):
                    counter[sentence[x+look_ahead]] += 1
    return counter

def p_next_word(given: str, looking: str, look_ahead: int, tokens: list[list]) -> float:
    nwo: dict = next_word_occurrence(given, look_ahead, tokens)
    return nwo[looking] / sum(nwo.values()) if nwo[looking] != 0 else 0

def calculate_all_probabilities(tokens: list[list], look_ahead) -> dict:
    all_words = get_all_words(tokens)

    #Generating a list of all words
    word_canvas = defaultdict()
    for word in all_words:
        word_canvas[word] = None

    p_word = word_canvas.copy()
    for word in tqdm(p_word):
        p_word[word] = [defaultdict(float) for x in range(0,look_ahead)]
        for x in range(0,look_ahead):
            for sub_word in all_words:
                p = p_next_word(word, sub_word, x+1, tokens)
                if p > 0:
                    p_word[word][x][sub_word] = p_next_word(word, sub_word, x+1, tokens)
                #print(f"{word} | {str(x)} | {sub_word}: {p_word[word][x][sub_word]}")
    return p_word


In [107]:
data = calculate_all_probabilities(token_list, 1)

100%|██████████| 1787/1787 [11:47<00:00,  2.53it/s]


In [113]:
data['i7']

[defaultdict(float,
             {'10700k': 0.05555555555555555,
              '10870h': 0.027777777777777776,
              '11800h': 0.05555555555555555,
              '1265u': 0.027777777777777776,
              '12700h': 0.027777777777777776,
              '12700k': 0.2222222222222222,
              '12700t': 0.027777777777777776,
              '12th': 0.05555555555555555,
              '13700k': 0.08333333333333333,
              '16gb': 0.08333333333333333,
              '6850k': 0.027777777777777776,
              '7700hq': 0.027777777777777776,
              '7700k': 0.05555555555555555,
              '8086k': 0.027777777777777776,
              '8700k': 0.08333333333333333,
              '8700t': 0.027777777777777776,
              '9700': 0.027777777777777776,
              '9700k': 0.027777777777777776,
              'upgraded': 0.027777777777777776})]

In [115]:
# Serializing json
json_object = json.dumps(data, indent=4)

# Writing to sample.json
with open("probabilities.json", "w") as outfile:
    outfile.write(json_object)

In [None]:
class Sentence:
    def __init__(self):


class Word:
    def __init__(self, word: str):
        self.word = word
        self.p_of_next_word = [next_word_occurrence(word, x, token_list) for x in range(1,6)]


In [None]:
test = Word('i5')

In [None]:
test.p_of_next_word[1]

p_of_next_word['word']['looking_for_word'][INDEX HERE OF LOOKAHEAD]
return object like {occurrences: 1, probability: 0.5}

word1
    looking_for1
        0: {occurenecs:1, probability: 0.5}
        1:  {occurenecs:1, probability: 0.5}
        2:  {occurenecs:0, probability: 5}
    looking_for2