In [1]:
from classes import *

In [2]:
import os

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

In [3]:
# --- Run configuration -------------------------------------------------------

# Passed to Corpus(...) as its removal_threshold.
corpus_removal_threshold = 1

# Passed to corpus.create_vectorizer(...) as its removal_threshold.
vectorizer_removal_threshold = 2

# Two words are linked when their pairwise similarity score is >= this value.
grouping_similarity = 0.6

# Corpus name: read from corpora/<name>, results go to discoveries/cliques/<name>.txt
filepath = 'recipes'

In [4]:
# Load the corpus selected by `filepath`, then derive a vectorizer from it.
# Both steps take their own removal threshold from the configuration cell.
corpus = Corpus(
    filepath=f'corpora/{filepath}',
    spell_correct='corpora/spelling_dictionary',
    lemmatize=False,
    removal_threshold=corpus_removal_threshold,
)

vectorizer = corpus.create_vectorizer(
    removal_threshold=vectorizer_removal_threshold,
)

Data collected:
    - 4107 unique words found.
    - most common words: ['\n', '-', '.', '1', ',']
    - least common words: ['www', 'yet', 'zinfandel', 'zip', 'zipper']


Corpus loaded:
    - 5000 sentences.
    - longest sentence: 

<START> whole wheat yeast bread 
 
 ingredients : 
 - 1 cup ( 2 2 5 ml ) scalded milk 
 - 1 / 4 cup ( 6 0 ml ) lard 
 - 4 tsp ( 2 0 ml ) . salt 
 - 1 / 4 cup ( 6 0 ml ) honey 
 - 1 cup ( 2 2 5 ml ) water , 1 1 0 - 1 1 5 degrees 
 - 2 pkg . dry yeast 
 - 2 tbsp ( 3 0 ml ) . brown sugar 
 - 2 cups ( 4 7 5 ml ) whole wheat flour 
 - 3 cups ( 7 0 0 ml ) or more white flour 
 
 directions : 
 - mix scalded milk , lard , salt & honey and let cool to lukewarm - about 8 5 degrees ( 3 0 c . ) . 
 - mix water , yeast & brown sugar and let stand to dissolve for 2 - 3 minutes . 
 - add yeast mixture to milk mixture . 
 - add whole wheat flour and enough white flour to make dough . 
 - mix well . 
 - knead 5 - 1 0 minutes . 
 - put in bowl to rise until double in size

In [5]:
# Build the matrix of pairwise similarities between all words.
#
# For words i and j the score is (sum_v sqrt(matrix[i, v] * matrix[j, v])) ** 2.
# NOTE(review): this has the form of a squared Bhattacharyya coefficient, which
# presumes the rows of vectorizer.matrix are non-negative -- confirm.
# Despite its name, `differences` holds similarity scores (higher = more alike).

# Progress is reported about every 5% of the rows. The step is clamped to at
# least 1: with fewer than 20 words `vsize // 20` is 0 and `i % 0` would raise
# ZeroDivisionError.
twentieth = max(1, vectorizer.vsize // 20)
differences = np.zeros((vectorizer.vsize, vectorizer.vsize,))

for i in range(vectorizer.vsize):
    # Element-wise product of row i with every row of the matrix
    word_diffs = keras.ops.einsum('v,bv->bv', vectorizer.matrix[i], vectorizer.matrix)
    word_diffs = keras.ops.sqrt(word_diffs)
    # Collapse the feature axis: one score per other word
    word_diffs = keras.ops.sum(word_diffs, axis = -1)
    # Square the summed value
    word_diffs *= word_diffs
    differences[i] = word_diffs.numpy()

    if (i % twentieth == 0):
        print('twentieth')

# A word must not be linked to itself: zero the diagonal in place
# (avoids allocating two extra vsize x vsize temporaries for an identity mask).
np.fill_diagonal(differences, 0)

# Boolean adjacency mask: True where two distinct words are similar enough
similar_words = np.asarray(differences >= grouping_similarity)


twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth
twentieth


In [6]:
# Collect the indices of every True cell in the similarity mask.
# coords[0] holds the first index of each hit, coords[1] the second.
nonzero_indices = similar_words.nonzero()
coords = np.array(nonzero_indices)

# Number of hits (presumably each unordered pair shows up twice, once per
# direction -- verify that the similarity matrix is symmetric).
print(coords.shape[1])

1450


In [7]:
# https://www.geeksforgeeks.org/dsa/maximal-clique-problem-recursive-solution/

def bron_kerbosch(R, P, X, graph):
    """Yield every maximal clique of ``graph`` that extends ``R``.

    Basic (non-pivoting) Bron-Kerbosch enumeration.

    Args:
        R: set of nodes already in the clique being grown.
        P: set of candidate nodes that may still be added to ``R``.
        X: set of nodes already processed, used to reject non-maximal cliques.
        graph: mapping from node to the collection of its neighbours. Every
            node reachable from ``P`` must be a key.

    Yields:
        Sets of nodes, one per maximal clique. For the usual top-level call
        ``bron_kerbosch(set(), set(graph), set(), graph)`` isolated nodes are
        reported as cliques of size one.

    Raises:
        KeyError: if a node taken from ``P`` is missing from ``graph``.

    The caller's ``P`` and ``X`` are left untouched; the search works on
    private copies.
    """
    # Copy before mutating: the loop below pops from P and adds to X, which
    # would otherwise empty/alter the sets the caller passed in.
    P = set(P)
    X = set(X)

    # Nothing left to add and nothing excluded => R cannot be extended
    if not P and not X:
        yield R

    while P:
        v = P.pop()
        neighbours = graph[v]
        yield from bron_kerbosch(
            R.union({v}),
            P.intersection(neighbours),
            X.intersection(neighbours),
            graph
        )
        # v is fully explored; exclude it from cliques found later at this level
        X.add(v)


# Turn every (row, col) hit into an undirected edge. Node ids are 1-based
# (matrix index + 1); the cells below map back to words with idx - 1.
edges = [(int(u) + 1, int(v) + 1) for u, v in zip(coords[0], coords[1])]

# Number of nodes. Taken from the similarity matrix itself, because that is
# where the edge indices come from; sizing the graph from a different object
# (the corpus vocabulary) raises KeyError below if the two ever disagree.
n = similar_words.shape[0]

# Create an adjacency list from the edges
graph = {i: set() for i in range(1, n + 1)}
for u, v in edges:
    graph[u].add(v)
    graph[v].add(u)
# graph[x] is the set of all words that are near x

# Enumerate maximal cliques: empty R, every node as a candidate, empty X
all_cliques = list(bron_kerbosch(set(), set(graph.keys()), set(), graph))

# Drop single-word cliques and order the rest from largest to smallest
# (stable sort: equally sized cliques keep their discovery order)
all_cliques = sorted(
    (clique for clique in all_cliques if len(clique) > 1),
    key = len,
    reverse = True,
)

# The list is sorted by size, so the first entry is the longest; guard the
# case where no two words were similar enough to form a clique at all.
longest_clique_size = len(all_cliques[0]) if all_cliques else 0

print(f'Cliques Found: {len(all_cliques)}; Longest Clique: {longest_clique_size}')


Cliques Found: 282; Longest Clique: 10


In [8]:
# Debug helper (disabled): print every clique as a set of words.
# Clique members are 1-based node ids, hence the idx-1 when mapping back to words.
# for clique in all_cliques:
#     print(set(vectorizer.to_str(idx-1) for idx in clique))

In [9]:
# Write the discovered cliques, preceded by the run settings, to a text file.

# Render everything BEFORE opening the file: opening with 'w' truncates it, so
# a failure while rendering would otherwise leave an empty file behind.
clique_lines = []
for clique in all_cliques:
    # Sort the words so the file is identical from run to run (set iteration
    # order of strings changes between interpreter sessions).
    words = sorted({vectorizer.to_str(idx-1) for idx in clique})
    # Same "{'a', 'b'}" layout that str(set(...)) produces
    clique_lines.append('{' + ', '.join(repr(word) for word in words) + '}')

cliques_string = '\n'.join(clique_lines)

# Header: one setting per line, each followed by an empty line, then a
# whitespace-only separator line (25 spaces) before the cliques.
cliques_string = (
    f'Corpus Removal Threshold: {corpus_removal_threshold}\n\n'
    f'Vectorizer Removal Threshold: {vectorizer_removal_threshold}\n\n'
    f'Grouping Similarity: {grouping_similarity}\n\n'
    + ' ' * 25 + '\n'
) + cliques_string

output_path = f'discoveries/cliques/{filepath}.txt'

# Make sure the target directory exists on a fresh checkout
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Explicit encoding: the platform default may be unable to encode some words
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(cliques_string)

In [10]:
# Show every clique that contains a given word.
search_word = '0'

if not (search_word in corpus.vocab):
    raise Exception("Word not in vocabulary")

# Clique members are 1-based node ids, so shift the vectorizer index by one
word_index = vectorizer.to_int(search_word) + 1

# Keep the cliques in their existing (largest-first) order
matching_cliques = [clique for clique in all_cliques if word_index in clique]
found = len(matching_cliques) > 0

for clique in matching_cliques:
    print({vectorizer.to_str(idx-1) for idx in clique})

if not found:
    print('None found')

{'3', '7', '5', '0'}
