In [1]:
import os

import gensim
import numpy as np
import pandas as pd
from neo4j import GraphDatabase

import eec

In [2]:
# Load the saved embeddings from disk through gensim's `KeyedVectors.load`.
# NOTE(review): later cells read `keyed_vectors.wv`, which suggests the file holds a
# full Word2Vec model rather than bare KeyedVectors — confirm, and if so prefer
# `gensim.models.Word2Vec.load` so the loader matches what is actually stored.
keyed_vectors = gensim.models.KeyedVectors.load('./data/word2vec.model')

In [3]:
# Sanity-check the embeddings: show the nearest neighbours of 'lettuce' together
# with their similarity scores (the result is a list of (word, score) tuples).
keyed_vectors.wv.most_similar('lettuce')

[('romaine', 0.8329761028289795),
 ('head', 0.7326541543006897),
 ('boston', 0.7135635614395142),
 ('cabbage', 0.7096205353736877),
 ('rack', 0.6752064228057861),
 ('spring', 0.6698594093322754),
 ('turnip', 0.6696361303329468),
 ('pig', 0.6667503714561462),
 ('cup', 0.6314897537231445),
 ('kaffir', 0.6175163388252258)]

In [4]:
# Connection settings are read from the environment so that real credentials never
# have to be committed with the notebook. The fallbacks reproduce the original
# local demo setup, so the cell still works unchanged on a default dev machine.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "test")

# Bolt driver shared by the entity repository created in the next cell.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [5]:
# Entity store backed by the Neo4j driver; it also receives the loaded embeddings.
entity_repository = eec.Neo4JEntityRepository(keyed_vectors=keyed_vectors, driver=driver)

In [6]:
# Describe one demo entity for the mention 'lettuce', carrying the embedding looked
# up from the loaded vectors. The id is passed as an empty string; the recorded
# output shows the stored entity with a UUID-style id.
lettuce_entity = eec.Neo4JEntity(
    entity_id='',
    entity_source='demo',
    entity_source_id='0',
    mention='lettuce',
    mention_vector=keyed_vectors.wv['lettuce'],
    has_mention_vector=True,
)

# Insert it; the call's return value is displayed as the cell output.
entity_repository.add_entity(lettuce_entity)

6b165ea3-9812-47dc-a322-6783010a36e1-lettuce

In [7]:
# List everything currently stored to confirm that the 'lettuce' entity was inserted.
entity_repository.get_all_entities()

[6b165ea3-9812-47dc-a322-6783010a36e1-lettuce]

In [8]:
# Batch-insert two more demo entities. Each one pairs a source id with a mention
# and carries that mention's embedding; ids are again passed as empty strings.
entity_repository.add_entities([
    eec.Neo4JEntity(
        entity_id='',
        entity_source='demo',
        entity_source_id=source_id,
        mention=mention,
        mention_vector=keyed_vectors.wv[mention],
        has_mention_vector=True,
    )
    for source_id, mention in (('1', 'apple'), ('2', 'banana'))
])

In [10]:
# List the stored entities again; the recorded output now shows lettuce, apple and banana.
entity_repository.get_all_entities()

[6b165ea3-9812-47dc-a322-6783010a36e1-lettuce,
 1caea45f-8fa4-4158-b09b-c387771966f9-apple,
 db15c83d-532e-472a-b154-435c3d69c9a2-banana]

In [13]:
# Fetch a single entity by id. The literal is the UUID part of the 'lettuce'
# entity shown in the outputs above.
# NOTE(review): that id was produced by one particular run — presumably a fresh
# insert yields a different id, so this lookup will not reproduce on a clean
# database; confirm and derive the id from the inserted entity instead.
entity_repository.get_entity_by_id('6b165ea3-9812-47dc-a322-6783010a36e1')

6b165ea3-9812-47dc-a322-6783010a36e1-lettuce

In [21]:
# Ask the repository for one entity via `get_random_unlabeled_entity`; going by
# the method name the pick is random, so the displayed entity can differ per run.
entity_repository.get_random_unlabeled_entity()

6b165ea3-9812-47dc-a322-6783010a36e1-lettuce

In [22]:
# Same as the single-entity call, but requesting 2 entities; the recorded output is a list of two.
entity_repository.get_random_unlabeled_entities(2)

[6b165ea3-9812-47dc-a322-6783010a36e1-lettuce,
 db15c83d-532e-472a-b154-435c3d69c9a2-banana]

In [23]:
# Trigger vector calculation for the stored entities; the call displays no output.
# NOTE(review): presumably this fills in vectors for entities added without one — confirm against eec.
entity_repository.calculate_all_entity_vectors()

In [29]:
# Load the food.com phrase table from CSV and display it. The 'phrase' column is
# what later becomes the entity mention.
# NOTE(review): the 'Unnamed: 0' column in the output looks like a row index that
# was written into the CSV — confirm; if so, `index_col=0` would drop the duplicate.
food_com_df = pd.read_csv('./data/all.csv')
food_com_df

Unnamed: 0,id,phrase,target,id_phrase
0,4308,"medium heads bibb or red leaf lettuce, washed,...",lettuce,0
1,4308,mixed baby lettuces and spring greens,lettuce,1
2,4308,romaine lettuce leaf,lettuce,2
3,4308,iceberg lettuce leaf,lettuce,3
4,4308,red romaine lettuce,lettuce,4
...,...,...,...,...
11654,6702,soybeans,soybean,11654
11655,3318,goose,goose,11655
11656,47,ajwain,ajwain,11656
11657,750,brinjals,brinjal,11657


In [30]:
# Wrap every phrase of the food.com table in an entity. The DataFrame row label is
# used (as a string) for both the entity id and the source id. Unlike the demo
# entities, no mention vector is supplied here.
entities: list[eec.Neo4JEntity] = [
    eec.Neo4JEntity(
        entity_id=str(row_label),
        entity_source='food_com',
        entity_source_id=str(row_label),
        mention=phrase,
    )
    for row_label, phrase in food_com_df['phrase'].items()
]

# Store the whole batch in one call.
entity_repository.add_entities(entities)

In [8]:
# Cluster store that starts with no clusters and is linked to the entity repository.
cluster_repository = eec.BaseClusterRepository(clusters=[], entity_repository=entity_repository)

In [9]:
# Clustering method that is handed both repositories, registered under a fixed name.
mention_clustering_method = eec.BaseMentionClusteringMethod(
    cluster_repository=cluster_repository,
    entity_repository=entity_repository,
    name='mention_clustering_method',
)

In [10]:
# Register the repositories and the clustering method on the bridge. Later cells
# read them back through a fresh `eec.EntityClustererBridge()` call, so the bridge
# is expected to share its state across calls.
# NOTE(review): assumes every EntityClustererBridge() call yields the same shared state — confirm.
bridge = eec.EntityClustererBridge()
bridge.set_cluster_repository(cluster_repository)
bridge.set_entity_repository(entity_repository)
bridge.set_mention_clustering_method(mention_clustering_method)

In [11]:
# Pull one unlabeled entity through the bridge's entity repository; it is the
# input for the clustering lookup further down.
entity = eec.EntityClustererBridge().entity_repository.get_random_unlabeled_entity()

In [12]:
# Display the entity that was drawn (rendered as "<id>-<mention>" in the recorded output).
entity

30-romaine lettuce

In [13]:
# Ask the configured clustering method which existing clusters could fit `entity`.
# The cluster repository was created empty, and the recorded run logs the
# fallback messages and returns an empty list.
eec.EntityClustererBridge().mention_clustering_method.getPossibleClusters(entity)

No cluster vectors found Fallback to only string similarity
Fallback failed! Create new cluster!


[]

In [22]:
cluster_repository.add_cluster(eec.BaseCluster(
    cluster_id='2',
    cluster_name='dip sauce',
    entities=[],
))

Cluster 1-dip sauce

In [23]:
cluster_repository.add_entity_to_cluster('2', entity2.entity_id)

NotFoundException: Cluster with id {cluster_id} not found.

In [19]:
entity2 = eec.EntityClustererBridge().entity_repository.get_random_unlabeled_entity()

In [20]:
entity2

8883-spinach dip

In [21]:
eec.EntityClustererBridge().mention_clustering_method.getPossibleClusters(entity2)

[Cluster 0-cabbage]