# Training the Poincaré embedding

In [1]:
%load_ext autoreload
%autoreload 2

import os
import logging
import numpy as np
import pandas as pd

from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations

# `display` and `HTML` are public API of `IPython.display`; importing them from
# the internal `IPython.core.display` module is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width so wide outputs fit.
display(HTML("<style>.container { width:100% !important; }</style>"))

# INFO level makes gensim's data-loading and training progress messages visible.
logging.basicConfig(level=logging.INFO)

In [2]:
from pathlib import Path

In [3]:
# Results directory for the `reviews_Cell_Phones_and_Accessories` dataset.
data_directory = Path('../results/reviews_Cell_Phones_and_Accessories/')

# Plain-string path to the aspect-rules CSV.
# NOTE(review): the variable name is a leftover and is unrelated to WordNet mammals;
# it is kept unchanged because a later cell reads it.
wordnet_mammal_file = os.fspath(data_directory / 'aspect-rules.csv')

In [9]:
# Aspect frequency table (file name indicates aspects filtered to a minimum frequency of 10).
aspect_counts_path = data_directory / 'aspects_per_edu_filtered_min_10_freq.csv'
aspect_df_min_10_times = pd.read_csv(aspect_counts_path)

# Replace whatever header the CSV carries with explicit, readable column names.
aspect_df_min_10_times.columns = ['aspect', 'count']

The model can be initialized using an iterable of relations, where a relation is simply a pair of nodes.

In [11]:
# Preview the first rows of the aspect/count table.
aspect_df_min_10_times.head()

Unnamed: 0,aspect,count
0,price,6383
1,motorola,4246
2,battery,3925
3,amazon,2805
4,sound quality,2636


In [12]:
# Load the aspect rules and keep only the two endpoint columns of each rule.
aspect_rules_df = pd.read_csv(wordnet_mammal_file)[['id1', 'id2']]

# Peek at two random rules; the fixed seed keeps the preview stable across re-runs.
aspect_rules_df.sample(2, random_state=0)

Unnamed: 0,id1,id2
1578,belkin,hp
815,quality,features


In [15]:
# Keep a rule only when both of its endpoints appear in the aspect frequency table.
frequent_aspects = aspect_df_min_10_times.aspect.values
both_frequent = aspect_rules_df.id1.isin(frequent_aspects) & aspect_rules_df.id2.isin(frequent_aspects)
aspect_rules_df = aspect_rules_df[both_frequent]

In [16]:
# Show only the first rows instead of dumping the whole filtered rule table.
aspect_rules_df.head(10)

Unnamed: 0,id1,id2
0,mike,accord
2,outlook,sidekick ii
3,bluetooth,sidekick ii
4,web browsing,sidekick ii
5,java,sidekick ii
7,amazon,sidekick ii
8,forever,sidekick ii
9,amazon,sidekick ii
13,volume,motorola
15,price,tl


In [17]:
# One (id1, id2) tuple per rule row -- the pair format PoincareModel takes as train_data.
relations = list(aspect_rules_df[['id1', 'id2']].itertuples(index=False, name=None))

In [18]:
# size=2 trains 2-dimensional embeddings (the visualization further down is a 2-D plot);
# burn_in=0 means no burn-in epochs are run before the main training.
model = PoincareModel(train_data=relations, size=2, burn_in=0)

INFO:gensim.models.poincare:loading relations from train data..
INFO:gensim.models.poincare:loaded 2282 relations from train data, 481 nodes


In [None]:
# Train for 100 epochs. Per the gensim docs, `print_every` is the number of batches
# between progress/loss log messages -- NOTE(review): confirm for the installed version.
model.train(epochs=100, print_every=500)

INFO:gensim.models.poincare:training model of size 2 with 1 workers on 2282 relations for 100 epochs and 0 burn-in epochs, using lr=0.10000 burn-in lr=0.01000 negative=10
INFO:gensim.models.poincare:starting training (100 epochs)----------------------------------------


The model can be saved and loaded using two different methods: persisting the entire `PoincareModel` instance, or exporting only its vectors in word2vec format.

In [None]:
# Saves the entire PoincareModel instance; the loaded model can be trained further.
model.save('aspect_rules_model')
# The reloaded instance is not bound to a name; as the cell's last expression it is only displayed.
PoincareModel.load('aspect_rules_model')

In [None]:
# Poincare distance between the embeddings of 'phone' and 'battery'.
model.kv.distance('phone', 'battery')

In [None]:
# Poincare distance between the embeddings of 'battery' and 'sound'.
model.kv.distance('battery', 'sound')

In [None]:
# Nodes nearest to 'battery' in the embedding space.
model.kv.most_similar('battery')

In [None]:
# Closest node to 'battery' that sits lower in the hierarchy.
model.kv.closest_child('battery')

In [None]:
# Closest node to 'battery' that sits higher in the hierarchy.
model.kv.closest_parent('battery')

In [None]:
# Chain of recursively closest parents, starting from 'phone'.
model.kv.ancestors('phone')

In [None]:
# Chain of recursively closest children, starting from 'phone'.
model.kv.descendants('phone')

In [None]:
# Saves only the vectors from the PoincareModel instance, in the commonly used word2vec format
# model.kv.save_word2vec_format('aspect_rules_vectors')
# PoincareKeyedVectors.load_word2vec_format('aspect_rules_vectors')

In [None]:
# Rank of the distance from 'phone' to 'battery', relative to the distances of all nodes from 'phone'.
model.kv.rank('phone', 'battery')

In [None]:
# Closest child node of 'sound' (nearest node lower in the hierarchy).
model.kv.closest_child('sound')

In [None]:
# Closest parent node of 'bluetooth' (nearest node higher in the hierarchy).
model.kv.closest_parent('bluetooth')

In [None]:
# # Position in hierarchy - lower values represent that the node is higher in the hierarchy
# print(model.kv.norm('virginia_deer.n.01'))
# print(model.kv.norm('sheep.n.01'))
# print(model.kv.norm('dog.n.01'))
# print(model.kv.norm('placental.n.01'))
# print(model.kv.norm('mammal.n.01'))

In [None]:
# Difference in hierarchy between the first node and the second node.
# Positive values indicate the first node is higher in the hierarchy.
# Left as the cell's last expression (no print) so it is shown as the cell output,
# consistent with the other query cells.
model.kv.difference_in_hierarchy('phone', 'battery')

In [None]:
# Hierarchy difference between 'bluetooth' and 'headset'; a positive value means 'bluetooth' is higher.
model.kv.difference_in_hierarchy('bluetooth', 'headset')

In [None]:
# One possible descendant chain: recursively closest children, starting from 'sound'.
model.kv.descendants('sound')

In [None]:
# One possible ancestor chain: recursively closest parents, starting from 'sound'.
model.kv.ancestors('sound')

# Visualization

In [None]:
from gensim.viz.poincare import poincare_2d_visualization, poincare_distance_heatmap

In [None]:
# Deduplicate the relation pairs (the rule table repeats some, e.g. amazon / sidekick ii).
# Going through a set does not preserve the original order.
all_relations = list(set(relations))

In [None]:
# Aspects to label in the plot.
show_node_labels = ['phone', 'battery', 'sound', 'bluetooth', 'headset', 'price', 'disc']

# Relations whose two endpoints are both among the labelled aspects.
filtered_set = {
    pair
    for pair in all_relations
    if pair[0] in show_node_labels and pair[1] in show_node_labels
}

In [None]:
# Inspect which relations survived the label filter.
filtered_set

In [102]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [103]:
# connected=True makes the notebook load plotly.js from the online CDN rather than from the local package.
init_notebook_mode(connected=True)

In [104]:
# Build the 2-D Poincare plot: edges come from the filtered relations and only the
# selected aspects get text labels.
fig = poincare_2d_visualization(
    model,
    filtered_set,
    "Poincare Hierarchy",
    show_node_labels=show_node_labels,
)

In [105]:
# Render the figure inline in the notebook.
iplot(fig)

In [87]:
# Render the figure and also export it as a PNG. plotly only honours `filename` when
# `image` is set, and expects the name without its extension.
iplot(fig, image='png', filename='poincare_viz')