# Imports and settings

In [1]:
%load_ext autoreload
%autoreload 2

import os
import logging
import numpy as np
import pandas as pd

from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

logging.basicConfig(level=logging.INFO)

In [2]:
from pathlib import Path

In [3]:
import spacy 
nlp = spacy.load('en')

In [4]:
from tqdm import tqdm  # for notebooks
tqdm.pandas()

# Load data and models

In [5]:
data_directory = Path('../results/reviews_Cell_Phones_and_Accessories/')
aspect_rules_file = os.path.join(data_directory, 'aspect-rules.csv')

In [6]:
aspect_df_min_10_times = pd.read_csv(data_directory / 'aspects_per_edu_filtered_min_10_freq.csv')
aspect_df_min_10_times.columns = ['aspect', 'count']

The model can be initialized using an iterable of relations, where a relation is simply a pair of nodes

In [9]:
aspect_df_min_10_times.shape

(946, 2)

In [7]:
aspect_df_min_10_times.head()

Unnamed: 0,aspect,count
0,price,6383
1,motorola,4246
2,battery,3925
3,amazon,2805
4,sound quality,2636


## Unique aspects

In [8]:
len(aspect_df_min_10_times.aspect.unique())

946

## Get relations 

In [10]:
aspect_rules_df = pd.read_csv(aspect_rules_file)
aspect_rules_df = aspect_rules_df[['id1', 'id2']]
aspect_rules_df.sample(2)

Unnamed: 0,id1,id2
2012,phone case,boxwave
2363,screen,nokia


In [12]:
aspect_rules_df.shape

(3651, 2)

## Add aspects from ConceptNet 

In [13]:
from aspects.enrichments import conceptnets

In [14]:
conceptnet = conceptnets.load_conceptnet_io()

INFO:aspects.enrichments.conceptnets:ConceptNet.io temp files will be load from: /home/laugustyniak/github/phd/sentiment-backend/aspects/data/conceptnet/conceptnet_io.pkl


 'car' -> 'HasPrerequisite' -> 'tire'

In [15]:
conceptnet['tire']

[{'end': 'active',
  'end-lang': 'en',
  'relation': 'Antonym',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 0.15},
 {'end': 'bus_depot',
  'end-lang': 'en',
  'relation': 'AtLocation',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'car',
  'end-lang': 'en',
  'relation': 'AtLocation',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'car_show',
  'end-lang': 'en',
  'relation': 'AtLocation',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'garage',
  'end-lang': 'en',
  'relation': 'AtLocation',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'trunk',
  'end-lang': 'en',
  'relation': 'AtLocation',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'get_leak',
  'end-lang': 'en',
  'relation': 'CapableOf',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'need_attention_soon',
  'end-lang': 'en',
  'relation': 'CapableOf',
  'start': 'tire',
  'start-lang': 'en'

In [16]:
hierarchical_relations_parent_child = ['HasA', 'MadeOf', 'HasPrerequisite']
hierarchical_relations_child_parent = ['PartOf', 'IsA']

In [17]:
def get_neighbours_child_and_parents(conceptnet, concept, relation_types_get_child, relation_types_get_parent):
    neighbours_childs = set(
        (concept, concept_info['relation'], "---->", concept_info['end'])
        for concept_info
        in conceptnet[concept]
        if concept_info['relation'] in relation_types_get_child
    )
    neighbours_parents = set(
        (concept_info['start'], concept_info['relation'],  "<----", concept)
        
        for concept_info
        in conceptnet[concept]
        if concept_info['relation'] in relation_types_get_parent
    )

    return list(neighbours_childs.union(neighbours_parents))

In [18]:
get_neighbours_child_and_parents(conceptnet, 'tire', hierarchical_relations_child_parent, hierarchical_relations_parent_child)

[('tire', 'IsA', '---->', 'part_for_wheeled_vehicles'),
 ('car', 'HasPrerequisite', '<----', 'tire'),
 ('tire', 'IsA', '---->', 'hoop'),
 ('tire', 'IsA', '---->', 'indispose'),
 ('tire', 'IsA', '---->', 'non_powered_device'),
 ('tire', 'PartOf', '---->', "car's_wheel"),
 ('tire', 'IsA', '---->', 'tire'),
 ('tire', 'IsA', '---->', 'wheel'),
 ('tire', 'PartOf', '---->', 'car'),
 ('tire', 'IsA', '---->', 'devolve'),
 ('tire', 'MadeOf', '<----', 'tire'),
 ('tire', 'PartOf', '---->', 'automobile')]

In [19]:
def get_concepts_based_on_hierarchy_relation(conceptnet, concept, relation_types_get_child, relation_types_get_parent):
    neighbours_childs = set(
        (concept_info['end'].replace('_', ' '), concept)
        for concept_info
        in conceptnet[concept]
        if concept_info['relation'] in relation_types_get_child
    )
    neighbours_parents = set(
        (concept_info['start'].replace('_', ' '), concept)
        for concept_info
        in conceptnet[concept]
        if concept_info['relation'] in relation_types_get_parent
    )

    return list(neighbours_childs.union(neighbours_parents))

In [20]:
get_concepts_based_on_hierarchy_relation(conceptnet, 'battery', hierarchical_relations_child_parent, hierarchical_relations_parent_child)

[('electrical component', 'battery'),
 ('directed route', 'battery'),
 ('using calculator', 'battery'),
 ('battery', 'battery'),
 ('voltage source', 'battery'),
 ('source of current', 'battery'),
 ('device stores energy chemically', 'battery'),
 ('solar energy system', 'battery'),
 ('collection', 'battery'),
 ('electrical device', 'battery'),
 ('artillery', 'battery'),
 ('stamp mill', 'battery'),
 ('team', 'battery'),
 ('man made thing', 'battery'),
 ('start car', 'battery'),
 ('baseball team', 'battery'),
 ('assault', 'battery'),
 ('electrochemical cell', 'battery'),
 ('power source', 'battery')]

## Filter rules and aspects

In [21]:
aspect_rules_df = aspect_rules_df[aspect_rules_df.id1.isin(aspect_df_min_10_times.aspect.values) & aspect_rules_df.id2.isin(aspect_df_min_10_times.aspect.values)]

In [22]:
len(aspect_df_min_10_times.aspect.values)

946

In [23]:
# aspect_rules_df.drop_duplicates(inplace=True)

In [24]:
relations = list(zip(aspect_rules_df.id2, aspect_rules_df.id1))

In [25]:
len(relations)

2282

In [22]:
# relations

### Add concepts from conceptnet 

In [23]:
from more_itertools import flatten

In [24]:
conceptnet_relations = list(flatten([
    get_concepts_based_on_hierarchy_relation(conceptnet, aspect, hierarchical_relations_child_parent, hierarchical_relations_parent_child)
    for aspect
    in tqdm(aspect_df_min_10_times.aspect.values)
]))

In [25]:
len(conceptnet_relations)

3099

In [26]:
all_relations = relations + conceptnet_relations

In [27]:
len(all_relations)

4650

## Filter aspects that are Named Entities [wrongly extracted during aspects extraction]

In [28]:
def is_named_entity(text):
    return True if nlp(text).ents else False

In [29]:
def is_named_entity(texts):
    return [
        text
        for text in texts
        if not nlp(text).ents   
    ]

In [30]:
nlp('amazon.com').ents

()

In [31]:
is_named_entity((
    'amazon',
    'tire'
))

['amazon', 'tire']

In [32]:
# aspect_rules_df = aspect_rules_df[~aspect_rules_df.progress_apply(lambda row: any([is_named_entity(row.id1), is_named_entity(row.id2)]), axis=1)]

In [33]:
# aspect_rules_df

## Train or load pre-trained embedding

In [34]:
model = PoincareModel(train_data=all_relations, size=2, burn_in=0)

INFO:gensim.models.poincare:loading relations from train data..
INFO:gensim.models.poincare:loaded 4650 relations from train data, 2707 nodes


In [36]:
model.train(epochs=100, print_every=500)

INFO:gensim.models.poincare:training model of size 2 with 1 workers on 4650 relations for 100 epochs and 0 burn-in epochs, using lr=0.10000 burn-in lr=0.01000 negative=10
INFO:gensim.models.poincare:starting training (100 epochs)----------------------------------------
INFO:gensim.models.poincare:training finished


In [37]:
# Saves the entire PoincareModel instance, the loaded model can be trained further
model.save('aspect_rules_model_with_conceptnet_relations')

INFO:gensim.utils:saving PoincareModel object under aspect_rules_model_with_conceptnet_relations, separately None
INFO:gensim.utils:not storing attribute _node_probabilities
INFO:gensim.utils:not storing attribute _node_counts_cumsum
INFO:gensim.utils:saved aspect_rules_model_with_conceptnet_relations


In [38]:
model = PoincareModel.load('aspect_rules_model_with_conceptnet_relations')

INFO:gensim.utils:loading PoincareModel object from aspect_rules_model_with_conceptnet_relations
INFO:gensim.utils:loading kv recursively from aspect_rules_model_with_conceptnet_relations.kv.* with mmap=None
INFO:gensim.utils:setting ignored attribute _node_probabilities to None
INFO:gensim.utils:setting ignored attribute _node_counts_cumsum to None
INFO:gensim.utils:loaded aspect_rules_model_with_conceptnet_relations


# Experiment with embeddings distances and other attributes

In [80]:
model.kv.distance('battery life', 'battery')

0.6535688230830381

In [83]:
model.kv.distance('case', 'battery')

0.9756102605956604

In [39]:
model.kv.distance('phone', 'battery')

0.7873491157117497

In [40]:
model.kv.distance('battery', 'phone')

0.7873491157117497

In [41]:
model.kv.distance('battery', 'sound')

0.9831219229245527

In [42]:
model.kv.most_similar('battery')

[('extended life battery', 0.03928540762000633),
 ('durability', 0.06150953437530923),
 ('bluetooth connection', 0.11546349192917825),
 ('vonage', 0.12504089675208713),
 ('car charger', 0.1310651807269191),
 ('alltel', 0.13405388030240115),
 ('krzr', 0.16060392675240817),
 ('never', 0.16268109612134493),
 ('voice dial', 0.19329367543601206),
 ('at & t', 0.2023409538045585)]

In [43]:
model.kv.closest_child('battery')

'extended life battery'

In [44]:
model.kv.closest_parent('battery')

'durability'

In [45]:
model.kv.ancestors('phone')

['touch screen',
 'pictures',
 'functions',
 'ear speaker',
 'interface',
 'qwerty',
 'features',
 'wap',
 'sync',
 'ericson',
 'samsung',
 'camera quality',
 'consume',
 'electric blanket',
 'cord',
 'fashionable textile',
 'volume unit',
 'earbuds',
 'adapter',
 'earphone',
 'protocol stack layer',
 'mac']

In [46]:
model.kv.descendants('phone')

['motorola droid', 'settings', 'color screen', 'cingular', 'sd']

In [47]:
# Saves only the vectors from the PoincareModel instance, in the commonly used word2vec format
# model.kv.save_word2vec_format('aspect_rules_vectors')
# PoincareKeyedVectors.load_word2vec_format('aspect_rules_vectors')

In [48]:
# Rank of distance of node 2 from node 1 in relation to distances of all nodes from node 1
model.kv.rank('phone', 'battery')

133

In [49]:
# Closest child node
model.kv.closest_child('sound')

'blackberry curve'

In [50]:
# Closest child node
model.kv.closest_parent('bluetooth')

'screen quality'

In [51]:
# Difference in hierarchy between the first node and the second node
# Positive values indicate the first node is higher in the hierarchy
print(model.kv.difference_in_hierarchy('phone', 'battery'))

0.0894106380464228


In [52]:
print(model.kv.difference_in_hierarchy('battery', 'battery life'))

-0.2588433525831279


In [53]:
model.kv.difference_in_hierarchy('bluetooth', 'headset')

0.4335900804303457

In [54]:
model.kv.difference_in_hierarchy('charger', 'phone')

0.14925883625996417

In [55]:
model.kv.difference_in_hierarchy('signal', 'phone')

-0.07389436421619844

In [56]:
# One possible descendant chain
model.kv.descendants('sound')

['blackberry curve', 'priced', 'scosche', 'cardo scala', 'controls']

In [57]:
# One possible ancestor chain
model.kv.ancestors('sound')

['body glove',
 'sony',
 'plug',
 'droid',
 'rma',
 'dc',
 'quantity',
 'military company',
 'blackberry 7100',
 'scala',
 'mac']

# Visualization

In [58]:
from gensim.viz.poincare import poincare_2d_visualization, poincare_distance_heatmap

In [59]:
all_relations = list(set(all_relations))

In [74]:
show_node_labels = ['battery', 'sound', 'bluetooth', 'headset', 'price', 'battery life', 'charging', 'charger', 'earphone']
filtered_set = set()
for relation in all_relations:
    if relation[0] in show_node_labels and relation[1] in show_node_labels:
        filtered_set.add(relation)

In [75]:
filtered_set

{('battery', 'battery'),
 ('battery', 'battery life'),
 ('battery', 'charger'),
 ('battery', 'price'),
 ('battery life', 'bluetooth'),
 ('battery life', 'sound'),
 ('bluetooth', 'battery life'),
 ('bluetooth', 'price'),
 ('bluetooth', 'sound'),
 ('charger', 'price'),
 ('charger', 'sound'),
 ('headset', 'bluetooth'),
 ('headset', 'headset'),
 ('price', 'battery'),
 ('price', 'battery life'),
 ('price', 'bluetooth'),
 ('price', 'charger'),
 ('price', 'headset'),
 ('price', 'price'),
 ('price', 'sound'),
 ('sound', 'battery life'),
 ('sound', 'price'),
 ('sound', 'sound')}

In [76]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [77]:
init_notebook_mode(connected=True)

In [78]:
fig = poincare_2d_visualization(model, filtered_set, "Poincare Hierarchy", show_node_labels=show_node_labels)

In [79]:
iplot(fig)

In [66]:
iplot(fig, filename='poincare_viz_with_conceptnet_relations.png')