In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json

from pathlib import Path

import pandas as pd
import plotly
from more_itertools import flatten
from sklearn.cluster import AgglomerativeClustering
plotly.offline.init_notebook_mode(connected=True)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from gensim.models import fasttext

%matplotlib inline

In [3]:
from aspects.analysis import draw_embeddings

In [15]:
import spacy
nlp = spacy.load('en_vectors_web_lg')
# nlp = spacy.load('en')

# Load prodigy manually created aspect lexicons

In [25]:
PRODIGY_PATH = Path('/home/laugustyniak/github/nlp/prodigy-trainings/')

In [26]:
aspect_jsonls = list(PRODIGY_PATH.glob('*.jsonl'))

In [27]:
aspect_jsonls

[PosixPath('/home/laugustyniak/github/nlp/prodigy-trainings/connection-aspects-patterns.jsonl'),
 PosixPath('/home/laugustyniak/github/nlp/prodigy-trainings/screen-aspects-patterns.jsonl'),
 PosixPath('/home/laugustyniak/github/nlp/prodigy-trainings/cpu-aspects-patterns.jsonl'),
 PosixPath('/home/laugustyniak/github/nlp/prodigy-trainings/sound-aspects-patterns.jsonl'),
 PosixPath('/home/laugustyniak/github/nlp/prodigy-trainings/memory-aspects-patterns.jsonl'),
 PosixPath('/home/laugustyniak/github/nlp/prodigy-trainings/battery-aspects-patterns.jsonl'),
 PosixPath('/home/laugustyniak/github/nlp/prodigy-trainings/price-aspects-patterns.jsonl'),
 PosixPath('/home/laugustyniak/github/nlp/prodigy-trainings/technical-support-aspects-patterns.jsonl')]

In [28]:
aspects_prodigy = []
for j in aspect_jsonls:
    print(j)
    with open(j.as_posix(), 'r') as lines:
        for line in lines:
            aspects_prodigy.append(json.loads(line))

/home/laugustyniak/github/nlp/prodigy-trainings/connection-aspects-patterns.jsonl
/home/laugustyniak/github/nlp/prodigy-trainings/screen-aspects-patterns.jsonl
/home/laugustyniak/github/nlp/prodigy-trainings/cpu-aspects-patterns.jsonl
/home/laugustyniak/github/nlp/prodigy-trainings/sound-aspects-patterns.jsonl
/home/laugustyniak/github/nlp/prodigy-trainings/memory-aspects-patterns.jsonl
/home/laugustyniak/github/nlp/prodigy-trainings/battery-aspects-patterns.jsonl
/home/laugustyniak/github/nlp/prodigy-trainings/price-aspects-patterns.jsonl
/home/laugustyniak/github/nlp/prodigy-trainings/technical-support-aspects-patterns.jsonl


In [29]:
prodigy_aspects_df = pd.DataFrame([
    {
        'label': label_pattern['label'],
        'term': label_pattern['pattern'][0]['lower']
    } 
    for label_pattern 
    in aspects_prodigy
])

In [30]:
prodigy_aspects_df

Unnamed: 0,label,term
0,CONNECTION,bluetooth
1,CONNECTION,ethernet
2,CONNECTION,wlan
3,CONNECTION,connection
4,CONNECTION,signal
5,CONNECTION,signal range
6,CONNECTION,network
7,CONNECTION,wifi
8,CONNECTION,usb
9,CONNECTION,wireless


In [31]:
prodigy_aspects_df['embedding'] = prodigy_aspects_df.term.apply(lambda text: nlp(text).vector)

In [32]:
prodigy_aspects_df['tooltip'] = prodigy_aspects_df.label + ' : ' +  prodigy_aspects_df.term

In [33]:
prodigy_aspects_df['cluster'] = AgglomerativeClustering(n_clusters=len(prodigy_aspects_df.label.unique())).fit(prodigy_aspects_df.embedding.tolist()).labels_

In [34]:
draw_embeddings.draw(draw_embeddings.get_tsne(prodigy_aspects_df, text='term', tooltip_col='tooltip'))

# Load SemEval dataset with aspects and categories

In [5]:
from utilities.settings import SEMEVAL_DATASETS_2016

In [6]:
semeval_df = pd.read_csv(SEMEVAL_DATASETS_2016 / 'all-entities-and-aspects.csv')

In [9]:
semeval_df.target = semeval_df.target.apply(lambda t: t.lower() if isinstance(t, str) else t)

In [16]:
semeval_df.dropna(how='all', inplace=True)

In [24]:
semeval_df.groupby(['category', 'target']).count().to_excel('restaurant-aspect-categories-and-targets.xlsx')

In [6]:
with open('../results/reviews_Cell_Phones_and_Accessories/aspects_per_edu.json') as f:
    aspects = json.load(f)

In [7]:
from bert_embedding import BertEmbedding

In [8]:
# model = fasttext.load_facebook_model('/home/laugustyniak/data/embeddings/cc.en.300.bin')

In [9]:
aspects['0']

['phone case']

In [10]:
bert_embedding = BertEmbedding(model='bert_24_1024_16', dataset_name='book_corpus_wiki_en_cased')

In [11]:
bert_embedding(['battery life'])[0][1][0].shape

(1024,)

In [12]:
len(aspects)

114536

In [13]:
aspects_flatten = list(flatten(aspects.values()))

In [14]:
aspects_df = pd.DataFrame(aspects_flatten, columns=['aspect'])

In [15]:
aspects_unique_df = pd.DataFrame.from_dict(aspects_df.aspect.value_counts())

In [16]:
aspects_unique_filtered_df = aspects_unique_df[aspects_unique_df.aspect >= 10]

In [17]:
aspects_unique_filtered_df = aspects_unique_filtered_df.applymap(lambda x: x.encode('unicode_escape').decode('utf-8') if isinstance(x, str) else x)

In [18]:
aspects_unique_filtered_df.to_csv('../results/reviews_Cell_Phones_and_Accessories/aspects_per_edu_filtered_min_10_freq.csv')

In [19]:
aspects_unique_filtered_df = aspects_unique_filtered_df.reset_index()
aspects_unique_filtered_df.columns = ['aspect', 'count']
aspects_unique_filtered_df.head(10)

Unnamed: 0,aspect,count
0,price,6383
1,motorola,4246
2,battery,3925
3,amazon,2805
4,sound quality,2636
5,charger,2605
6,screen,2506
7,quality,2350
8,sound,2317
9,use,2169


In [20]:
import numpy as np

In [21]:
# def get_vectors(text):
#     return nlp(text).vector

# def get_vectors(text):
#     return np.mean([model.wv[token.text] for token in nlp(text)], axis=0)

def get_vectors(text):
    return np.mean([bert_embedding([text])[0][1][0] for token in nlp(text)], axis=0)

In [22]:
get_vectors('test and there')

array([-0.6021834 , -0.90436697, -0.05011599, ..., -0.5272503 ,
        0.2523726 , -0.39430416], dtype=float32)

In [23]:
aspects_unique_filtered_df['embedding'] = aspects_unique_filtered_df.aspect.apply(get_vectors)

In [24]:
hierarchical_cluster = AgglomerativeClustering(n_clusters=10).fit(aspects_unique_filtered_df.embedding.tolist())
aspects_unique_filtered_df['cluster'] = hierarchical_cluster.labels_

In [25]:
aspects_unique_filtered_df

Unnamed: 0,aspect,count,embedding,cluster
0,price,6383,"[-0.4442973, -1.0091343, 0.060599357, -0.11182...",1
1,motorola,4246,"[-0.25731423, -0.4119842, -0.2841267, -0.03631...",5
2,battery,3925,"[-0.31355655, -1.0403382, -0.00390834, -0.0332...",1
3,amazon,2805,"[-0.807282, -0.48361394, -0.04476956, 0.025126...",2
4,sound quality,2636,"[-0.14678049, -0.3433536, -0.41184878, -0.7414...",0
5,charger,2605,"[-0.5523349, -0.75383353, -0.061513424, 0.1178...",5
6,screen,2506,"[-0.28711498, -0.83723056, -0.012677126, -0.13...",1
7,quality,2350,"[-0.08396894, -0.008126093, 0.057925284, -0.07...",0
8,sound,2317,"[-0.44968295, -0.9441416, 0.007318586, -0.0798...",1
9,use,2169,"[-0.21992154, -0.71526617, 0.03515914, 0.17506...",1


In [26]:
manual_clusters = {
    1001: ['screen', 'monitor', 'lcd'],
    1002: ['phone', 'telephone', 'mobile phone'],
    1003: ['value', 'price', 'cost', 'expensive', 'cheap', 'money'],
    1004: ['volume', 'sound', 'voice', 
#            'echo' za daleko od innych
          ],
    1005: [],
    1006: ['bluetooth', 'usb', 'port', 'connection', 'wifi', 'network', 'signal range', 'signal'],
    1007: ['cpu', 'processor'],
    1008: ['memory', 'hard drive', 'hard disk', 'floppy disc', 'ssd drive', 'ram', 'rom', 'sd card', 'memory card', 'usb stick', 'pendrive'],
    1009: ['battery', 'charger', 'power', 'charging', 'power plug', 'power bank'],
    1010: ['technical documentation', 'support' ,'update']
}

In [27]:
manual_clusters_tuples = [
    (aspect, cluster_id)
    for cluster_id, aspects
    in manual_clusters.items()
    for aspect in aspects 
]

In [28]:
manual_clusters_df = pd.DataFrame(manual_clusters_tuples, columns=['aspect', 'cluster'])

In [29]:
manual_clusters_df['count'] = 1

In [30]:
manual_clusters_df['embedding'] = manual_clusters_df.aspect.apply(get_vectors)

In [31]:
aspects_with_manual = pd.concat([aspects_unique_filtered_df, manual_clusters_df], sort=False).reset_index(drop=True)

In [32]:
aspects_with_manual.sample(5)

Unnamed: 0,aspect,count,embedding,cluster
964,connection,1,"[-0.39574474, -0.80263203, -0.010777239, 0.079...",1006
176,dialing,62,"[-0.36496538, -0.3415268, 0.49024117, 0.025630...",5
803,application,11,"[-0.20704626, -0.80060315, -0.03289241, -0.013...",1
375,florida,27,"[0.11800394, 0.13024826, 0.2570347, -0.3787836...",2
428,rca,23,"[-0.45669454, -0.52944344, -0.026075576, -0.12...",6


In [33]:
aspects_unique_filtered_df_tsne = draw_embeddings.get_tsne(aspects_with_manual)
aspects_unique_filtered_df_tsne.sample(5)

Unnamed: 0,x,y,text,tooltip,cluster,embedding
939,-24.436518,-35.785076,camera resolution,camera resolution,1,"[-0.36682415, -0.9974448, -0.21558416, -0.2325..."
943,10.80214,18.798992,prius,prius,5,"[-0.21679571, -0.5780887, 0.32710636, -0.56092..."
695,11.590446,25.670683,accessory one,accessory one,5,"[-0.50451463, -0.4835921, -0.23530266, -0.1732..."
161,1.444517,-39.949253,graphics,graphics,1,"[-0.33875427, -1.0402431, -0.046462692, -0.106..."
830,-4.969251,12.305506,700w,700w,5,"[-0.57894605, -1.1176313, -0.052225683, -0.296..."


In [37]:
draw_embeddings.draw(aspects_unique_filtered_df_tsne)

In [35]:
import qgrid
qgrid_widget = qgrid.show_grid(aspects_unique_filtered_df_tsne, show_toolbar=True)

In [36]:
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [79]:
aspects_unique_filtered_df_tsne.to_excel('../results/reviews_Cell_Phones_and_Accessories/aspects_per_edu_filtered_min_10_freq_clustered_spacy_en.xlsx')

IllegalCharacterError: 

## Get id of the biggest cluster

In [None]:
biggest_cluster_id = aspects_unique_filtered_df_tsne.cluster.value_counts().keys()[0]

In [None]:
aspects_biggest_cluster = aspects_unique_filtered_df_tsne[aspects_unique_filtered_df_tsne.cluster == biggest_cluster_id]

# Second pass of clustering inside terms already marked as aspects :D 

In [34]:
hierarchical_cluster_inner = AgglomerativeClustering(n_clusters=10).fit(aspects_biggest_cluster.embedding.tolist())
aspects_biggest_cluster['cluster'] = hierarchical_cluster_inner.labels_



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [35]:
aspects_biggest_cluster.head()

Unnamed: 0,x,y,text,tooltip,cluster,embedding
0,20.684923,36.33868,price,price,4,"[0.0025718957, 0.071242034, -0.08696756, 0.011..."
2,-11.804303,7.810336,battery,battery,4,"[-0.03261149, -0.0027218256, 0.03191225, 0.098..."
3,-18.295507,-1.03643,amazon,amazon,4,"[-0.08989111, 0.045746423, -0.04043641, 0.0190..."
4,10.001678,24.498201,sound quality,sound quality,0,"[0.057565868, -0.0043674195, 0.062438954, 0.06..."
5,-14.308353,6.374245,charger,charger,4,"[-0.002593959, -0.003030438, 0.0010705708, 0.0..."


In [36]:
aspects_biggest_cluster_tsne = draw_embeddings.get_tsne(aspects_biggest_cluster, text='intent', tooltip_col='tooltip')

KeyError: 'intent'

In [None]:
draw_embeddings.draw(aspects_biggest_cluster_tsne)

In [81]:
qgrid_widget_inner = qgrid.show_grid(aspects_biggest_cluster_tsne, show_toolbar=True)

In [82]:
qgrid_widget_inner

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

# Get aspects from SemEval - annotated 

In [21]:
from aspects.analysis import statistics_dataset

In [22]:
aspects = statistics_dataset.get_aspects()

Corpus iterator: 49475it [00:00, 356789.73it/s]
Corpus iterator: 53781it [00:00, 229197.01it/s]
Corpus iterator: 12470it [00:00, 543995.29it/s]
Corpus iterator: 13257it [00:00, 403704.88it/s]


In [25]:
# aspects

{'Restaurants_poria-train': ['staff',
  'food',
  'food',
  'kitchen',
  'menu',
  'food',
  'perks',
  'orrechiete with sausage and chicken',
  'waiters',
  'dish',
  'meats',
  'bagels',
  'food',
  'mayonnaise',
  'toast',
  'ingredients',
  'cheese',
  'omelet',
  'bacon',
  'plate',
  'check',
  'drinks',
  'design',
  'atmosphere',
  'cuisine',
  'pizza',
  'thin crusted pizza',
  'interior decoration',
  'chefs',
  'seats',
  'seltzer with lime',
  'pickles and',
  'selection of meats and seafoods',
  'eat family style',
  'dishes',
  'vibe',
  'owner',
  'service',
  'delivery',
  'food',
  'atmosphere',
  'service',
  'food',
  'prices',
  'interior decor',
  'prices',
  'wine',
  'price',
  'service',
  'quantity',
  'sushi',
  'sushi bar',
  'fried rice',
  'courses',
  'mussels',
  'puff pastry goat cheese',
  'salad with a delicious dressing',
  'a hanger steak au poivre',
  'indian food',
  'place',
  'service',
  'with',
  'broth with noodles',
  'meal',
  'money',
  'fo