In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's `aspects` package) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import json

import pandas as pd
import plotly
from more_itertools import flatten
from sklearn.cluster import AgglomerativeClustering
plotly.offline.init_notebook_mode(connected=True)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from gensim.models import fasttext

%matplotlib inline


paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress



In [3]:
from aspects.analysis import draw_embeddings

In [4]:
# import spacy
# nlp = spacy.load('en_vectors_web_lg')

In [5]:
# Load the aspects_per_edu.json results file. The encoding is passed explicitly
# so the read does not depend on the platform's default locale encoding.
with open('../results/reviews_Cell_Phones_and_Accessories/aspects_per_edu.json', encoding='utf-8') as f:
    aspects = json.load(f)

In [6]:
# Location of the fastText binary in Facebook's native format.
# NOTE(review): this is an absolute, machine-specific path.
fasttext_model_path = '/home/laugustyniak/data/embeddings/cc.en.300.bin'
model = fasttext.load_facebook_model(fasttext_model_path)

In [13]:
# Peek at the entry stored under key '0' of the loaded JSON object.
aspects['0']

['phone case']

In [14]:
# Number of top-level entries in the loaded JSON object.
len(aspects)

114536

In [15]:
# Concatenate the per-entry aspect lists into one flat list (one level deep).
aspects_flatten = [
    aspect
    for entry_aspects in aspects.values()
    for aspect in entry_aspects
]

In [16]:
# One row per aspect occurrence, in a single 'aspect' column.
aspects_df = pd.DataFrame({'aspect': aspects_flatten})

In [17]:
# Frequency of every distinct aspect: aspect strings form the index, the counts
# sit in a column named 'aspect'.
# The column and index names are pinned explicitly because pandas >= 2.0 names
# the value_counts() result 'count' and its index 'aspect', which would break
# the later `.aspect >= 10` filter and the `reset_index()` call.
aspects_unique_df = (
    aspects_df['aspect']
    .value_counts()
    .rename('aspect')
    .rename_axis(None)
    .to_frame()
)

In [18]:
# Keep only aspects whose frequency (stored in the 'aspect' column) is at least 10.
is_frequent_enough = aspects_unique_df['aspect'] >= 10
aspects_unique_filtered_df = aspects_unique_df[is_frequent_enough]

In [19]:
def _escape_unicode(value):
    """Re-encode strings with the `unicode_escape` codec; return other values unchanged."""
    if isinstance(value, str):
        return value.encode('unicode_escape').decode('utf-8')
    return value


# Apply the escape element-wise. Column-wise `Series.map` is used because
# `DataFrame.applymap` is deprecated since pandas 2.1.
# NOTE(review): at this point the aspect strings are the frame's index and the
# only column holds integer counts, so no string is actually visited here —
# confirm whether the index was meant to be escaped as well.
aspects_unique_filtered_df = aspects_unique_filtered_df.apply(
    lambda column: column.map(_escape_unicode)
)

In [13]:
# Write the frequency-filtered aspects to CSV next to the input results.
filtered_aspects_csv = '../results/reviews_Cell_Phones_and_Accessories/aspects_per_edu_filtered_min_10_freq.csv'
aspects_unique_filtered_df.to_csv(filtered_aspects_csv)

In [20]:
# Move the aspect strings out of the index into a regular column, then label
# the two resulting columns.
# NOTE: re-running this cell on the already reset frame adds another index
# column and the two-name assignment below then fails.
aspect_counts = aspects_unique_filtered_df.reset_index()
aspect_counts.columns = ['aspect', 'count']
aspects_unique_filtered_df = aspect_counts
aspects_unique_filtered_df.head(10)

Unnamed: 0,aspect,count
0,price,6383
1,motorola,4246
2,battery,3925
3,amazon,2805
4,sound quality,2636
5,charger,2605
6,screen,2506
7,quality,2350
8,sound,2317
9,use,2169


In [23]:
def get_vectors(text, embedding_model=None):
    """Return the embedding vector of `text`.

    Parameters
    ----------
    text : str
        Key looked up in the model's `wv` attribute.
    embedding_model : optional
        Object exposing a `wv` mapping. Defaults to the notebook-level `model`,
        so existing single-argument calls behave as before.

    Returns
    -------
    Whatever `embedding_model.wv[text]` yields.
    """
    # An earlier variant of this helper used spaCy instead: `nlp(text).vector`.
    if embedding_model is None:
        embedding_model = model  # notebook-level model loaded in an earlier cell
    return embedding_model.wv[text]

In [24]:
# Look up one embedding per aspect string and store it alongside the counts.
aspect_embeddings = aspects_unique_filtered_df['aspect'].apply(get_vectors)
aspects_unique_filtered_df['embedding'] = aspect_embeddings

In [39]:
# Agglomerative clustering of the aspect embeddings into 50 clusters;
# the resulting label of each row is stored in the 'cluster' column.
embedding_rows = aspects_unique_filtered_df['embedding'].tolist()
hierarchical_cluster = AgglomerativeClustering(n_clusters=50)
hierarchical_cluster.fit(embedding_rows)
aspects_unique_filtered_df['cluster'] = hierarchical_cluster.labels_

In [40]:
# Display the frame, which now also carries the 'embedding' and 'cluster' columns.
aspects_unique_filtered_df

Unnamed: 0,aspect,count,embedding,cluster
0,price,6383,"[0.0025718957, 0.071242034, -0.08696756, 0.011...",17
1,motorola,4246,"[0.035119545, -0.017936235, -0.038057335, 0.00...",20
2,battery,3925,"[-0.03261149, -0.0027218256, 0.03191225, 0.098...",17
3,amazon,2805,"[-0.08989111, 0.045746423, -0.04043641, 0.0190...",17
4,sound quality,2636,"[0.013252061, 0.018266795, -0.010041953, 0.034...",19
5,charger,2605,"[-0.002593959, -0.003030438, 0.0010705708, 0.0...",17
6,screen,2506,"[0.05261301, 0.053623445, 0.028286066, 0.06863...",17
7,quality,2350,"[0.041569266, 0.017836045, 0.0021298677, 0.019...",17
8,sound,2317,"[0.07356247, -0.026570885, 0.12274804, 0.11256...",17
9,use,2169,"[-0.0271657, 0.06062786, -0.06484954, 0.040234...",42


In [41]:
# Hand-written seed clusters: cluster id -> aspect phrases.
# The ids start at 1001, presumably so they stay distinct from the 0-49 labels
# assigned by AgglomerativeClustering(n_clusters=50) — TODO confirm.
manual_clusters = {
    1001: [
        'screen',
        'monitor',
        'lcd',
    ],
    1002: [
        'phone',
        'telephone',
        'mobile phone',
    ],
    1003: [
        'value',
        'price',
        'cost',
        'expensive',
        'cheap',
        'money',
    ],
    1004: [
        'volume',
        'sound',
        'voice',
        # 'echo' is left out: too far from the others
    ],
    1005: [],
    1006: [
        'bluetooth',
        'usb',
        'port',
        'connection',
        'wifi',
        'network',
        'signal range',
        'signal',
    ],
    1007: [
        'cpu',
        'processor',
        'processing power',
    ],
    1008: [
        'memory',
        'hard drive',
        'hard disk',
        'floppy disc',
        'ssd drive',
        'ram',
        'rom',
        'sd card',
        'memory card',
        'usb stick',
        'pendrive',
    ],
    1009: [
        'battery',
        'charger',
        'power',
        'charging',
        'power plug',
        'power bank',
    ],
    1010: [
        'technical documentation',
        'support',
        'update',
    ],
}

In [42]:
# Flatten the manual clusters into (aspect, cluster id) pairs.
# The loop variables are named so they do not shadow the notebook-level `aspects`.
manual_clusters_tuples = [
    (member, label)
    for label, members in manual_clusters.items()
    for member in members
]

In [43]:
# Tabulate the (aspect, cluster id) pairs.
manual_cluster_columns = ['aspect', 'cluster']
manual_clusters_df = pd.DataFrame(data=manual_clusters_tuples, columns=manual_cluster_columns)

In [44]:
# Every manually listed aspect gets a constant count of 1.
manual_clusters_df['count'] = 1

In [45]:
# Embed the manually listed aspects with the same helper used for the mined ones.
manual_embeddings = manual_clusters_df['aspect'].apply(get_vectors)
manual_clusters_df['embedding'] = manual_embeddings

In [46]:
# Stack the mined aspects and the manual ones into one frame with a fresh 0..n-1 index.
frames_to_stack = [aspects_unique_filtered_df, manual_clusters_df]
stacked = pd.concat(frames_to_stack, sort=False)
aspects_with_manual = stacked.reset_index(drop=True)

In [47]:
# Show 5 random rows; no random_state is passed, so the rows differ between runs.
aspects_with_manual.sample(5)

Unnamed: 0,aspect,count,embedding,cluster
202,hinge,53,"[-0.090258524, 0.03709165, 0.12392286, 0.11929...",17
189,virgin mobile,58,"[-0.021190563, 0.019408423, -0.001742857, 0.05...",19
74,sd,186,"[0.34264448, 0.07104431, 0.14408736, -0.105851...",39
197,usa,55,"[0.14131713, -0.070365265, -0.068361, -0.07894...",2
217,connectivity,49,"[-0.013129184, -0.008202838, 0.011541892, -0.0...",17


In [48]:
# Build the plotting frame from the combined mined + manual aspects.
# presumably get_tsne projects the 'embedding' column to 2-D (the displayed
# result has x/y columns) — TODO confirm against aspects.analysis.draw_embeddings
aspects_unique_filtered_df_tsne = draw_embeddings.get_tsne(aspects_with_manual)
# Show 5 random rows; no random_state is passed, so the rows differ between runs.
aspects_unique_filtered_df_tsne.sample(5)

Unnamed: 0,x,y,intent,tooltip,cluster
641,7.516838,-7.501706,webtreo,webtreo,19
376,18.368053,24.937998,florida,florida,2
970,-5.018254,7.659589,processor,processor,1007
874,-11.643503,-38.819744,eargels,eargels,19
809,11.448671,27.094051,elliot,elliot,2


In [49]:
# Plot the t-SNE frame with the project's draw_embeddings.draw helper
# (its behaviour is defined outside this notebook).
draw_embeddings.draw(aspects_unique_filtered_df_tsne)

In [41]:
# NOTE(review): qgrid is imported here rather than in the top import cell.
import qgrid
# Wrap the t-SNE frame in a qgrid widget with the toolbar enabled.
qgrid_widget = qgrid.show_grid(aspects_unique_filtered_df_tsne, show_toolbar=True)

In [42]:
# Display the grid widget created in the previous cell.
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [None]:
# Manual notes on cluster ids (may not match the current clustering — TODO confirm):
# 2 -> sound
# 3 -> companies/products

In [32]:
# Persist the t-SNE frame to an Excel workbook.
# NOTE(review): the file name says "spacy_en" while get_vectors currently looks
# vectors up in the fastText model — confirm the name is still accurate.
aspects_unique_filtered_df_tsne.to_excel('../results/reviews_Cell_Phones_and_Accessories/aspects_per_edu_filtered_min_10_freq_clustered_spacy_en.xlsx')

In [34]:
# Inspect the rows assigned to cluster 4.
in_cluster_4 = aspects_unique_filtered_df_tsne['cluster'] == 4
aspects_unique_filtered_df_tsne[in_cluster_4]

Unnamed: 0,x,y,intent,tooltip,cluster
19,-16.618031,-8.395959,works,works,4
30,0.983883,8.936417,software,software,4
45,-25.705854,13.111171,design,design,4
55,-9.545662,-9.006220,support,support,4
59,-17.710215,31.984634,interface,interface,4
61,-28.507902,10.697180,signal,signal,4
63,3.422139,15.756899,memory,memory,4
68,-5.470905,-11.306073,outlook,outlook,4
77,-5.529461,4.885927,applications,applications,4
82,-0.239239,12.443504,install,install,4


# Get aspects from SemEval - annotated 

In [21]:
from aspects.analysis import statistics_dataset

In [22]:
# NOTE(review): this rebinds `aspects`, which earlier held the object loaded
# from aspects_per_edu.json; cells above that read `aspects` must be re-run
# from the top to see the original value again.
aspects = statistics_dataset.get_aspects()

Corpus iterator: 49475it [00:00, 356789.73it/s]
Corpus iterator: 53781it [00:00, 229197.01it/s]
Corpus iterator: 12470it [00:00, 543995.29it/s]
Corpus iterator: 13257it [00:00, 403704.88it/s]


In [25]:
# aspects

{'Restaurants_poria-train': ['staff',
  'food',
  'food',
  'kitchen',
  'menu',
  'food',
  'perks',
  'orrechiete with sausage and chicken',
  'waiters',
  'dish',
  'meats',
  'bagels',
  'food',
  'mayonnaise',
  'toast',
  'ingredients',
  'cheese',
  'omelet',
  'bacon',
  'plate',
  'check',
  'drinks',
  'design',
  'atmosphere',
  'cuisine',
  'pizza',
  'thin crusted pizza',
  'interior decoration',
  'chefs',
  'seats',
  'seltzer with lime',
  'pickles and',
  'selection of meats and seafoods',
  'eat family style',
  'dishes',
  'vibe',
  'owner',
  'service',
  'delivery',
  'food',
  'atmosphere',
  'service',
  'food',
  'prices',
  'interior decor',
  'prices',
  'wine',
  'price',
  'service',
  'quantity',
  'sushi',
  'sushi bar',
  'fried rice',
  'courses',
  'mussels',
  'puff pastry goat cheese',
  'salad with a delicious dressing',
  'a hanger steak au poivre',
  'indian food',
  'place',
  'service',
  'with',
  'broth with noodles',
  'meal',
  'money',
  'fo