In [1]:
%load_ext autoreload
%autoreload 2

In [24]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import  Bar, Layout, Figure
from plotly import __version__
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Iterable, Set
import sys
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))


%matplotlib inline

sys.path.append('/home/laugustyniak/github/phd/sentiment-backend/')

from aspects.analysis import statistics_dataset
from aspects.analysis import nlp_architect

init_notebook_mode(connected=True)

In [25]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

# 2014

In [3]:
annotated_aspects = statistics_dataset.get_aspects()

Corpus iterator: 49475it [00:00, 437390.14it/s]
Corpus iterator: 53781it [00:00, 863374.53it/s]
Corpus iterator: 12470it [00:00, 749809.63it/s]
Corpus iterator: 13257it [00:00, 736699.76it/s]


## Unique aspects

In [20]:
pd.DataFrame([
    (k, len(set(v)))
    for k, v 
    in annotated_aspects.items()
], columns=['subset', 'unique_aspects'])
    

Unnamed: 0,subset,unique_aspects
0,Restaurants_poria-train,1241
1,Laptops_poria-train,973
2,Laptops_poria-test,400
3,Restaurants_poria-test,530


## Aspect intersection between train and test

In [4]:
annotated_aspects

{'Restaurants_poria-train': ['staff',
  'food',
  'food',
  'kitchen',
  'menu',
  'food',
  'perks',
  'orrechiete with sausage and chicken',
  'waiters',
  'dish',
  'meats',
  'bagels',
  'food',
  'mayonnaise',
  'toast',
  'ingredients',
  'cheese',
  'omelet',
  'bacon',
  'plate',
  'check',
  'drinks',
  'design',
  'atmosphere',
  'cuisine',
  'pizza',
  'thin crusted pizza',
  'interior decoration',
  'chefs',
  'seats',
  'seltzer with lime',
  'pickles and',
  'selection of meats and seafoods',
  'eat family style',
  'dishes',
  'vibe',
  'owner',
  'service',
  'delivery',
  'food',
  'atmosphere',
  'service',
  'food',
  'prices',
  'interior decor',
  'prices',
  'wine',
  'price',
  'service',
  'quantity',
  'sushi',
  'sushi bar',
  'fried rice',
  'courses',
  'mussels',
  'puff pastry goat cheese',
  'salad with a delicious dressing',
  'a hanger steak au poivre',
  'indian food',
  'place',
  'service',
  'with',
  'broth with noodles',
  'meal',
  'money',
  'fo

In [5]:
annotated_aspects.keys()

dict_keys(['Restaurants_poria-train', 'Laptops_poria-train', 'Laptops_poria-test', 'Restaurants_poria-test'])

### Laptops

In [9]:
len(set(annotated_aspects['Laptops_poria-test']) - set(annotated_aspects['Laptops_poria-train']))

243

In [10]:
len(set(annotated_aspects['Laptops_poria-train']) - set(annotated_aspects['Laptops_poria-test']))

816

### Restaurants

In [8]:
len(set(annotated_aspects['Restaurants_poria-test']) - set(annotated_aspects['Restaurants_poria-train']))

336

In [11]:
len(set(annotated_aspects['Restaurants_poria-train']) - set(annotated_aspects['Restaurants_poria-test']))

1047

## Aspect Distribution

In [27]:
for k, v in annotated_aspects.items():
    print(k)
    print(pd.DataFrame(v, columns=['aspect'])['aspect'].value_counts()[:20])

Restaurants_poria-train
food          373
service       236
place          65
prices         64
staff          57
menu           57
dinner         56
pizza          50
atmosphere     49
price          42
table          41
meal           39
sushi          38
drinks         35
bar            32
lunch          29
dishes         28
decor          27
ambience       27
portions       27
Name: aspect, dtype: int64
Laptops_poria-train
screen          64
price           57
use             55
battery life    55
keyboard        53
battery         48
programs        37
features        35
software        34
warranty        31
hard drive      30
windows         30
quality         25
size            24
performance     23
speed           21
applications    18
graphics        18
memory          18
runs            17
Name: aspect, dtype: int64
Laptops_poria-test
price               19
performance         15
works               14
os                  13
features            12
screen              10
windo

# 2016

## Load pre-processed datasets [all in one file]

In [3]:
from utilities.settings import SEMEVAL_DATASETS_2016

In [4]:
semeval_2016_df = pd.read_csv(SEMEVAL_DATASETS_2016 / 'all-entities-and-aspects.csv')

In [16]:
semeval_2016_df[~semeval_2016_df.target.isna()].groupby(['category', 'target']).count()[['aspect']]

Unnamed: 0_level_0,Unnamed: 1_level_0,aspect
category,target,Unnamed: 2_level_1
AMBIENCE#GENERAL,Ambiance,1
AMBIENCE#GENERAL,Ambience,2
AMBIENCE#GENERAL,Atmosphere,4
AMBIENCE#GENERAL,Balcony,2
AMBIENCE#GENERAL,Cosette,1
AMBIENCE#GENERAL,DJ,1
AMBIENCE#GENERAL,Decor,4
AMBIENCE#GENERAL,Dining Garden,1
AMBIENCE#GENERAL,Downstairs lounge,1
AMBIENCE#GENERAL,Egyptian restaurant,1


In [27]:
from aspects.embeddings.bert import BertWrapper

In [28]:
bert_embedder = BertWrapper()

In [29]:
semeval_2016_df.head()

Unnamed: 0.1,Unnamed: 0,category,entity,aspect,target,polarity
0,41,AMBIENCE#GENERAL,AMBIENCE,GENERAL,tables,negative
1,38,AMBIENCE#GENERAL,AMBIENCE,GENERAL,candle-light,positive
2,43,AMBIENCE#GENERAL,AMBIENCE,GENERAL,interior decor,positive
3,56,AMBIENCE#GENERAL,AMBIENCE,GENERAL,interior,negative
4,57,AMBIENCE#GENERAL,AMBIENCE,GENERAL,space,negative


In [32]:
semeval_2016_df.target = semeval_2016_df.target.progress_apply(str)

HBox(children=(IntProgress(value=0, max=4291), HTML(value='')))

In [33]:
semeval_2016_df['embedding'] = semeval_2016_df.target.progress_apply(bert_embedder.get_averaged_vector)

HBox(children=(IntProgress(value=0, max=4291), HTML(value='')))

In [45]:
semeval_2016_df.head()

Unnamed: 0.1,Unnamed: 0,category,entity,aspect,target,polarity,embedding,cluster
0,41,AMBIENCE#GENERAL,AMBIENCE,GENERAL,tables,negative,"[-0.26636368, -0.9098466, -0.23358482, -0.0058...",9
1,38,AMBIENCE#GENERAL,AMBIENCE,GENERAL,candle-light,positive,"[-0.46664533, -1.2059094, -0.12933376, -0.0101...",9
2,43,AMBIENCE#GENERAL,AMBIENCE,GENERAL,interior decor,positive,"[-0.47207767, -1.2466235, -0.20648655, -0.3695...",9
3,56,AMBIENCE#GENERAL,AMBIENCE,GENERAL,interior,negative,"[-0.1322847, -0.7842638, -0.5341981, -0.113490...",3
4,57,AMBIENCE#GENERAL,AMBIENCE,GENERAL,space,negative,"[-0.1815286, -1.1119602, 0.0013807006, 0.19098...",5


In [57]:
len(semeval_2016_df.category.unique())

12

In [35]:
semeval_2016_df.to_pickle(SEMEVAL_DATASETS_2016 / 'all-entities-and-aspects-with-bert-embeddings.pkl')

In [38]:
semeval_2016_df.drop_duplicates(subset=['target'], inplace=True)

In [39]:
from sklearn.cluster import AgglomerativeClustering

In [61]:
hierarchical_cluster = AgglomerativeClustering(n_clusters=50).fit(semeval_2016_df.embedding.tolist())
semeval_2016_df['cluster'] = hierarchical_cluster.labels_

In [62]:
from aspects.analysis import draw_embeddings

In [63]:
semeval_2016_df.head(1)

Unnamed: 0.1,Unnamed: 0,category,entity,aspect,target,polarity,embedding,cluster,tooltip
0,41,AMBIENCE#GENERAL,AMBIENCE,GENERAL,tables,negative,"[-0.26636368, -0.9098466, -0.23358482, -0.0058...",18,AMBIENCE#GENERAL : tables


In [64]:
semeval_2016_df['tooltip'] = semeval_2016_df.category + ' : ' + semeval_2016_df.target

In [65]:
import qgrid

In [66]:
qgrid.show_grid(semeval_2016_df)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [58]:
semeval_2016_df

Unnamed: 0.1,Unnamed: 0,category,entity,aspect,target,polarity,embedding,cluster,tooltip
0,41,AMBIENCE#GENERAL,AMBIENCE,GENERAL,tables,negative,"[-0.26636368, -0.9098466, -0.23358482, -0.0058...",9,AMBIENCE#GENERAL : tables
1,38,AMBIENCE#GENERAL,AMBIENCE,GENERAL,candle-light,positive,"[-0.46664533, -1.2059094, -0.12933376, -0.0101...",9,AMBIENCE#GENERAL : candle-light
2,43,AMBIENCE#GENERAL,AMBIENCE,GENERAL,interior decor,positive,"[-0.47207767, -1.2466235, -0.20648655, -0.3695...",9,AMBIENCE#GENERAL : interior decor
3,56,AMBIENCE#GENERAL,AMBIENCE,GENERAL,interior,negative,"[-0.1322847, -0.7842638, -0.5341981, -0.113490...",3,AMBIENCE#GENERAL : interior
4,57,AMBIENCE#GENERAL,AMBIENCE,GENERAL,space,negative,"[-0.1815286, -1.1119602, 0.0013807006, 0.19098...",5,AMBIENCE#GENERAL : space
5,9,AMBIENCE#GENERAL,AMBIENCE,GENERAL,Decor,positive,"[-0.113958694, -0.471622, -0.07242559, -0.0668...",3,AMBIENCE#GENERAL : Decor
6,22,AMBIENCE#GENERAL,AMBIENCE,GENERAL,place,positive,"[-0.39081353, -0.9756247, -0.08489291, -0.0357...",9,AMBIENCE#GENERAL : place
7,30,AMBIENCE#GENERAL,AMBIENCE,GENERAL,trattoria,positive,"[0.033496413, -0.48428226, -0.31615463, -0.096...",3,AMBIENCE#GENERAL : trattoria
8,18,DRINKS#STYLE_OPTIONS,DRINKS,STYLE_OPTIONS,Bombay beer,positive,"[-0.4347301, -0.95703965, -0.056810495, -0.064...",9,DRINKS#STYLE_OPTIONS : Bombay beer
9,60,DRINKS#QUALITY,DRINKS,QUALITY,sake,positive,"[-0.2192443, 0.9397465, -0.014916062, 0.225952...",1,DRINKS#QUALITY : sake


In [56]:
draw_embeddings.draw(draw_embeddings.get_tsne(semeval_2016_df, tooltip_col='tooltip'))