In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from datasets import load_dataset

## Read data

In [2]:
polemo_offi_config = pd.read_json('data/polemo2-official/dataset_infos.json') # config file; its columns are the available categories
polemo_category = "hotels_text"
polemo_official = load_dataset("data/polemo2-official/", polemo_category) # only opinions about hotels
df_polemo_official = pd.DataFrame(polemo_official["train"])  # only the "train" split is used

aspectemo = load_dataset("data/aspectemo")
df_aspectemo = pd.DataFrame(aspectemo["train"])  # only the "train" split is used

df_opta_reviews = pd.read_json("data/OPTA-treebank-reviews/OPTA-treebank-0.1.json")
df_opta_skladnica = pd.read_json("data/OPTA-treebank-skladnica/skladnica_output.json")

100%|██████████| 3/3 [00:00<00:00, 407.50it/s]
100%|██████████| 2/2 [00:00<00:00, 417.01it/s]


## Explore data

### Polemo official

The polemo-in and polemo-out datasets are both included in polemo official, so we don't use them as separate datasets.

In [3]:
# Available categories of opinions in polemo (the columns of the config file)
list(polemo_offi_config.columns)

['all_text',
 'all_sentence',
 'hotels_text',
 'hotels_sentence',
 'medicine_text',
 'medicine_sentence',
 'products_text',
 'products_sentence',
 'reviews_text',
 'reviews_sentence']

In [4]:
# Number of reviews per integer `target` label in the selected polemo category.
df_polemo_official["target"].value_counts()

1    1237
2     804
3     790
0     334
Name: target, dtype: int64

In [5]:
# Recover the meaning of the integer labels in polemo_official by joining it, on the
# review text, with the KLEJ polemo2-in training split (polemo-out uses the same meanings).
# NOTE(review): the frame is named df_polemo_out although it is read from klej_polemo2_in.
df_polemo_out = pd.read_csv("data/klej_polemo2_in/train.csv")
df_polemo_out.rename(columns={'sentence': 'text'}, inplace=True)

# After the inner join, target_x is the textual label and target_y the integer one;
# keeping the first row per integer label yields the label -> meaning table.
df_intersection = (
    pd.merge(df_polemo_out, df_polemo_official, how='inner', on=['text'])
    .drop(columns="text")
    .drop_duplicates("target_y")
)

df_intersection

Unnamed: 0,target_x,target_y
0,__label__meta_minus_m,1
1,__label__meta_plus_m,2
5,__label__meta_zero,0
6,__label__meta_amb,3


### Aspectemo

In [6]:
print(df_aspectemo.head(3))
# Label names, presumably indexed by the integer ids stored in `labels` (TODO confirm):
# ["O", "a_minus_m", "a_minus_s", "a_zero", "a_plus_s", "a_plus_m", "a_amb"]

# Pair every token of one example review with its label id.
example = 2
list(zip(df_aspectemo["tokens"].iloc[example], df_aspectemo["labels"].iloc[example]))

                                              tokens  \
0  [Wykłady, strasznie, nudne, ,, totalna, porażk...   
1  [Ogólnie, bardzo, pozytywny, choc, troche, zak...   
2  [Pan, Krzysztof, to, ogólnie, bardzo, pozytywn...   

                                              labels  
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 0, ...  
2  [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, ...  


[('Pan', 5),
 ('Krzysztof', 0),
 ('to', 0),
 ('ogólnie', 0),
 ('bardzo', 0),
 ('pozytywna', 0),
 ('postać', 0),
 (',', 0),
 ('na', 0),
 ('ćwiczeniach', 0),
 ('lubi', 6),
 ('się', 0),
 ('pośmiać', 0),
 ('(', 0),
 ('ze', 0),
 ('studentów', 0),
 ('i', 0),
 ('nie', 0),
 ('tylko', 0),
 (')', 0),
 ('.', 0),
 ('Zajęcia', 0),
 ('prowadzi', 5),
 ('w', 0),
 ('sposób', 0),
 ('ciekawy', 0),
 (',', 0),
 ('potrafi', 5),
 ('przekazywać', 0),
 ('wiedzę', 0),
 ('.', 0),
 ('Koła', 6),
 ('zwykle', 0),
 ('do', 0),
 ('najłatwiejszych', 0),
 ('nie', 0),
 ('należą', 0),
 (',', 0),
 ('ale', 0),
 ('jeśli', 0),
 ('człowiek', 0),
 ('się', 0),
 ('troszkę', 0),
 ('postara', 0),
 ('to', 0),
 ('zaliczy', 0),
 ('bez', 0),
 ('problemu', 0),
 ('.', 0)]

### OPTA reviews

The file contains JSON formatted results of annotation of opinions and their targets. 

Each item in the list is a sentence with the following fields:

'file_id': ID of the review (used internally; the same sentence may appear multiple times with different sentiment and opinion-target pairs) <br>
'domain': review type (perfume or clothes) <br>
'dist': dependency path distance between S and T, filled only for the 1st batch of annotations <br>
'isSentIncorrect': human annotation - is the sentiment word S incorrect? <br>
'isAttrIncorrect': human annotation - is the opinion target word T incorrect? <br>
'parsedSent': CONLL-formatted parsed sentence; the last column contains pointers to: <br>
	S = sentiment word <br>
	T = opinion target word <br>
'isStrError': human annotation - is the dependency structure between S and T erroneous? <br>
'isAtrRelToSent': human annotation - is S related to T? <br>
'rule_id': ID of the extraction rule that pointed to T (see "extraction rules")

Read about the CONLL format here: https://universaldependencies.org/format.html

In [7]:
# Preview the first rows of the OPTA reviews annotations.
df_opta_reviews.head()

Unnamed: 0,domain,dist,isSentIncorrect,parsedSent,isAttrIncorrect,file_id,isStrError,isAtrRelToSent,rule_id
0,uroda,1,0,[1\tPolecam\tpolecać\tfin\tfin\tsg|pri|imperf\...,0,40642_3.conll\n,0,0,121
1,uroda,1,0,[1\tSłodkawy\tsłodkawy\tadj\tadj\tsg|nom|m1|po...,0,40716_5.conll\n,0,1,179
2,uroda,1,0,[1\tLetnia\tletni\tadj\tadj\tsg|nom|f|pos\t2\t...,0,55145_4.conll\n,0,1,55
3,uroda,1,0,[1\tRaczej\traczej\tqub\tqub\t_\t0\tpred\t_\t_...,0,55161_5.conll\n,0,1,179
4,uroda,1,0,[1\tCiekawa\tciekawy\tadj\tadj\tsg|nom|f|pos\t...,0,55166_5.conll\n,0,1,109


In [8]:
def conll_get_word_and_misc(conll: list) -> list:
    """Return ``[word, misc]`` for the first token line of a parsed sentence.

    Based on the dataset readme and https://universaldependencies.org/format.html.

    Only ``conll[0]`` (the first token line) is inspected; the remaining
    token lines of the sentence are ignored.

    Args:
        conll: list of tab-separated CONLL token lines, one string per token.

    Returns:
        A two-element list: the second field of the line (the word form) and
        the last column of the line (per the dataset description, the column
        that holds the S / T pointers).

    Raises:
        IndexError: if ``conll`` is empty or the first line has fewer than
            two fields.
    """
    fields = conll[0].replace("\n", "").split("\t")
    # A line that ends with a tab leaves one empty field after splitting.
    # Drop it only when it really is empty, so that a line without the
    # trailing tab does not silently lose its genuine last column.
    if fields[-1] == "":
        fields = fields[:-1]
    return [fields[1], fields[-1]]

def df_from_parsedSent(df: pd.DataFrame):
    """Build a ``word`` / ``misc`` frame from the ``parsedSent`` column of ``df``.

    Each row of the result comes from one sentence of ``df`` and holds the two
    values returned by ``conll_get_word_and_misc`` for that sentence.

    Args:
        df: frame with a ``parsedSent`` column of CONLL token-line lists.

    Returns:
        A new DataFrame with columns ``word`` and ``misc`` and a fresh
        0..n-1 index. An empty ``df`` yields an empty frame with the same
        two columns instead of raising ``IndexError``.
    """
    word_misc_rows = df["parsedSent"].apply(conll_get_word_and_misc).tolist()
    # Passing the rows with explicit column names also works for zero rows,
    # unlike transposing with zip(*rows) and indexing the result.
    return pd.DataFrame(word_misc_rows, columns=["word", "misc"])

# Show the raw first token line of the first sentence.
print("Example od parsedSent\n", df_opta_reviews["parsedSent"][0][0])

# One row per sentence, taken from that sentence's first token line only.
df_opta_words = df_from_parsedSent(df_opta_reviews)
print("Words with their labels(?)\n", df_opta_words.head(), "\n")

print("Class distribution for labels(?)\n", df_opta_words["misc"].value_counts())

# TODO: I don't quite understand how this dataset works yet. NOTE(review): conll_get_word_and_misc reads only conll[0], so the counts above cover just the first token of each sentence.


Example od parsedSent
 1	Polecam	polecać	fin	fin	sg|pri|imperf	12	conjunct	_	_	S	

Words with their labels(?)
        word misc
0   Polecam    S
1  Słodkawy    _
2    Letnia    _
3    Raczej    _
4   Ciekawa    S 

Class distribution for labels(?)
 _    991
S    199
A    174
Name: misc, dtype: int64


### OPTA składnica

In [9]:
# Preview the first rows of the OPTA składnica annotations.
df_opta_skladnica.head()

Unnamed: 0,parsedSent,isAtrRelToSent,file_id
0,[1\tTeoretycznie\tteoretycznie\tadv\tadv\tpos\...,0,3
1,[1\tBarszczucha\tBarszczucha\tsubst\tsubst\tsg...,0,10
2,[1\tW\tw\tprep\tprep\tloc|nwok\t9\tadjunct\t_\...,-1,12
3,"[1\tTo\tto\tpred\tpred\t_\t0\tpred\t_\t_\n\t_,...",0,15
4,[1\tWysoka\twysoki\tadj\tadj\tsg|nom|f|pos\t2\...,-1,17


In [10]:
# NOTE: this rebinds df_opta_words, replacing the frame built from the OPTA reviews.
df_opta_words = df_from_parsedSent(df_opta_skladnica)
print(df_opta_words.head())
df_opta_words["misc"].value_counts()

# Looks similar to OPTA reviews: there are many "_" values in the last column of parsedSent. I don't think it can be useful for us.


           word misc
0  Teoretycznie    _
1   Barszczucha    _
2             W    _
3            To    _
4        Wysoka    _


_    950
S     19
T      6
Name: misc, dtype: int64