In [13]:
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

In [8]:
ls ../datasets/raw-semeval-2016/

[31mtest.xml[m[m*  [31mtrain.xml[m[m*


In [9]:
# Raw SemEval-2016 XML files, resolved relative to the parent of the current working directory.
train_path = Path.cwd().parent / 'datasets/raw-semeval-2016/train.xml'
test_path = Path.cwd().parent / 'datasets/raw-semeval-2016/test.xml'

In [14]:
# Parse the training XML and collect every <sentence> element.
# An unreadable file raises from read_text(); BeautifulSoup() always returns an
# object, so no separate "soup is None" check is needed.
soup = BeautifulSoup(train_path.read_text(encoding="utf-8").strip(), "lxml-xml")
sentence_nodes = soup.find_all("sentence")

In [27]:
def soup2dict(sentence_nodes):
    """
    Convert parsed <sentence> nodes into a list of plain dictionaries.

    Input: an iterable of sentence nodes, e.g. soup.find_all("sentence")
    Output: a list of dictionaries, one per sentence, with keys:
        'id'       -- 1-based position of the sentence in sentence_nodes
        'text'     -- string content of the sentence's <text> child
        'polarity' -- 'polarity' attribute of the LAST <Opinion> child; the key
                      is absent when the sentence has no <Opinion>
        'category' -- unique 'category' attributes of its <Opinion> children,
                      in first-seen order ([] when there are none)
    """
    sentences = []
    for i, n in enumerate(sentence_nodes, start=1):
        # Start from an empty list for every sentence so that a sentence without
        # an <Opinions> element does not inherit the previous sentence's categories.
        category_term = []
        sentence = {'id': i, 'text': n.find('text').string}

        opinions = n.find('Opinions')
        if opinions is not None:
            for c in opinions.contents:
                # .contents may also hold children that are not <Opinion>; skip them.
                if c.name != 'Opinion':
                    continue
                # Overwritten for each opinion, so the last one wins.
                sentence['polarity'] = c['polarity']
                if c['category'] not in category_term:
                    category_term.append(c['category'])

        sentence['category'] = category_term
        sentences.append(sentence)

    return sentences

In [28]:
# Convert the parsed <sentence> nodes into a list of dictionaries.
sentences = soup2dict(sentence_nodes)

In [29]:
# Number of parsed sentences.
len(sentences)

2000

In [30]:
# Peek at the first five sentence dictionaries.
sentences[:5]

[{'category': ['RESTAURANT#GENERAL'],
  'id': 1,
  'polarity': 'negative',
  'text': 'Judging from previous posts this used to be a good place, but not any longer.'},
 {'category': ['SERVICE#GENERAL'],
  'id': 2,
  'polarity': 'negative',
  'text': 'We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.'},
 {'category': ['SERVICE#GENERAL'],
  'id': 3,
  'polarity': 'negative',
  'text': 'They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.'},
 {'category': ['FOOD#QUALITY', 'FOOD#STYLE_OPTIONS'],
  'id': 4,
  'polarity': 'negative',
  'text': 'The food was lousy - too sweet or too salty and the portions tiny.'},
 {'category': ['SERVICE#GENERAL'],
  'id': 5,
  'polarity': 'negative',
  'text': 'After all that, they complained to me about the small tip.'}]

# Categories DataFrame

In [38]:
# Keep only the category list of each parsed sentence, in sentence order.
label_for_sentences = [s['category'] for s in sentences]

In [39]:
# First four per-sentence category lists.
label_for_sentences[:4]

[['RESTAURANT#GENERAL'],
 ['SERVICE#GENERAL'],
 ['SERVICE#GENERAL'],
 ['FOOD#QUALITY', 'FOOD#STYLE_OPTIONS']]

In [40]:
# Collect every distinct category that occurs in any sentence.
label_kinds = {c for s in label_for_sentences for c in s}

In [41]:
# Show the set of distinct categories.
label_kinds

{'AMBIENCE#GENERAL',
 'DRINKS#PRICES',
 'DRINKS#QUALITY',
 'DRINKS#STYLE_OPTIONS',
 'FOOD#PRICES',
 'FOOD#QUALITY',
 'FOOD#STYLE_OPTIONS',
 'LOCATION#GENERAL',
 'RESTAURANT#GENERAL',
 'RESTAURANT#MISCELLANEOUS',
 'RESTAURANT#PRICES',
 'SERVICE#GENERAL'}

In [43]:
# One-hot encode the categories of every sentence: one dict per sentence,
# one 0/1 entry per known category.
labels = []

for sentence_labels in label_for_sentences:
    # Build a fresh dict for every sentence. Reusing the previous dict for a
    # sentence without labels would zero out the row that was already appended.
    row = {}
    # Sorted iteration gives every row the same, reproducible key order.
    for k in sorted(label_kinds):
        row[k] = 1 if k in sentence_labels else 0
    labels.append(row)

In [47]:
# One-hot row of the fourth sentence.
labels[3]

{'AMBIENCE#GENERAL': 0,
 'DRINKS#PRICES': 0,
 'DRINKS#QUALITY': 0,
 'DRINKS#STYLE_OPTIONS': 0,
 'FOOD#PRICES': 0,
 'FOOD#QUALITY': 1,
 'FOOD#STYLE_OPTIONS': 1,
 'LOCATION#GENERAL': 0,
 'RESTAURANT#GENERAL': 0,
 'RESTAURANT#MISCELLANEOUS': 0,
 'RESTAURANT#PRICES': 0,
 'SERVICE#GENERAL': 0}

In [48]:
import pandas as pd

In [49]:
# Turn the list of one-hot dicts into a DataFrame (one row per dict).
labels_df = pd.DataFrame(labels)

In [54]:
# (rows, columns) of the label frame.
print(labels_df.shape)

(2000, 12)


In [57]:
# Column names of the label frame, i.e. the category labels.
labels_df.columns

Index(['AMBIENCE#GENERAL', 'DRINKS#PRICES', 'DRINKS#QUALITY',
       'DRINKS#STYLE_OPTIONS', 'FOOD#PRICES', 'FOOD#QUALITY',
       'FOOD#STYLE_OPTIONS', 'LOCATION#GENERAL', 'RESTAURANT#GENERAL',
       'RESTAURANT#MISCELLANEOUS', 'RESTAURANT#PRICES', 'SERVICE#GENERAL'],
      dtype='object')

In [56]:
# First rows of the one-hot label frame.
labels_df.head()

Unnamed: 0,AMBIENCE#GENERAL,DRINKS#PRICES,DRINKS#QUALITY,DRINKS#STYLE_OPTIONS,FOOD#PRICES,FOOD#QUALITY,FOOD#STYLE_OPTIONS,LOCATION#GENERAL,RESTAURANT#GENERAL,RESTAURANT#MISCELLANEOUS,RESTAURANT#PRICES,SERVICE#GENERAL
0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,1,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1


# Putting all the steps together

In [59]:
def soup2dict(sentence_nodes):
    """
    Convert parsed <sentence> nodes into a list of plain dictionaries.

    Input: an iterable of sentence nodes, e.g. soup.find_all("sentence")
    Output: a list of dictionaries, one per sentence, with keys:
        'id'       -- 1-based position of the sentence in sentence_nodes
        'text'     -- string content of the sentence's <text> child
        'polarity' -- 'polarity' attribute of the LAST <Opinion> child; the key
                      is absent when the sentence has no <Opinion>
        'category' -- unique 'category' attributes of its <Opinion> children,
                      in first-seen order ([] when there are none)
    """
    sentences = []
    for i, n in enumerate(sentence_nodes, start=1):
        # Start from an empty list for every sentence so that a sentence without
        # an <Opinions> element does not inherit the previous sentence's categories.
        category_term = []
        sentence = {'id': i, 'text': n.find('text').string}

        opinions = n.find('Opinions')
        if opinions is not None:
            for c in opinions.contents:
                # .contents may also hold children that are not <Opinion>; skip them.
                if c.name != 'Opinion':
                    continue
                # Overwritten for each opinion, so the last one wins.
                sentence['polarity'] = c['polarity']
                if c['category'] not in category_term:
                    category_term.append(c['category'])

        sentence['category'] = category_term
        sentences.append(sentence)

    return sentences

In [62]:
def dict2labels(sentences, label_kinds=None):
    """
    Build a multi-label one-hot DataFrame from parsed sentences.

    Input:
        sentences: a list of dictionaries, each with a 'category' key holding
            the list of labels of that sentence
        label_kinds: optional iterable of labels to use as columns. When None
            (default), the labels found in `sentences` are used. Passing the
            same value for several datasets yields identical columns; labels
            of a sentence that are not listed here are ignored.
    Output:
        labels_df: a dataframe with one row per sentence and one 0/1 column
            per label, columns sorted by label name
    """
    # all sentences to a list of label lists
    label_for_sentences = [s['category'] for s in sentences]

    # get all categories (unless the caller fixed them)
    if label_kinds is None:
        label_kinds = {c for sentence_labels in label_for_sentences for c in sentence_labels}
    # Sorting makes the column order deterministic across runs and files.
    columns = sorted(set(label_kinds))

    # convert all labels to one-hot format
    labels = []
    for sentence_labels in label_for_sentences:
        present = set(sentence_labels)
        # A fresh dict per sentence: sharing one dict between rows would let a
        # sentence without labels overwrite the previously appended row.
        labels.append({k: int(k in present) for k in columns})

    # With no known labels at all, let pandas infer the (empty) columns itself.
    labels_df = pd.DataFrame(labels, columns=columns or None)

    return labels_df

In [64]:
def get_labels(file_path):
    """
    Read an XML file of <sentence> elements and return its one-hot label frame.

    Input:
        file_path: path of the XML file (a pathlib.Path or a string)
    Output:
        label_df: the dataframe produced by dict2labels for the sentences
            found in the file
    """
    # Get soup object. An unreadable file raises from read_text();
    # BeautifulSoup() always returns an object, so no None check is needed.
    xml_text = Path(file_path).read_text(encoding="utf-8").strip()
    soup = BeautifulSoup(xml_text, "lxml-xml")
    sentence_nodes = soup.find_all("sentence")

    # soup object to a list of dictionaries
    sentences = soup2dict(sentence_nodes)

    # a list of dictionaries to dataframe
    label_df = dict2labels(sentences)

    return label_df

In [65]:
# Train and Test Path (relative to the parent of the current working directory)
train_path = Path.cwd().parent / 'datasets/raw-semeval-2016/train.xml'
test_path = Path.cwd().parent / 'datasets/raw-semeval-2016/test.xml'

# Get labels: train first, then test
train_label_df, test_label_df = (get_labels(p) for p in (train_path, test_path))
 

In [74]:
ls ../preprocessed_data/semeval-2016/

model_param  vocab


In [76]:
ls ../datasets/semeval-2016/slot1

In [78]:
# Save data
train_path = '../datasets/semeval-2016/slot1/train_label_df.csv'
test_path = '../datasets/semeval-2016/slot1/test_label_df.csv'

# to_csv does not create missing directories, so make sure the target folder exists.
for csv_path in (train_path, test_path):
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written, only the label columns.
train_label_df.to_csv(train_path, encoding='utf-8', index=False)
test_label_df.to_csv(test_path, encoding='utf-8', index=False)

In [81]:
# Load data
# Read the train and test label CSV files back into DataFrames.
train_path = '../datasets/semeval-2016/slot1/train_label_df.csv'
test_path = '../datasets/semeval-2016/slot1/test_label_df.csv'

train_labels = pd.read_csv(train_path)
test_labels = pd.read_csv(test_path)

In [82]:
# (rows, columns) of the reloaded train and test label frames.
print(train_labels.shape)
print(test_labels.shape)

(2000, 12)
(676, 12)
