In [21]:
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path


# 1 raw data to soup object 

In [2]:
train_file = Path.cwd().parent.joinpath('raw-data/semeval-2016/train.xml')
train_file

PosixPath('/Users/smap10/Project/aspect-extraction/raw-data/semeval-2016/train.xml')

In [7]:
soup = None
with train_file.open(encoding="utf-8") as f:
    soup = BeautifulSoup(f.read().strip(), "lxml-xml")
if soup is None:
    raise Exception("Can't read xml file")
sentence_nodes = soup.find_all("sentence")

In [8]:
for i, node in enumerate(sentence_nodes):
    print(node)
    print()
    if i > 5:
        break

<sentence id="1004293:0">
<text>Judging from previous posts this used to be a good place, but not any longer.</text>
<Opinions>
<Opinion category="RESTAURANT#GENERAL" from="51" polarity="negative" target="place" to="56"/>
</Opinions>
</sentence>

<sentence id="1004293:1">
<text>We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.</text>
<Opinions>
<Opinion category="SERVICE#GENERAL" from="75" polarity="negative" target="staff" to="80"/>
</Opinions>
</sentence>

<sentence id="1004293:2">
<text>They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.</text>
<Opinions>
<Opinion category="SERVICE#GENERAL" from="0" polarity="negative" target="NULL" to="0"/>
</Opinions>
</sentence>

<sentence id="1004293:3">
<text>The food was lousy - too sweet or too salty and the portions tiny.</text>
<Opinions>
<Opinion category="FOOD#QUALITY" from="4" polari

# 2  convert soup object to a list of dictionaries

In [9]:
def soup2dict(sentence_nodes):
    """
    Input: a soup object, e.g. soup.find_all("sentence")
    Output: a list of dictionaries, contains id, text, aspect terms
    """
    sentences = []
    i = 0
    for n in sentence_nodes:
        i += 1
        sentence = {}
        aspect_term = []
        sentence['id'] = i
        sentence['text'] = n.find('text').string
        if n.find('Opinions'):
            for c in n.find('Opinions').contents:
                if c.name == 'Opinion':
                    if c['target'] not in aspect_term:
                        aspect_term.append(c['target'])

        sentence['aspect'] = aspect_term
        sentences.append(sentence)
        
    return sentences

In [10]:
sentences = soup2dict(sentence_nodes)

In [11]:
sentences[:5]

[{'aspect': ['place'],
  'id': 1,
  'text': 'Judging from previous posts this used to be a good place, but not any longer.'},
 {'aspect': ['staff'],
  'id': 2,
  'text': 'We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.'},
 {'aspect': ['NULL'],
  'id': 3,
  'text': 'They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.'},
 {'aspect': ['food', 'portions'],
  'id': 4,
  'text': 'The food was lousy - too sweet or too salty and the portions tiny.'},
 {'aspect': ['NULL'],
  'id': 5,
  'text': 'After all that, they complained to me about the small tip.'}]

See how many sentences contain aspect terms

In [12]:
# No sentence contain two NULL target (aspect term)
aspect_zero = 0
aspect_one = 0

for s in sentences:
    # aspect == [] or aspect == ['NULL']
    if len(s['aspect'])==0 or s['aspect'][0] == 'NULL':
        aspect_zero += 1
    else:
        aspect_one += 1
        
print(len(sentences)) # Total 2000 sentences
print(aspect_zero) # xxx sentences have no aspect term
print(aspect_one) # xxx sentences have at least 1 aspect term

2000
791
1209


We have total 2000 sentences, and only 1209 sentences have 1 aspect term at least.

# 3 list to dataframe

## 3.1 split text to words

Here we preserve the punctuation

In [17]:
def split2words(s_text):
    """Split string with white and prereserve the punctuation
    Input:
        s_text: string, a sentence, e.g. Judging from previous posts this used to be a good place, but not any longer.
    Output:
        words: a list of words, e.g. ['judging', 'from', 'previous', 'posts', 'this', 'used', 'to', 'be', 'a', 'good', 'place', ',', 'but', 'not', 'any', 'longer', '.']
    """
    s_text = re.sub('([.,!?()])', r' \1 ', s_text) # match the punctuation characters and surround them by spaces,
    s_text = re.sub('\s{2,}', ' ', s_text)         # collapse multiple spaces to one space
    words = s_text.lower().split()
    return words

## 3.2 taggin word with IOB format

In [18]:
def tagging_IOB(s, aspects):
    """Assigning IOB tag to each word in s
    Input: 
        s: sentences, a list of words, e.g. ['judging', 'from', 'previous', 'posts']
        aspects: a list of aspect term, e.g. ['a good place', 'Posts']
    Output:
        tag: a list of tag, e.g. ['O', 'O', 'O', 'B']
    """
    tags = ['O'] * len(s)

    for aspect in aspects:
        pre_index = 0
        for word in s: 
            if word in aspect: # 'good' in 'a good place'
                cur_index = s.index(word) 
                if cur_index - pre_index == 1: # inside an aspect term
                    tags[cur_index] = 'I'
                else:                       # beginning of an aspect term
                    tags[cur_index] = 'B'
                pre_index = cur_index 
    return tags


# 3.3 convert a list of dict to dataframe

In [19]:
def dict2df(sentences):
    """Convert list of dict to dataframe
    Input: 
        sentences: a list of dictionaries, contains id, text, aspect terms. The output of raw2dict
    Output:
        data: a dataframe with three columns, sentence id, words, tag with IOB format
    """
    data = pd.DataFrame()
    for s in sentences:
        sentence = {}
        sentence['Sentence #'] = s['id']
        sentence['Word'] = split2words(s['text'])  # split text to words
        s_length = len(sentence['Word']) # the length of sentence, used to generate tag
        if len(s['aspect'])==0 or s['aspect'][0] == 'NULL': # tagging: if no aspect term
            sentence['Tag'] = ['O'] * s_length
        else:                                               # IOB format tag if aspect exist
            aspect_terms = [x.lower() for x in s['aspect']]  
            sentence['Tag'] = tagging_IOB(sentence['Word'], aspect_terms)

        # convert each setence to dataframe 
        sentence_df = pd.DataFrame.from_dict(sentence)
        data = data.append(sentence_df, ignore_index=True)
    
    return data

In [22]:
data = dict2df(sentences)

In [25]:
data.tail()

Unnamed: 0,Sentence #,Tag,Word
28641,2000,O,would
28642,2000,O,retrain
28643,2000,O,the
28644,2000,B,staff
28645,2000,O,.


# 4 Sum it up

In [26]:
# 1 raw data to soup

train_file = Path.cwd().parent.joinpath('raw-data/semeval-2016/train.xml')
soup = None
with train_file.open(encoding="utf-8") as f:
    soup = BeautifulSoup(f.read().strip(), "lxml-xml")
if soup is None:
    raise Exception("Can't read xml file")
sentence_nodes = soup.find_all("sentence")

# 2  convert soup object to a list of dictionaries
sentences = soup2dict(sentence_nodes)

# 3 list to dataframe
data = dict2df(sentences)

In [28]:
data[:10]

Unnamed: 0,Sentence #,Tag,Word
0,1,O,judging
1,1,O,from
2,1,O,previous
3,1,O,posts
4,1,O,this
5,1,O,used
6,1,O,to
7,1,O,be
8,1,B,a
9,1,O,good


# 5 Convert step 4 to a function

In [29]:
def read_data(file_path):
    # 1 raw data to soup
    soup = None
    with file_path.open(encoding="utf-8") as f:
        soup = BeautifulSoup(f.read().strip(), "lxml-xml")
    if soup is None:
        raise Exception("Can't read xml file")
    sentence_nodes = soup.find_all("sentence")

    # 2  convert soup object to a list of dictionaries
    sentences = soup2dict(sentence_nodes)

    # 3 list to dataframe
    data = dict2df(sentences)
    
    return data

# 6 Test read_data on test data

In [34]:
test_path = Path.cwd().parent.joinpath('raw-data/semeval-2016/test.xml')

In [35]:
test_data = read_data(test_path)

In [37]:
test_data.tail()

Unnamed: 0,Sentence #,Tag,Word
9864,676,O,was
9865,676,O,good
9866,676,O,","
9867,676,O,too
9868,676,O,.


# 7 Save preprocessed data

In [38]:
save_file = Path.cwd().parent.joinpath('data/semeval-2016/test.csv')

data.to_csv(save_file, index=False)