# Imports:

In [2]:
import os
import numpy as np
import xml.etree.ElementTree as et
from nltk import word_tokenize
from nltk.tag import StanfordPOSTagger
import io, json
from tqdm.auto import tqdm
import pickle

print('imported')

imported


## Constants:

In [3]:
# JAVAHOME tells NLTK which Java executable to launch for the Stanford tagger.
# NOTE: this is a machine-specific absolute path -- adjust it to the local JDK.
java_env_path = 'C:/Program Files/Java/jdk1.8.0_261/bin/java.exe'
os.environ.update(JAVAHOME=java_env_path)

# POS tags (word classes, brackets and punctuation) that get an index below
tag_list = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
    'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB',
    'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
    'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'HYPH', '-LRB-', '-RRB-', 'AFX',
    'NFP', ',', '.', ':', '$', '#', "``", "''", '(', ')',
]
# 1-based index for every tag, assigned in sorted tag order (0 is left unused)
tag_idx = {tag: idx for idx, tag in enumerate(sorted(tag_list), start=1)}

# Stanford POS tagger: model and jar are both looked up under tagger_dir
tagger_dir = 'stanford-postagger/'
tagger = StanfordPOSTagger(
    model_filename=os.path.join(tagger_dir, 'models/english-left3words-distsim.tagger'),
    path_to_jar=os.path.join(tagger_dir, 'stanford-postagger.jar'),
    encoding='utf8',
)


# Data preprocessing:

In [129]:
def build_vocab(data_dir):
    """Build a token vocabulary from every XML file found in ``data_dir``.

    Each directory entry is parsed as XML. For every ``<sentence>`` element the
    content of its ``<text>`` child is tokenized with ``word_tokenize`` and the
    tokens are added to the vocabulary.

    Args:
        data_dir: Directory holding the XML files. A trailing path separator
            is optional.

    Returns:
        A tuple ``(vocab, max_sequence_length, data_length)`` where

        * ``vocab`` maps each distinct token to a 1-based index assigned in
          sorted token order (index 0 is never assigned),
        * ``max_sequence_length`` maps a file name to the largest token count
          of any sentence in that file,
        * ``data_length`` maps a file name to its number of sentences.

        A file without any ``<sentence>`` element has no entry in the last two
        dictionaries.
    """
    # placeholders
    max_sequence_length = {}
    data_length = {}
    vocab = set()

    # iterating over files
    for xml_file in os.listdir(data_dir):
        # os.path.join (instead of string concatenation) so that data_dir
        # works with or without a trailing separator
        root = et.parse(os.path.join(data_dir, xml_file)).getroot()
        for sentence in root.iter('sentence'):
            # findtext gives '' for a missing or empty <text> element, so such
            # a sentence counts as zero tokens instead of raising
            text = sentence.findtext('text', default='')
            tokens = word_tokenize(text)
            vocab.update(tokens)
            # updating metadata
            max_sequence_length[xml_file] = max(max_sequence_length.get(xml_file, 0), len(tokens))
            data_length[xml_file] = data_length.get(xml_file, 0) + 1

    # sorting for better readability (and deterministic indices)
    word_to_index = {word: index for index, word in enumerate(sorted(vocab), start=1)}
    return word_to_index, max_sequence_length, data_length

def build_data(vocab, source_file, max_sentence_length, data_length, save=True, out_dir='data/'):
    """Encode one XML data file as zero-padded index arrays plus its opinions.

    Args:
        vocab: Mapping token -> index; every token of ``source_file`` must be
            a key.
        source_file: Path of the XML file to encode.
        max_sentence_length: Width of the produced arrays; must be at least the
            token count of the longest sentence.
        data_length: Number of rows of the produced arrays; must be at least
            the number of ``<sentence>`` elements in the file.
        save: When true, write ``train_x.npy``, ``train_y.npy`` and
            ``opinions.json`` into ``out_dir``.
        out_dir: Output directory; it is created when it does not exist.

    Returns:
        A tuple ``(x, y, opinions)`` where

        * ``x`` is a float array of shape
          ``(data_length, 2, max_sentence_length)``: ``x[i, 0]`` holds the word
          indices and ``x[i, 1]`` the POS-tag indices of sentence ``i``,
        * ``y`` is a float array of shape ``(data_length, max_sentence_length)``
          with 1 on the first token of an opinion target, 2 on the following
          tokens of the same target and 0 elsewhere,
        * ``opinions`` is a list with, per sentence, a list of dicts with keys
          ``target``, ``category``, ``polarity``, ``first`` and ``last``
          (token positions of the target span).

    Raises:
        KeyError: If a token is missing from ``vocab`` or a tag is missing from
            ``tag_idx``.
    """
    # placeholders
    x = np.zeros((data_length, 2, max_sentence_length))  # indexes of words and pos_tags
    y = np.zeros((data_length, max_sentence_length))  # aspect_terms mask
    opinions = []

    root = et.parse(source_file).getroot()
    # progress bar for better readability; the context manager closes it even
    # when a sentence raises
    with tqdm(total=data_length) as pbar:
        # iterating over each sentence of the data file
        for s_idx, sentence in enumerate(root.iter('sentence')):
            # '' for a missing or empty <text> element instead of a crash
            text = sentence.findtext('text', default='')
            tokens = word_tokenize(text)
            # do not invoke the tagger on an empty sentence
            pos_tags = [tag_idx[tag] for _, tag in tagger.tag(tokens)] if tokens else []
            for w_idx, word in enumerate(tokens):
                x[s_idx, 0, w_idx] = vocab[word]
                x[s_idx, 1, w_idx] = pos_tags[w_idx]

            # iterating over each opinion of the given sentence
            sentence_opinions = []
            for opinion in sentence.iter('Opinion'):
                # 'from'/'to' are character offsets of the target in the text
                first, last = int(opinion.get('from')), int(opinion.get('to'))
                if last == 0:
                    # a span ending at offset 0 is empty: nothing to mark
                    continue
                # Character offsets -> token positions, by counting the tokens
                # of the text prefix. Computed unconditionally so that a target
                # starting at offset 0 gets position 0 rather than an unbound
                # (or stale, from the previous opinion) value.
                pre_seq_len = len(word_tokenize(text[:first]))
                post_seq_len = len(word_tokenize(text[:last]))
                # for training only identify aspect word, but not polarity
                y[s_idx, pre_seq_len] = 1
                # remaining tokens of the target; empty slice for one token
                y[s_idx, pre_seq_len + 1: post_seq_len] = 2
                # adding to list
                sentence_opinions.append({
                    'target': opinion.get('target'),
                    'category': opinion.get('category'),
                    'polarity': opinion.get('polarity'),
                    'first': pre_seq_len,
                    'last': post_seq_len
                })
            # adding to main opinions list
            opinions.append(sentence_opinions)
            pbar.update(1)

    # if save is true, save the data
    if save:
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        np.save(os.path.join(out_dir, 'train_x.npy'), x)
        np.save(os.path.join(out_dir, 'train_y.npy'), y)
        # must be opened for writing: json.dump cannot write to a read handle
        with open(os.path.join(out_dir, 'opinions.json'), 'w', encoding='utf8') as op_file:
            json.dump(opinions, op_file)
        print('Files saved at {}'.format(out_dir))
    return x, y, opinions

In [None]:
# Locations of the raw corpus and of the file used for training
data_dir = 'data/raw/'
train_file = 'ABSA16_Restaurants_Train_SB1_v2.xml'

# One pass over every raw file: vocabulary plus per-file statistics
vocab, max_sequence_length, data_size = build_vocab(data_dir)
print('vocab size: {}'.format(len(vocab)))
print('Max sequence length of each file: {}'.format(max_sequence_length))
print('Sentences count of each file: {}'.format(data_size))

# Encode the training file, sized by its own statistics, and persist the result
train_x, train_y, train_opinions = build_data(
    vocab=vocab,
    source_file=os.path.join(data_dir, train_file),
    max_sentence_length=max_sequence_length[train_file],
    data_length=data_size[train_file],
    save=True,
    out_dir='data/preprocessed/',
)

In [138]:
# Sanity check of the preprocessing output: the first ten vocabulary entries,
# then each array's shape followed by its first encoded sentence.
print('Vocab: {} ...'.format(list(vocab.items())[:10]))
# train_x[0] has two rows: word indices and POS-tag indices (zero-padded)
print('Train x shape: {}\n{}'.format(train_x.shape, train_x[0]))
# train_y[0] is the aspect-term mask of that same sentence
print('Train y shape: {}\n{}'.format(train_y.shape, train_y[0]))

Vocab: [('!', 1), ('#', 2), ('$', 3), ('%', 4), ('&', 5), ("'", 6), ("''", 7), ("'after", 8), ("'best", 9), ("'cuz", 10)] ...
Train x shape: (2000, 2, 68)
[[ 638. 2516. 3619. 3577. 4409. 4606. 4442. 1526. 1262. 2587. 3529.   25.
  1687. 3282. 1395. 3022.   28.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.]
 [  39.   16.   17.   25.   12.   38.   35.   37.   12.   17.   22.    6.
    10.   30.   30.   31.    8.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
     0.    0.    0.    0.    0.    0.    0.    0.]]
Tr