In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path
import anago
from anago.utils import load_data_and_labels, load_glove


In [11]:
train_path = Path.cwd().parent.joinpath('data/semeval-2016/train.csv')
test_path = Path.cwd().parent.joinpath('data/semeval-2016/test.csv')
print(train_path)
print(test_path)

/Users/smap10/Project/aspect-extraction/data/semeval-2016/train.csv
/Users/smap10/Project/aspect-extraction/data/semeval-2016/test.csv


In [12]:
# Read data
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)

In [13]:
data_train.tail()

Unnamed: 0,Sentence #,Tag,Word
28641,2000,O,would
28642,2000,O,retrain
28643,2000,O,the
28644,2000,B,staff
28645,2000,O,.


In [14]:
data_test.tail()

Unnamed: 0,Sentence #,Tag,Word
9864,676,O,was
9865,676,O,good
9866,676,O,","
9867,676,O,too
9868,676,O,.


In [20]:
def df2data(df):
    """Read data and labels from dataframe
    Input:
        df: three columns, ['Sentence #', 'Tag', 'Word']
    Output:
        data: datasize * ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
        label: datasize * ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
    """
    agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                 s["Tag"].values.tolist())]
    grouped = df.groupby("Sentence #").apply(agg_func)
    data = [[w[0] for w in s] for s in grouped]
    label = [[w[1] for w in s] for s in grouped]  
    
    return data, label

In [38]:
x_train, y_train = df2data(data_train)
x_test, y_test = df2data(data_test)

In [39]:
print(len(x_train))
print(len(x_test))
print(x_train[0])
print(y_train[0])

2000
676
['judging', 'from', 'previous', 'posts', 'this', 'used', 'to', 'be', 'a', 'good', 'place', ',', 'but', 'not', 'any', 'longer', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O']


In [32]:
# x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=42)
# print(len(x_train))
# print(len(x_valid))

1800
200


In [44]:
EMBEDDING_PATH = '../embedding_weights/glove.840B.300d.txt'

In [47]:
embeddings = load_glove(EMBEDDING_PATH)

In [50]:
for i, (key, value) in enumerate(embeddings.items()):
    print('{0}: {1}'.format(key, value[:5]))
    if i > 3:
        break

,: [-0.082752  0.67204  -0.14987  -0.064983  0.056491]
.: [ 0.012001  0.20751  -0.12578  -0.59325   0.12525 ]
the: [ 0.27204  -0.06203  -0.1884    0.023225 -0.018158]
and: [-0.18567   0.066008 -0.25209  -0.11725   0.26513 ]
to: [ 0.31924   0.06316  -0.27858   0.2612    0.079248]


In [51]:
len(embeddings)

2196016

In [53]:
# Use pre-trained word embeddings
model = anago.Sequence(embeddings=embeddings, word_embedding_dim=300)
model.fit(x_train, y_train, x_test, y_test, epochs=50)

Epoch 1/50
 - f1: 31.67
             precision    recall  f1-score   support

          B       0.49      0.22      0.30       600
          I       0.37      0.32      0.34       269

avg / total       0.45      0.25      0.31       869

Epoch 2/50
 - f1: 44.32
             precision    recall  f1-score   support

          B       0.60      0.39      0.47       600
          I       0.46      0.34      0.39       269

avg / total       0.56      0.37      0.44       869

Epoch 3/50
 - f1: 48.93
             precision    recall  f1-score   support

          B       0.61      0.42      0.50       600
          I       0.53      0.43      0.47       269

avg / total       0.58      0.42      0.49       869

Epoch 4/50
 - f1: 51.91
             precision    recall  f1-score   support

          B       0.66      0.42      0.51       600
          I       0.57      0.51      0.54       269

avg / total       0.63      0.45      0.52       869

Epoch 5/50
 - f1: 51.59
             precisi

 - f1: 56.32
             precision    recall  f1-score   support

          B       0.58      0.53      0.55       600
          I       0.61      0.57      0.59       269

avg / total       0.59      0.54      0.56       869

Epoch 29/50
 - f1: 55.27
             precision    recall  f1-score   support

          B       0.62      0.49      0.55       600
          I       0.60      0.53      0.56       269

avg / total       0.61      0.50      0.55       869

Epoch 30/50
 - f1: 56.02
             precision    recall  f1-score   support

          B       0.55      0.55      0.55       600
          I       0.63      0.55      0.59       269

avg / total       0.57      0.55      0.56       869

Epoch 31/50
 - f1: 55.98
             precision    recall  f1-score   support

          B       0.64      0.46      0.53       600
          I       0.66      0.57      0.61       269

avg / total       0.65      0.49      0.56       869

Epoch 32/50
 - f1: 56.81
             precision    r

# predict on test dataset

In [136]:
data = data_test

# Sentence class
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
#                                                            s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

getter = SentenceGetter(data)
sentences = getter.sentences # get all sentences
   
# Word2inx & Padding for X
X = [[word2idx.get(w[0], 0) for w in s] for s in sentences]
X_test = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)

# Word2inx & Padding for y
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=0)

# Get one-hot labels
y_test = [to_categorical(i, num_classes=n_tags) for i in y]

In [137]:
print(sentences[1])
print(X_test[1])
print(np.argmax(y_test[1], -1))

[('serves', 'O'), ('really', 'O'), ('good', 'O'), ('sushi', 'B'), ('.', 'O')]
[1183 3401 2591  176 1280    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0]
[1 1 1 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0]


In [153]:
# Predictions.
idx2word = {value: key for key, value in word2idx.items()}
idx2tag = {value: key for key, value in tag2idx.items()}


true_all = np.argmax(y_test, -1)

true_all_tags = [[idx2tag[idx] for idx in s if idx!=0] for s in true_all]

p_all = model.predict(np.array(X_test)) # (4796, 75, 18)
p_all= np.argmax(p_all, axis=-1) # (4796, 75)
p_all_tags = [[idx2tag[idx] for idx in s] for s in p_all] # ['B-gpe', 'O', 'O', 'O']

for i, true in enumerate(true_all_tags):
    length = len(true)
    p_all_tags[i] = p_all_tags[i][:length]

p_all_tags = [[x.replace('<pad>', 'O') for x in s] for s in p_all_tags]

# for (true_tag, p)

print(sentences[3])
print(X_test[3])
print(true_all_tags[3])
print(p_all_tags[3])

[('green', 'B'), ('tea', 'I'), ('creme', 'I'), ('brulee', 'I'), ('is', 'O'), ('a', 'B'), ('must', 'O'), ('!', 'O')]
[3286 1088    0    0  539 2177 3012 3425    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0]
['B', 'I', 'I', 'I', 'O', 'B', 'O', 'O']
['I', 'I', 'O', 'O', 'O', 'O', 'O', 'O']


In [154]:
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(sentences[3], true_all_tags[3], p_all_tags[3]):
    if w != 0:
        print("{:15}: {:5} {}".format(w[0], w[1], pred))


Word           ||True ||Pred
green          : B     I
tea            : I     I
creme          : I     O
brulee         : I     O
is             : O     O
a              : B     O
must           : O     O
!              : O     O


In [155]:
# Evaluation
from seqeval.metrics import f1_score, classification_report
print(f1_score(true_all_tags, p_all_tags))
print(classification_report(true_all_tags, p_all_tags))


0.41892832289492
             precision    recall  f1-score   support

          B       0.58      0.34      0.42       599
          I       0.45      0.37      0.41       269

avg / total       0.54      0.35      0.42       868

