In [1]:
import os
import xml.etree.ElementTree as ET
import logging
import sys
import warnings
import copy
from functools import reduce
from typing import List

import numpy as np
import torch as th
from sklearn.model_selection import KFold, GridSearchCV
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from absa import train_reviews_path, test_reviews_path, TEST_APPENDIX, word2vec_model_path, \
                parsed_reviews_dump_path, PROGRESSBAR_COLUMNS_NUM, images_path
from absa.preprocess.spell_check import spell_check
from absa.preprocess.dependency import dep_parse_reviews
from absa.models.level.sentence.aspect.classifier import AspectClassifier as SentenceAspectClassifier
from absa.models.level.opinion.aspect.classifier import AspectClassifier as OpinionAspectClassifier
from absa.utils.embedding import Embeddings
from absa.utils.dump import load_dump

# One shared seed for NumPy and PyTorch (including the CUDA generator).
seed = 42
for _seed_rng in (np.random.seed, th.manual_seed, th.cuda.manual_seed):
    _seed_rng(seed)

# Let INFO-level records from the root logger through.
logging.basicConfig(level=logging.INFO)

In [2]:
# Pull both lookup structures off the project's Embeddings class in one step.
vocabulary, embeddings_matrix = Embeddings.vocabulary, Embeddings.embeddings_matrix

# Prepare data

In [3]:
# Load the train dump first, then the test dump stored under the same path
# with TEST_APPENDIX added to it.
train_texts, test_texts = (
    load_dump(pathway=dump_path)
    for dump_path in (parsed_reviews_dump_path,
                      parsed_reviews_dump_path + TEST_APPENDIX)
)

INFO:root:Upload from dump: /home/dmitry/Projects/absa/dumps/data/dep_parsed_sentence
INFO:root:Upload from dump: /home/dmitry/Projects/absa/dumps/data/dep_parsed_sentence.test


### Display

In [4]:
SCORE_NAME = 'F1'
# Number of decimals used when printing a non-integer parameter / a score in the legend.
PARAMETER_DECIMAL_LEN = 5
SCORE_DECIMAL_LEN = 3


def display_score(parameter_values: List,
                  train_values: np.array,
                  val_values: np.array,
                  parameter_name='Epoch',
                  score_name=SCORE_NAME) -> float:
    """Plot train and validation scores against a parameter and mark the best one.

    A new 10x10 figure is created with one curve for the train scores, one for
    the validation scores and a marker on the first maximal validation score.
    The legend names both curves and reports the maximum together with the
    parameter value at which it is reached.

    Parameters
    ----------
    parameter_values : List
        Values for the x axis, one per score.
    train_values : np.array
        Train scores, aligned with `parameter_values`.
    val_values : np.array
        Validation scores, aligned with `parameter_values`.
    parameter_name : str
        Name of the parameter, used for the x label, title and legend.
    score_name : str
        Name of the score, used for the y label, title and legend.

    Returns
    -------
    float
        The maximal validation score.

    Raises
    ------
    ValueError
        If `val_values` is empty.
    """
    val_scores = np.asarray(val_values, dtype=float)
    if val_scores.size == 0:
        raise ValueError('val_values must contain at least one score')

    # argmax picks the first maximum in a single pass (the previous scan
    # recomputed max(val_values) for every element).
    best_index = int(np.argmax(val_scores))
    max_param = parameter_values[best_index]
    max_acc = float(val_scores[best_index])

    plt.figure(figsize=(10, 10))
    plt.grid(True, alpha=0.3)
    plt.xlim(left=min(parameter_values), right=max(parameter_values))

    # Label the curves themselves so that a single legend call can show them
    # together with the maximum marker.
    plt.plot(parameter_values, train_values, color='blue', label='Train')
    plt.plot(parameter_values, val_values, color='#EE6B24', label='Validation')

    # Integers (Python or NumPy) are printed as is, anything else with a fixed
    # number of decimals.
    if isinstance(max_param, (int, np.integer)):
        param_text = f'{max_param}'
    else:
        param_text = f'{max_param:.{PARAMETER_DECIMAL_LEN}f}'
    plt.scatter([max_param], [max_acc],
                color='#EE6B24',
                zorder=3,
                label=f'Maximal {score_name}={max_acc:.{SCORE_DECIMAL_LEN}f}' +
                      f' when {parameter_name}={param_text}')

    ax = plt.gca()
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.title(f'Dependence of {score_name} from {parameter_name}')
    plt.xlabel(f'{parameter_name}')
    plt.ylabel(score_name.capitalize())
    # One legend call only: a second call would replace the first legend.
    plt.legend()
    return max_acc

In [5]:
# Fold count shared by the cross-validation loops below, which also divide
# their accumulated histories by it to obtain per-fold means.
splits_number = 5
# K-fold splitter with that many folds; no shuffle argument is passed.
kf = KFold(n_splits=splits_number)

# Sentence-Level Aspect Classification

### Optimal parameters

In [6]:
# Epoch count passed to classifier.fit in the cross-validation loop further down.
num_epoch = 100
# Number of folds handed to GridSearchCV.
cv = 3

In [7]:
# Search grid: a single batch size and two-layer shapes whose first layer is
# twice as wide as the second.
parameters = dict(
    batch_size=[100],
    nn_params=[{'layers_dim': np.array((2 * width, width))}
               for width in range(20, 24, 2)],
)
clf = GridSearchCV(estimator=SentenceAspectClassifier(),
                   param_grid=parameters,
                   cv=cv)
# The extra keyword arguments are passed through the grid search's fit call.
clf.fit(train_texts,
        vocabulary=vocabulary,
        embeddings=embeddings_matrix,
        save_state=False)

100%|███████████████████████████████████████████████████████████████| 50/50 [00:37<00:00,  1.33it/s]
100%|███████████████████████████████████████████████████████████████| 50/50 [00:36<00:00,  1.36it/s]
100%|███████████████████████████████████████████████████████████████| 50/50 [00:38<00:00,  1.30it/s]
100%|███████████████████████████████████████████████████████████████| 50/50 [00:37<00:00,  1.32it/s]
  4%|██▌                                                             | 2/50 [00:01<00:32,  1.46it/s]

  precision = correct / total_predictions


100%|███████████████████████████████████████████████████████████████| 50/50 [00:35<00:00,  1.41it/s]
  6%|███▊                                                            | 3/50 [00:02<00:33,  1.39it/s]

  f1 = 2 * (precision * recall) / (precision + recall)


100%|███████████████████████████████████████████████████████████████| 50/50 [00:38<00:00,  1.29it/s]
100%|███████████████████████████████████████████████████████████████| 50/50 [00:56<00:00,  1.12s/it]


GridSearchCV(cv=3, error_score=nan,
             estimator=AspectClassifier(batch_size=100,
                                        nn_params=<frozendict {'layers_dim': array([40])}>,
                                        num_epoch=50,
                                        optimizer_class=<class 'torch.optim.adam.Adam'>,
                                        optimizer_params=<frozendict {'lr': 0.01}>),
             iid='deprecated', n_jobs=None,
             param_grid={'batch_size': [100],
                         'nn_params': [{'layers_dim': array([40, 20])},
                                       {'layers_dim': array([44, 22])}]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [8]:
# Show the grid search report: fit/score times, the tried parameters and the
# per-split, mean and std test scores of every candidate.
clf.cv_results_

{'mean_fit_time': array([39.71655027, 38.20819012]),
 'std_fit_time': array([1.85015788, 1.43609684]),
 'mean_score_time': array([0.55217187, 0.50693202]),
 'std_score_time': array([0.05934784, 0.06081869]),
 'param_batch_size': masked_array(data=[100, 100],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_nn_params': masked_array(data=[{'layers_dim': array([40, 20])},
                    {'layers_dim': array([44, 22])}],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'batch_size': 100, 'nn_params': {'layers_dim': array([40, 20])}},
  {'batch_size': 100, 'nn_params': {'layers_dim': array([44, 22])}}],
 'split0_test_score': array([0.71224018, 0.7271854 ]),
 'split1_test_score': array([0.73966579, 0.71981132]),
 'split2_test_score': array([0.70990056, 0.71992819]),
 'mean_test_score': array([0.72060218, 0.7223083 ]),
 'std_test_score': array([0.0135138 , 0.00344896]),
 'rank_test_score': ar

In [9]:
# NOTE(review): `stop` is not defined anywhere visible, so this cell raises
# NameError. Presumably a deliberate guard that halts "Run All" before the
# cells below - confirm, then remove or replace with an explicit mechanism.
stop

NameError: name 'stop' is not defined

In [9]:
# Cross-validate the sentence-level classifier: sum what fit() returns for
# train and validation over the folds, sum the fitted thresholds, then divide
# everything by the fold count to obtain means.
# assumes fit() returns two sequences of length num_epoch (train, validation)
# - TODO confirm against SentenceAspectClassifier.fit
sentence_train_f1_history = np.zeros(shape=(num_epoch,))
sentence_val_f1_history = np.zeros(shape=(num_epoch,))
threshold = None

for train_index, val_index in kf.split(train_texts):
    classifier = SentenceAspectClassifier(vocabulary=vocabulary, emb_matrix=embeddings_matrix)
    t, v = classifier.fit(
        train_texts=[train_texts[x] for x in train_index],
        val_texts=[train_texts[x] for x in val_index],
        num_epoch=num_epoch)
    sentence_train_f1_history += t
    sentence_val_f1_history += v
    # Out-of-place addition: `threshold` starts as a reference to the first
    # classifier's own threshold object, so `+=` would write into that
    # classifier's state instead of building a separate sum.
    threshold = (classifier.threshold if threshold is None
                 else threshold + classifier.threshold)
sentence_train_f1_history /= splits_number
sentence_val_f1_history /= splits_number
# True division into a new object, for the same aliasing reason; it also works
# if the threshold has an integer dtype, where in-place `/=` raises.
threshold = threshold / splits_number

100%|█████████████████████████████████████████████████████████████| 100/100 [00:58<00:00,  1.71it/s]
100%|█████████████████████████████████████████████████████████████| 100/100 [00:57<00:00,  1.74it/s]
100%|█████████████████████████████████████████████████████████████| 100/100 [00:58<00:00,  1.70it/s]
100%|█████████████████████████████████████████████████████████████| 100/100 [00:56<00:00,  1.76it/s]
100%|█████████████████████████████████████████████████████████████| 100/100 [00:57<00:00,  1.74it/s]


In [10]:
# Persist the averaged histories and the mean threshold with IPython's %store.
%store sentence_train_f1_history sentence_val_f1_history threshold

Stored 'sentence_train_f1_history' (ndarray)
Stored 'sentence_val_f1_history' (ndarray)
Stored 'threshold' (ndarray)


In [None]:
# No names are given, so every variable kept in the %store database is
# restored; presumably this lets the cells below run without repeating the
# cross-validation - confirm.
%store -r

In [None]:
# Plot the sentence-level train/validation F1 histories against the epoch
# index, then write the current figure to a PDF under images_path.
display_score(
    parameter_values=list(range(num_epoch)),
    train_values=sentence_train_f1_history,
    val_values=sentence_val_f1_history,
    score_name='F1 score',
)
plt.savefig(
    os.path.join(images_path, 'sentence_level_aspect_classifier_f1_vs_epoch.pdf')
)

### Fit

In [None]:
# Fresh sentence-level classifier trained on the whole training set (no
# validation texts and no num_epoch are passed here).
classifier = SentenceAspectClassifier(vocabulary=vocabulary,
                                      emb_matrix=embeddings_matrix)
# Start from the threshold averaged over the folds; fixed_threshold=True
# presumably keeps fit() from re-tuning it - confirm against the classifier.
_ = classifier.fit(train_texts=train_texts,
                   init_threshold=threshold,
                   fixed_threshold=True)

### Predict

In [None]:
# Work on a deep copy: test_texts itself is passed untouched to score() below.
test_texts_pred = copy.deepcopy(test_texts)
for text in test_texts_pred:
    # presumably drops the existing opinions so predict() starts from
    # unlabeled texts - confirm against reset_opinions
    text.reset_opinions()

test_texts_pred = classifier.predict(test_texts_pred)
# Compare the predictions with the original test texts.
score = classifier.score(texts=test_texts, texts_pred=test_texts_pred)
score

# Opinion-Level Aspect Classification

### Optimal epoch

In [None]:
# Cross-validate the opinion-level classifier: sum what fit() returns for
# train and validation over the folds, then divide by the fold count.
# assumes fit() returns two sequences of length num_epoch (train, validation)
# - TODO confirm against OpinionAspectClassifier.fit
num_epoch = 100
# Builtin `float` replaces the `np.float` alias, which NumPy 1.24 removed; the
# resulting dtype (float64) is the same.
opinion_train_f1_history = np.zeros(shape=(num_epoch,), dtype=float)
opinion_val_f1_history = np.zeros(shape=(num_epoch,), dtype=float)

for train_index, val_index in kf.split(train_texts):
    classifier = OpinionAspectClassifier(vocabulary=vocabulary,
                                         emb_matrix=embeddings_matrix)
    t, v = classifier.fit(
        train_texts=[train_texts[x] for x in train_index],
        val_texts=[train_texts[x] for x in val_index],
        num_epoch=num_epoch)
    opinion_train_f1_history += t
    opinion_val_f1_history += v
opinion_train_f1_history /= splits_number
opinion_val_f1_history /= splits_number

In [None]:
# Persist the averaged opinion-level histories with IPython's %store.
%store opinion_train_f1_history opinion_val_f1_history

In [None]:
# No names are given, so every variable kept in the %store database is
# restored, not only the opinion-level histories.
%store -r

In [None]:
# Plot the opinion-level train/validation F1 histories against the epoch
# index, then write the current figure to a PDF under images_path.
display_score(
    parameter_values=list(range(num_epoch)),
    train_values=opinion_train_f1_history,
    val_values=opinion_val_f1_history,
    score_name='F1 score',
)
plt.savefig(
    os.path.join(images_path, 'target_level_aspect_classifier_f1_vs_epoch.pdf')
)

### Fit

In [None]:
# Fresh opinion-level classifier trained on the whole training set; this
# rebinds `classifier`, replacing the sentence-level model under that name.
classifier = OpinionAspectClassifier(vocabulary=vocabulary,
                                     emb_matrix=embeddings_matrix)
_ = classifier.fit(train_texts=train_texts)

### Predict

In [None]:
# NOTE(review): test_texts_pred is not rebuilt in this cell. It is whatever an
# earlier cell left behind (in this notebook, the sentence-level predictions)
# and it is overwritten by this call, so running the cell twice feeds
# predictions back into predict(). Confirm that this chaining is intended.
test_texts_pred = classifier.predict(test_texts_pred)
# Compare the predictions with the original test texts.
score = classifier.score(texts=test_texts, texts_pred=test_texts_pred)
print(f'{score}')