# Steps
1. Preprocessing

In [45]:
#! python -m spacy download en_core_web_sm

In [144]:
import torch
import torchvision
import torchvision.transforms as transforms
import pandas as pd
import numpy as np
import re
import torch.utils.data
import matplotlib.pyplot as plt
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
from tqdm import tqdm

In [145]:
# load data
# Both splits are tab-separated files read without a header row (header=None),
# so the columns carry integer labels; `preprocessing` maps labels 0-4 to names.
devdata = pd.read_csv('data/devdata.csv', sep = '\t', header = None)
traindata = pd.read_csv('data/traindata.csv', sep = '\t', header = None)

In [146]:
# basic preprocessing
def preprocessing(df, nlp):
    """Name the columns, parse the character span and clean the sentences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are labelled 0-4; they are renamed to
        'sentiment', 'aspect', 'target', 'character' and 'sentence'.
        Each 'character' value must look like "<begin>:<end>".
    nlp : callable
        Called with one lower-cased sentence; must return an iterable of
        tokens exposing `orth_` and `is_punct`.

    Returns
    -------
    pandas.DataFrame
        A renamed copy of `df` (the input frame is left untouched) with two
        extra integer columns, 'begin' and 'end', and with 'sentence'
        replaced by its lower-cased, punctuation-free tokens joined by
        single spaces.

    NOTE(review): 'begin'/'end' are taken verbatim from 'character'. If they
    index into the raw sentence they will presumably no longer line up with
    the cleaned 'sentence' text -- confirm before using them downstream.
    """
    def _span_part(position):
        # Build a parser for one side of the "<begin>:<end>" string.
        return lambda span: int(span.split(":")[position])

    def _clean(text):
        # lower-case -> tokenise -> drop punctuation tokens -> re-join
        doc = nlp(text.lower())
        kept = [token.orth_ for token in doc if not token.is_punct]
        return " ".join(kept)

    column_names = {0: 'sentiment', 1: 'aspect', 2: 'target', 3: 'character', 4: 'sentence'}
    df = df.rename(columns=column_names)

    df['begin'] = df['character'].apply(_span_part(0))
    df['end'] = df['character'].apply(_span_part(1))
    df['sentence'] = df['sentence'].apply(_clean)

    return df


In [155]:
# Apply the basic preprocessing to both splits, sharing the spaCy pipeline `nlp`.
# `preprocessing` starts from a renamed copy, so `devdata` / `traindata` keep
# their raw contents.
devdata_df = preprocessing(devdata, nlp)
traindata_df = preprocessing(traindata, nlp)

# labels = df['sentiment']

## Model 1: Hybrid Classification
1. Detecting aspect terms 
    * training on liblinear
2. Corresponding polarity
    * classifier
3. Detecting the aspect categories
    * classifier
4. Corresponding polarities
    * classifier

### 1. Aspect Category Classification

## Model 2: BERT model

In [148]:
#!pip install pytorch-pretrained-bert pytorch-nlp

from pytorch_pretrained_bert import BertTokenizer

In [149]:
# Pre-processing using BERT's BasicTokenizer or BertTokenizer
def preprocessing_BERT(df):
    """Placeholder for BERT-specific pre-processing.

    Currently a no-op: `df` is returned unchanged (the very same object).
    """
    
    
    return df

In [157]:
# Module-level BertTokenizer built from the 'bert-base-uncased' pretrained
# vocabulary with lower-casing enabled; `prep_BERT` reads this global name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def prep_BERT(df):
    """Turn the 'sentence' column into BERT-ready tokens, ids and tensors.

    Uses the module-level `tokenizer` (its `tokenize` and
    `convert_tokens_to_ids` methods).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'sentence' column of plain strings.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` in which
        * 'sentence' holds the token list of "[CLS] <sentence> [SEP]",
        * 'id_tokens' holds the matching vocabulary indices,
        * 'tokens_tensor' holds a torch tensor of shape (1, n_tokens).

    The input frame is not modified: the work is done on a copy so the raw
    sentences stay available and the function can be called again on the
    same input.
    """
    # Work on a copy -- assigning columns on `df` itself would overwrite the
    # caller's raw 'sentence' strings with token lists.
    df = df.copy()

    # sentence preparation (BERT standards)
    df['sentence'] = df['sentence'].apply(lambda x: "[CLS] " + x + " [SEP]")
    # tokenisation
    df['sentence'] = df['sentence'].apply(lambda x: tokenizer.tokenize(x))
    # convert tokens to vocabulary indices
    df['id_tokens'] = df['sentence'].apply(lambda x: tokenizer.convert_tokens_to_ids(x))
    # convert inputs to PyTorch tensors; the extra list adds a batch dimension of 1
    df['tokens_tensor'] = df['id_tokens'].apply(lambda x: torch.tensor([x]))

    return df

In [158]:
# Tokenise both preprocessed splits for BERT. The results are bound back to the
# same names, so this cell is not re-runnable as-is: on a second run `prep_BERT`
# would receive frames whose 'sentence' column already holds token lists.
devdata_df = prep_BERT(devdata_df)
traindata_df = prep_BERT(traindata_df)

# TODO: something weird happens during tokenisation (see example 135) -- investigate

# `.py` files from SRC

## Classifier 

In [None]:
class Classifier:
    """The Classifier

    Skeleton only: both methods below consist of a docstring and nothing
    else, so calling either of them currently returns None.
    """


    #############################################
    def train(self, trainfile):
        """Trains the classifier model on the training set stored in file trainfile

        NOTE(review): not implemented yet -- the body is empty.
        """


    def predict(self, datafile):
        """Predicts class labels for the input instances in file 'datafile'
        Returns the list of predicted labels

        NOTE(review): not implemented yet -- the body is empty, so the call
        currently returns None rather than a list of labels.
        """


## Tester

In [None]:
import time, sys
import numpy as np

from classifier import Classifier


def set_reproducible():
    """Put the random number generators into a fixed, known state.

    Sets the PYTHONHASHSEED environment variable to '0', seeds NumPy's
    global generator with 17 and Python's `random` module with 12345.

    NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    setting it here presumably only affects child processes, not string
    hashing in the already-running interpreter -- confirm if that matters.
    """
    import os
    import random

    # Hash seed, exported through the environment.
    os.environ['PYTHONHASHSEED'] = '0'
    # NumPy's global generator.
    np.random.seed(17)
    # Core Python generator.
    random.seed(12345)



def load_label_output(filename):
    """Read the first tab-separated field of every non-blank line.

    The file is opened as UTF-8 text. Each line is stripped of surrounding
    whitespace first; lines that are empty after stripping are skipped.

    Returns the collected fields as a list of strings, in file order.
    """
    labels = []
    with open(filename, 'r', encoding='UTF-8') as handle:
        for raw_line in handle:
            row = raw_line.strip()
            if not row:
                continue
            # Everything before the first tab (the whole row if there is none).
            labels.append(row.partition("\t")[0])
    return labels



def eval_list(glabels, slabels):
    """Return the accuracy, in percent, of system labels against gold labels.

    Parameters
    ----------
    glabels : sequence
        Gold labels.
    slabels : sequence
        System (predicted) labels, compared position by position.

    Returns
    -------
    float
        100 * (matching positions) / n, where n is the length of the shorter
        sequence. When the lengths differ a warning is printed and only the
        first n positions are compared. When n is 0 (either sequence is
        empty) there is nothing to score and 0.0 is returned instead of
        dividing by zero.
    """
    if len(glabels) != len(slabels):
        print("\nWARNING: label count in system output (%d) is different from gold label count (%d)\n" % (
        len(slabels), len(glabels)))
    n = min(len(slabels), len(glabels))
    if n == 0:
        # No comparable pair: avoid ZeroDivisionError and report zero accuracy.
        return 0.0
    # zip stops at the shorter sequence, i.e. after exactly n pairs.
    incorrect_count = sum(1 for gold, system in zip(glabels, slabels) if system != gold)
    acc = (n - incorrect_count) / n
    return acc*100



def train_and_eval(classifier, trainfile, devfile, testfile, run_id):
    """Train `classifier` once and report its accuracy on dev (and test).

    Parameters
    ----------
    classifier : object exposing `train(path)` and `predict(path)`.
    trainfile, devfile : paths handed to `train` / `predict`.
    testfile : path or None; when None the test evaluation is skipped.
    run_id : identifier used only in the progress messages.

    Returns
    -------
    tuple
        (devacc, testacc) as returned by `eval_list`; testacc is -1 when
        `testfile` is None.
    """
    def _accuracy_on(datafile):
        # Predict, read the gold labels from the same file, then score.
        predicted = classifier.predict(datafile)
        gold = load_label_output(datafile)
        return eval_list(gold, predicted)

    print(f"\nRUN: {run_id}")
    tag = str(run_id)

    print("  %s.1. Training the classifier..." % tag)
    classifier.train(trainfile)
    print()

    print("  %s.2. Eval on the dev set..." % tag, end="")
    devacc = _accuracy_on(devfile)
    print(" Acc.: %.2f" % devacc)

    if testfile is None:
        testacc = -1
    else:
        # Evaluation on the test data
        print("  %s.3. Eval on the test set..." % tag, end="")
        testacc = _accuracy_on(testfile)
        print(" Acc.: %.2f" % testacc)

    print()
    return (devacc, testacc)


if __name__ == "__main__":
    # Script entry point: repeat train/eval `n_runs` times and summarise.
    set_reproducible()

    # Number of runs: first command-line argument if given, otherwise 5.
    n_runs = int(sys.argv[1]) if len(sys.argv) > 1 else 5

    datadir = "../data/"
    trainfile = datadir + "traindata.csv"
    devfile = datadir + "devdata.csv"
    # No test set by default; switch to the commented line to evaluate on one.
    testfile = None
    # testfile = datadir + "testdata.csv"

    # Runs -- a fresh Classifier is built for every run.
    start_time = time.perf_counter()
    devaccs, testaccs = [], []
    for i in range(1, n_runs + 1):
        classifier = Classifier()
        devacc, testacc = train_and_eval(classifier, trainfile, devfile, testfile, i)
        devaccs.append(np.round(devacc, 2))
        testaccs.append(np.round(testacc, 2))
    print('\nCompleted %d runs.' % n_runs)
    total_exec_time = time.perf_counter() - start_time

    # Per-run accuracies, then mean with standard deviation in parentheses.
    print("Dev accs:", devaccs)
    print("Test accs:", testaccs)
    print()
    dev_summary = (np.mean(devaccs), np.std(devaccs))
    test_summary = (np.mean(testaccs), np.std(testaccs))
    print("Mean Dev Acc.: %.2f (%.2f)" % dev_summary)
    print("Mean Test Acc.: %.2f (%.2f)" % test_summary)
    print("\nExec time: %.2f s. ( %d per run )" % (total_exec_time, total_exec_time / n_runs))