# **Aspect Extraction Baseline with Handcrafted features**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Download some libraries

In [None]:
!pip install seqeval

In [None]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[K     |████████████████████████████████| 965 kB 4.1 MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6


In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Make the necessary imports

In [None]:
from nltk.tag import pos_tag
from seqeval.metrics import f1_score, classification_report
from sklearn_crfsuite import CRF, metrics
from sklearn.metrics import make_scorer,confusion_matrix
from pprint import pprint
from sklearn.metrics import f1_score,classification_report
from sklearn.pipeline import Pipeline
import string
import warnings
warnings.filterwarnings('ignore')

## Load the training/testing data. 

**input**: Iob format data, but with only one space separated colums - words and NEtags.

**output**: A list where each item is 2 lists.  sentence as a list of tokens, Aspect tags as a list for each token.

In [None]:
def load__data_conll(file_path):
    myoutput,words,tags = [],[],[]
    fh = open(file_path)
    for line in fh:
        line = line.strip()
        #print(line)
        if line=='':
            #Sentence ended.
            #print("-----------------------------")
            myoutput.append([words,tags])
            words,tags = [],[]
        else:
            word, tag = line.split()
            words.append(word)
            tags.append(tag)
    fh.close()
    return myoutput

In [None]:
data=load__data_conll("/content/drive/MyDrive/Restaurants_Train_v2_mod.iob")
data[0]

[['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us'],
 ['O', 'O', 'B-A', 'O', 'O', 'O', 'O', 'O']]

## Feature engineering


1.   **sent2feats(sentence)**.

     Get features for all words in the sentence Features:

 *   word context: a window of 2 words on either side of the current word, and current word.
 *   POS context: a window of 2 POS tags on either side of the current word, and current tag. 
 *   input: sentence as a list of tokens.
 *   output: list of dictionaries. each dict represents features for that word.

2. **get_feats_conll(IOB_data)**

  Extract features from the IOB data, after loading it.



In [43]:
def sent2feats(sentence):
    feats = []
    sen_tags = pos_tag(sentence) #This format is specific to this POS tagger!
    for i in range(0,len(sentence)):
        word = sentence[i]
        wordfeats = {}
       #word features: word, prev 2 words, next 2 words in the sentence.
        wordfeats['word'] = word
        if i == 0:
            wordfeats["prevWord"] = wordfeats["prevSecondWord"] = "<S>"
        elif i==1:
            wordfeats["prevWord"] = sentence[0]
            wordfeats["prevSecondWord"] = "</S>"
        else:
            wordfeats["prevWord"] = sentence[i-1]
            wordfeats["prevSecondWord"] = sentence[i-2]
        #next two words as features
        if i == len(sentence)-2:
            wordfeats["nextWord"] = sentence[i+1]
            wordfeats["nextNextWord"] = "</S>"
        elif i==len(sentence)-1:
            wordfeats["nextWord"] = "</S>"
            wordfeats["nextNextWord"] = "</S>"
        else:
            wordfeats["nextWord"] = sentence[i+1]
            wordfeats["nextNextWord"] = sentence[i+2]
        
        #POS tag features: current tag, previous and next 2 tags.
        wordfeats['tag'] = sen_tags[i][1]
        if i == 0:
            wordfeats["prevTag"] = wordfeats["prevSecondTag"] = "<S>"
        elif i == 1:
            wordfeats["prevTag"] = sen_tags[0][1]
            wordfeats["prevSecondTag"] = "</S>"
        else:
            wordfeats["prevTag"] = sen_tags[i - 1][1]

            wordfeats["prevSecondTag"] = sen_tags[i - 2][1]
            # next two words as features
        if i == len(sentence) - 2:
            wordfeats["nextTag"] = sen_tags[i + 1][1]
            wordfeats["nextNextTag"] = "</S>"
        elif i == len(sentence) - 1:
            wordfeats["nextTag"] = "</S>"
            wordfeats["nextNextTag"] = "</S>"
        else:
            wordfeats["nextTag"] = sen_tags[i + 1][1]
            wordfeats["nextNextTag"] = sen_tags[i + 2][1]
        #That is it! You can add whatever you want!
        feats.append(wordfeats)
    return feats

In [42]:
sent2feats(data[0][0][0:1])

[{'nextNextTag': '</S>',
  'nextNextWord': '</S>',
  'nextTag': '</S>',
  'nextWord': '</S>',
  'prevSecondTag': '<S>',
  'prevSecondWord': '<S>',
  'prevTag': '<S>',
  'prevWord': '<S>',
  'tag': 'CC',
  'word': 'But'}]

In [44]:
def get_feats_conll(IOB_data):
    feats = []
    labels = []
    for sentence in IOB_data:
        feats.append(sent2feats(sentence[0]))
        labels.append(sentence[1])
    return feats, labels

## Train a sequence model
1. **train_seq(X_train,Y_train,X_dev,Y_dev)**
   * CRF Model Training.
2. **print_cm(cm, labels)**
   * pretty print for confusion matrixes.
3. **get_confusion_matrix(y_true,y_pred,labels)**
   * python-crfsuite does not have a confusion matrix function, so writing it using sklearn's confusion matrix and print_cm from github.

In [45]:
def train_seq(X_train,Y_train,X_dev,Y_dev):
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=100, all_possible_states=True)
    #Just to fit on training data
    crf.fit(X_train, Y_train)
    labels = list(crf.classes_)
    #testing:
    y_pred = crf.predict(X_dev)
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(f1_score(Y_dev, y_pred))
    print(classification_report(Y_dev, y_pred))
    get_confusion_matrix(Y_dev, y_pred,labels=sorted_labels) 
    return Y_dev,y_pred,sorted_labels

In [32]:
def print_cm(cm, labels):
    print("\n")
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    # Print header
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        sum = 0
        for j in range(len(labels)):
            cell = "%{0}.0f".format(columnwidth) % cm[i, j]
            sum =  sum + int(cell)
            print(cell, end=" ") 
        print(sum) #Prints the total number of instances per cat at the end.

In [33]:
#python-crfsuite does not have a confusion matrix function, 
#so writing it using sklearn's confusion matrix and print_cm from github
def get_confusion_matrix(y_true,y_pred,labels):
    trues,preds = [], []
    for yseq_true, yseq_pred in zip(y_true, y_pred):
        trues.extend(yseq_true)
        preds.extend(yseq_pred)
    print_cm(confusion_matrix(trues,preds,labels=labels),labels) 

## Training
* **main() function** to start training a sequential classification model with CRF.

In [40]:
def main():
  
    train_path = '/content/drive/MyDrive/Restaurants_Train_v2_mod.iob'
    test_path = '/content/drive/MyDrive/Restaurants_Test_Gold_mod.iob'
        
    conll_train = load__data_conll(train_path)
    conll_dev = load__data_conll(test_path)
    
    print("Training a Sequence classification model with CRF")
    feats, labels = get_feats_conll(conll_train)
    #print(feats)
    devfeats, devlabels = get_feats_conll(conll_dev)
    global Y_dev,y_pred,sorted_labels
    Y_dev,y_pred,sorted_labels=train_seq(feats, labels, devfeats, devlabels)
    print("Done with sequence model")

if __name__=="__main__":
    main() 

Training a Sequence classification model with CRF
0.6358754027926959
              precision    recall  f1-score   support

           A       0.81      0.52      0.64      1134

   micro avg       0.81      0.52      0.64      1134
   macro avg       0.81      0.52      0.64      1134
weighted avg       0.81      0.52      0.64      1134



              O   B-A   I-A 
        O  9190    59    30 9279
      B-A   479   624    31 1134
      I-A   257    45   198 500
Done with sequence model
