# Named Entity Recognition with MIT Restaurant Dataset

## Task Description

In this assignment, I will train a NER model using Conditional Random Fields (CRF) on the MIT Restaurant training data and report the model's precision, recall, and F1 score on the test dataset.



In [None]:
!pip install -q python-crfsuite

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.1 MB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.1 MB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m0.7/1.1 MB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from itertools import chain
import pycrfsuite

## Dataset
[MIT Restaurant Dataset](https://groups.csail.mit.edu/sls/downloads/restaurant/)

- The [train](https://groups.csail.mit.edu/sls/downloads/restaurant/restauranttrain.bio) data.
- The [test](https://groups.csail.mit.edu/sls/downloads/restaurant/restauranttest.bio) data.

In [None]:
%%capture
!rm -f restauranttrain.bio
!rm -f restauranttest.bio

!wget https://groups.csail.mit.edu/sls/downloads/restaurant/restauranttest.bio
!wget https://groups.csail.mit.edu/sls/downloads/restaurant/restauranttrain.bio

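Each sentence in the data files is stored as one `tag<TAB>word` pair per line, with a blank line between sentences. The sentence below, for example, will be loaded into a list of `(word, tag)` tuples.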

```
O	a
B-Rating	four
I-Rating	star
O	restaurant
B-Location	with
I-Location	a
B-Amenity	bar
```



In [None]:


def load_data(file_path):
    """Load a tab-separated BIO file into a list of tagged sentences.

    Every non-blank line must hold ``TAG<TAB>WORD``; blank lines separate
    sentences.

    Args:
        file_path (str): Path to data

    Returns:
        sentences: list of sentences, where each sentence is a list of
            (word, tag) tuples in file order

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    sentences = []
    sentence = []

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank line marks a sentence boundary; repeated blank lines
                # never produce empty sentences.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected 'TAG<TAB>WORD', "
                    f"got {line!r}"
                )
            tag, word = fields
            # The file stores the tag first; callers expect (word, tag).
            sentence.append((word, tag))

    # The file may not end with a blank line; keep the trailing sentence.
    if sentence:
        sentences.append(sentence)

    return sentences


In [None]:
# Parse both splits (train first, then test) into lists of tagged sentences.
train_sents, test_sents = map(load_data, ('restauranttrain.bio', 'restauranttest.bio'))

Let's check the number of sentences in train and test data

In [None]:
# Number of sentences loaded from the training file.
len(train_sents)

7660

In [None]:
# Number of sentences loaded from the test file.
len(test_sents)

1521

In [None]:
# Inspect the first training sentence as a list of (word, tag) tuples.
train_sents[0]

[('2', 'B-Rating'),
 ('start', 'I-Rating'),
 ('restaurants', 'O'),
 ('with', 'O'),
 ('inside', 'B-Amenity'),
 ('dining', 'I-Amenity')]

## Extracting features

In [None]:
def is_all_caps(word):
    """Return True when ``word`` is written entirely in upper case.

    A word qualifies only if it contains at least one cased character and
    every cased character is uppercase. Tokens with no letters (digits,
    punctuation, the empty string) are therefore never "all caps".

    Arguments:
        word (str): token to inspect
    Return:
        bool: True if the token is all upper case
    """
    # str.isupper() already requires a cased character, so digit-only,
    # punctuation-only and empty tokens are rejected without a separate check.
    return word.isupper()
def word2features(sentence, i):
    """Build the feature dictionary for the word at position ``i``.

    The features cover the lowercased current, previous and next words,
    two adjacent-word combinations, the word shape, prefixes and suffixes
    of length 1-4, and a capitalization flag.

    Arguments:
        sentence (list): list of words [w1, w2,...,w_n]
        i (int): index of the word
    Return:
        features (dict): dictionary of features
    """
    word = sentence[i]
    lowered = word.lower()
    # An empty string stands in for the missing neighbour at either end.
    prev_word = '' if i == 0 else sentence[i-1].lower()
    next_word = '' if i == len(sentence)-1 else sentence[i+1].lower()
    features = {

        # 'word.lower()' and 'current_word.lower()' carry the same value; both
        # keys are kept so the emitted feature set stays unchanged.
        'word.lower()': lowered,  # Word identity (lowercase)
        'prev_word.lower()': prev_word,  # Previous word identity
        'current_word.lower()': lowered,  # Current word identity
        'next_word.lower()': next_word,  # Next word
        'prev_word_current_word': prev_word + '||' + lowered,  # Previous word and current word combination
        'current_word_next_word': lowered + '||' + next_word,  # Current word and next word combination
        'word_shape': word_shape(word),  # Word shapes
        # Slices instead of word[0] / word[-1]: identical for non-empty words,
        # but an empty token yields '' rather than raising IndexError.
        'prefix_1': word[:1],
        'prefix_2': word[:2],
        'prefix_3': word[:3],
        'prefix_4': word[:4],
        'suffix_1': word[-1:],
        'suffix_2': word[-2:],
        'suffix_3': word[-3:],
        'suffix_4': word[-4:],
        'is_capitalized': int(word[:1].isupper())  # The first character of the current word is capitalized

    }

    return features

def word_shape(word):
    """Map a word to its shape string.

    Uppercase letters become 'X', all other letters become 'x', digits
    become 'd', and every remaining character is copied unchanged.
    """
    def _symbol(ch):
        # Letters are checked first, then digits; anything else passes through.
        if ch.isalpha():
            return 'X' if ch.isupper() else 'x'
        if ch.isdigit():
            return 'd'
        return ch

    return ''.join(_symbol(ch) for ch in word)

def sent2features(sentence):
    """
    Return one feature dictionary per word of the sentence, in order.

    sentence is a list of words [w1, w2,...,w_n]
    """
    feature_dicts = []
    for position in range(len(sentence)):
        feature_dicts.append(word2features(sentence, position))
    return feature_dicts


def sent2labels(sentence):
    """
    Return the tags of a tagged sentence, in order.

    sentence is a list of tuples (word, tag)
    """
    labels = []
    for _word, label in sentence:
        labels.append(label)
    return labels

def untag(sentence):
    """
    Return the words of a tagged sentence, in order, with the tags dropped.

    sentence is a list of tuples (word, tag)
    """
    words = []
    for word, _label in sentence:
        words.append(word)
    return words

In [None]:
# Show the first training sentence again, for comparison with the features extracted from it.
train_sents[0]

[('2', 'B-Rating'),
 ('start', 'I-Rating'),
 ('restaurants', 'O'),
 ('with', 'O'),
 ('inside', 'B-Amenity'),
 ('dining', 'I-Amenity')]

In [None]:
# Feature dictionary of the second word (index 1) of the first training sentence.
sent2features(untag(train_sents[0]))[1]

{'word.lower()': 'start',
 'prev_word.lower()': '2',
 'current_word.lower()': 'start',
 'next_word.lower()': 'restaurants',
 'prev_word_current_word': '2||start',
 'current_word_next_word': 'start||restaurants',
 'word_shape': 'xxxxx',
 'prefix_1': 's',
 'prefix_2': 'st',
 'prefix_3': 'sta',
 'prefix_4': 'star',
 'suffix_1': 't',
 'suffix_2': 'rt',
 'suffix_3': 'art',
 'suffix_4': 'tart',
 'is_capitalized': 0}

### Create train/test data

In [None]:
# Features come from the words alone (tags stripped); labels are the tag sequences.
X_train = list(map(sent2features, map(untag, train_sents)))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, map(untag, test_sents)))
y_test = list(map(sent2labels, test_sents))

## Training

In [None]:
%%time
# Create the CRFsuite trainer with verbose output disabled.
trainer = pycrfsuite.Trainer(verbose=False)

# Register every training sentence: its per-word feature dicts paired with its tag sequence.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 2.08 s, sys: 85.5 ms, total: 2.16 s
Wall time: 2.55 s


In [None]:
#@title Set model parameters

# NOTE(review): the form field stores the selected value as a string ("50"), not an int.
# The training cell in this notebook ran with it; confirm the trainer's handling
# before changing the type.
max_iterations = "50" #@param[50, 20, 100]

trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': max_iterations,

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [None]:
%%time
# Train on the appended sequences; 'mitrestaurant.crfsuite' is the model file
# that the tagger opens in the evaluation section.
trainer.train('mitrestaurant.crfsuite')

CPU times: user 13 s, sys: 209 ms, total: 13.2 s
Wall time: 15.1 s


## Evaluation

In [None]:
!pip install -q seqeval[cpu]

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


### Make Predictions

In [None]:
# Create a tagger and open the model file produced by trainer.train().
tagger = pycrfsuite.Tagger()
tagger.open('mitrestaurant.crfsuite')

<contextlib.closing at 0x7e00089b4820>

In [None]:
# Use the first test sentence as a worked example and display its gold (word, tag) pairs.
example_sent = test_sents[0]
example_sent

[('a', 'O'),
 ('four', 'B-Rating'),
 ('star', 'I-Rating'),
 ('restaurant', 'O'),
 ('with', 'B-Location'),
 ('a', 'I-Location'),
 ('bar', 'B-Amenity')]

In [None]:
# Print the model's tag sequence for the example sentence next to its gold tags.
print("Predicted:", ' '.join(tagger.tag(sent2features(untag(example_sent)))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

Predicted: O B-Rating I-Rating O O O B-Amenity
Correct:   O B-Rating I-Rating O B-Location I-Location B-Amenity


In [None]:
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 165 ms, sys: 2.03 ms, total: 168 ms
Wall time: 168 ms


In [None]:
# Imported here rather than in the top import cell because seqeval is only
# installed by the pip cell earlier in this section.
from seqeval.metrics import classification_report

# Print precision, recall, F1 and support for the predicted tags against the gold tags.
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

        Amenity       0.70      0.66      0.68       533
        Cuisine       0.84      0.83      0.83       532
           Dish       0.72      0.72      0.72       288
          Hours       0.71      0.66      0.69       212
       Location       0.82      0.80      0.81       812
          Price       0.81      0.80      0.81       171
         Rating       0.76      0.75      0.76       201
Restaurant_Name       0.80      0.73      0.77       402

      micro avg       0.78      0.75      0.77      3151
      macro avg       0.77      0.74      0.76      3151
   weighted avg       0.78      0.75      0.77      3151



# References

1. Datasets for Entity Recognition: https://github.com/juand-r/entity-recognition-datasets
2. [sklearn-crfsuite tutorial](https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system).
3. [Quick Recipe: Build a POS tagger using a Conditional Random Field](https://nlpforhackers.io/crf-pos-tagger/)
4. [NLP Guide: Identifying Part of Speech Tags using Conditional Random Fields](https://medium.com/analytics-vidhya/pos-tagging-using-conditional-random-fields-92077e5eaa31)
5. [CRFsuite - Tutorial on Chunking Task](http://www.chokkan.org/software/crfsuite/tutorial.html)