In [1]:
import os
import torch
import pandas as pd
import evaluate
import numpy as np
import seaborn as sns
import time
import wandb
import warnings
import requests
import json
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig, TrainingArguments, Trainer, pipeline
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from datetime import datetime
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

## Restaurant Search NER Recognition By Fine Tuning DistilBERT

## The Dataset
### MIT Restaurant Dataset

In [2]:
warnings.filterwarnings("ignore") #Don't do in production

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# train = pd.read_csv("Data/train.bio", sep="\t", header=None)
# test = pd.read_csv("Data/test.bio", sep="\t", header=None)
# train.head()

### Read as generic text file

In [3]:
response = requests.get('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/train.bio')
response = response.text


In [4]:
response = response.splitlines()

In [5]:
temp_tokens = []
temp_tags = []
train_tokens = []
train_tags = []

for line in response:
    if line != "":
        tag, token = line.strip().split("\t")
        temp_tags.append(tag)
        temp_tokens.append(token)
    else:
        train_tokens.append(temp_tokens)
        train_tags.append(temp_tags)

        temp_tokens, temp_tags = [], []

In [None]:
train_tokens

In [6]:
len(train_tokens), len(train_tags)

(7659, 7659)

## Hugging Face Dataset Prep

In [7]:
df = pd.DataFrame({'tokens': train_tokens, 'ner_tags_str': train_tags})
dataset = Dataset.from_pandas(df)

dataset = DatasetDict({'train': dataset})

In [11]:
response = requests.get('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/test.bio')
response = response.text
response = response.splitlines()

In [12]:
temp_tokens = []
temp_tags = []
test_tokens = []
test_tags = []

for line in response:
    if line != "":
        tag, token = line.strip().split("\t")
        temp_tags.append(tag)
        temp_tokens.append(token)
    else:
        test_tokens.append(temp_tokens)
        test_tags.append(temp_tags)

        temp_tokens, temp_tags = [], []

In [13]:
len(test_tokens), len(test_tags)

(1520, 1520)

In [18]:
df = pd.DataFrame({'tokens': train_tokens, 'ner_tags_str': train_tags})
train = Dataset.from_pandas(df)

df = pd.DataFrame({'tokens': test_tokens, 'ner_tags_str': test_tags})
test = Dataset.from_pandas(df)

dataset = DatasetDict({'train': train, 'test': test, 'validation': test})

In [19]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
})

In [20]:
dataset['train'][0]

{'tokens': ['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
 'ner_tags_str': ['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity']}

In [26]:
unique_tags = set()
for tag in dataset['train']['ner_tags_str']:
    unique_tags.update(tag)
 
#unique_tags   
unique_tags = list(set([x[2:] for x in list(unique_tags) if x != 'O']))

['Price',
 'Cuisine',
 'Amenity',
 'Rating',
 'Location',
 'Restaurant_Name',
 'Hours',
 'Dish']

In [27]:
tag2index = {"O": 0}
for i, tag in enumerate(unique_tags):
    tag2index[f'B-{tag}'] = len(tag2index)
    tag2index[f'I-{tag}'] = len(tag2index)
    
tag2index

{'O': 0,
 'B-Price': 1,
 'I-Price': 2,
 'B-Cuisine': 3,
 'I-Cuisine': 4,
 'B-Amenity': 5,
 'I-Amenity': 6,
 'B-Rating': 7,
 'I-Rating': 8,
 'B-Location': 9,
 'I-Location': 10,
 'B-Restaurant_Name': 11,
 'I-Restaurant_Name': 12,
 'B-Hours': 13,
 'I-Hours': 14,
 'B-Dish': 15,
 'I-Dish': 16}

In [None]:
index2tag = 