In [None]:
!pip install torch torchvision torchaudio
!pip install transformers

# The implementations of 4 pipelines with models from HuggingFace library
### List of models consists of base BERT, BERT uncased, RoBERTa, and Electra
### Since none of these models are trained to recognise dates, the datefinder library is used to accomodate that
### The models can also be found in ner_models.py

In [1]:
import re
import torch
import datefinder
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from scripts import load_test_data, test_n_samples, convert_to_standard_date

user_requests, required_data = load_test_data()

# Bert base NER

In [2]:
class HfBERTModel():
    def __init__(self):
        tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")        
        self.model = pipeline("ner", model=model, tokenizer=tokenizer)

    def extract_flight_details(self, user_request):
        # Initializing base variables
        name = 'Unspecified'
        departure = 'Unspecified'
        destination = 'Unspecified'
        date = 'Unspecified'
        
        ner_results = self.model(user_request)
        
        departure_end = None
        destination_end = None
        
        for entity in ner_results:           
            if entity['entity'] == 'B-PER' or entity['entity'] == 'I-PER':
                if name == 'Unspecified':
                    name = entity['word']
                else:
                    name += " " + entity['word']
                    
            elif entity['entity'] == 'B-LOC' or entity['entity'] == 'I-LOC':
                if departure == 'Unspecified':
                    if ' from ' in user_request[:entity['start']]:
                        departure = entity['word']
                        departure_end = entity['end']
                    elif ' to ' in user_request[:entity['start']]:
                        destination = entity['word']
                        destination_end = entity['end']
                        
                elif departure_end is not None and entity['start'] == departure_end+1:
                        departure += ' '+ entity['word']
                
                elif destination == 'Unspecified':
                    destination = entity['word']
                    destination_end = entity['end']

                elif destination_end is not None and entity['start'] == destination_end+1:
                        destination += ' '+ entity['word']
                else:
                    pass

        matches = datefinder.find_dates(user_request)
        match = next(matches, None)
        date = match.strftime("%d-%m-%Y") if match else 'Unspecified'
            
            
        return name.title(), departure.title(), destination.title(), date



In [3]:
model = HfBERTModel()

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
test_n_samples(model, user_requests, required_data, 5)

Name: Olivia Parker, True_Name: Olivia Parker
Departure: Barcelona, True_Departure: Barcelona
Destination: Amsterdam, True_Destination: Amsterdam
Date: 20-05-2024, True_Date: 20-05-2024
---------------------------------------------
Name: Henry Wright, True_Name: Henry Wright
Departure: Seoul, True_Departure: Seoul
Destination: Sydney, True_Destination: Sydney
Date: 08-09-2024, True_Date: 08-09-2024
---------------------------------------------
Name: Lily Johnson, True_Name: Lily Johnson
Departure: Rome, True_Departure: Rome
Destination: Paris, True_Destination: Paris
Date: 12-07-2024, True_Date: 12-07-2024
---------------------------------------------
Name: Lucas Thompson, True_Name: Lucas Thompson
Departure: New Delhi, True_Departure: New Delhi
Destination: Dubai, True_Destination: Dubai
Date: 05-10-2024, True_Date: 05-10-2024
---------------------------------------------
Name: Isabelle Brown, True_Name: Isabelle Brown
Departure: Tokyo, True_Departure: Tokyo
Destination: London, True_

# BERT base uncased

In [5]:
class HfBERTUncasedModel():
    def __init__(self):
        tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
        model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")        
        self.model = pipeline("ner", model=model, tokenizer=tokenizer)

    def extract_flight_details(self, user_request):
        # Initializing base variables
        name = 'Unspecified'
        departure = 'Unspecified'
        destination = 'Unspecified'
        date = 'Unspecified'
        
        ner_results = self.model(user_request)
        
        departure_end = None
        destination_end = None
        
        for entity in ner_results:           
            if entity['entity'] == 'B-PER' or entity['entity'] == 'I-PER':
                if name == 'Unspecified':
                    name = entity['word']
                else:
                    name += " " + entity['word']
                    
            elif entity['entity'] == 'B-LOC' or entity['entity'] == 'I-LOC':
                if departure == 'Unspecified':
                    if ' from ' in user_request[:entity['start']]:
                        departure = entity['word']
                        departure_end = entity['end']
                    elif ' to ' in user_request[:entity['start']]:
                        destination = entity['word']
                        destination_end = entity['end']
                        
                elif departure_end is not None and entity['start'] == departure_end+1:
                        departure += ' '+ entity['word']
                
                elif destination == 'Unspecified':
                    destination = entity['word']
                    destination_end = entity['end']

                elif destination_end is not None and entity['start'] == destination_end+1:
                        destination += ' '+ entity['word']
                else:
                    pass

        matches = datefinder.find_dates(user_request)
        match = next(matches, None)
        date = match.strftime("%d-%m-%Y") if match else 'Unspecified'
            
        return name.title(), departure.title(), destination.title(), date



In [6]:
model = HfBERTModel()

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
test_n_samples(model, user_requests, required_data, 5)

Name: Olivia Parker, True_Name: Olivia Parker
Departure: Barcelona, True_Departure: Barcelona
Destination: Amsterdam, True_Destination: Amsterdam
Date: 20-05-2024, True_Date: 20-05-2024
---------------------------------------------
Name: Henry Wright, True_Name: Henry Wright
Departure: Seoul, True_Departure: Seoul
Destination: Sydney, True_Destination: Sydney
Date: 08-09-2024, True_Date: 08-09-2024
---------------------------------------------
Name: Lily Johnson, True_Name: Lily Johnson
Departure: Rome, True_Departure: Rome
Destination: Paris, True_Destination: Paris
Date: 12-07-2024, True_Date: 12-07-2024
---------------------------------------------
Name: Lucas Thompson, True_Name: Lucas Thompson
Departure: New Delhi, True_Departure: New Delhi
Destination: Dubai, True_Destination: Dubai
Date: 05-10-2024, True_Date: 05-10-2024
---------------------------------------------
Name: Isabelle Brown, True_Name: Isabelle Brown
Departure: Tokyo, True_Departure: Tokyo
Destination: London, True_

# RoBERTa Large

In [8]:
class HfRoBERTaModel():
    def __init__(self):
        tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
        model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
        self.model = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
        
    def extract_flight_details(self, user_request):
        # Initializing base variables
        name = 'Unspecified'
        departure = 'Unspecified'
        destination = 'Unspecified'
        date = 'Unspecified'
        
        ner_results = self.model(user_request)

        for entity in ner_results:
            if entity['entity_group'] == 'PER':
                if name == 'Unspecified':
                    name = entity['word']
                    
            elif entity['entity_group'] in ['LOC']:
                if departure == 'Unspecified':
                    if ' from ' in user_request[:entity['start']]:
                        departure = entity['word']
                    elif ' to ' in user_request[:entity['start']]:
                        destination = entity['word']
                elif destination == 'Unspecified':
                    destination = entity['word']
        
        matches = datefinder.find_dates(user_request)
        match = next(matches, None)
        date = match.strftime("%d-%m-%Y") if match else 'Unspecified'
        
        return name.title().strip(), departure.title().strip(), destination.title().strip(), date


In [9]:
model = HfRoBERTaModel()

In [10]:
test_n_samples(model, user_requests, required_data, 5)

Name: Olivia Parker, True_Name: Olivia Parker
Departure: Barcelona, True_Departure: Barcelona
Destination: Amsterdam, True_Destination: Amsterdam
Date: 20-05-2024, True_Date: 20-05-2024
---------------------------------------------
Name: Henry Wright, True_Name: Henry Wright
Departure: Seoul, True_Departure: Seoul
Destination: Sydney, True_Destination: Sydney
Date: 08-09-2024, True_Date: 08-09-2024
---------------------------------------------
Name: Lily Johnson, True_Name: Lily Johnson
Departure: Rome, True_Departure: Rome
Destination: Paris, True_Destination: Paris
Date: 12-07-2024, True_Date: 12-07-2024
---------------------------------------------
Name: Lucas Thompson, True_Name: Lucas Thompson
Departure: New Delhi, True_Departure: New Delhi
Destination: Dubai, True_Destination: Dubai
Date: 05-10-2024, True_Date: 05-10-2024
---------------------------------------------
Name: Isabelle Brown, True_Name: Isabelle Brown
Departure: Tokyo, True_Departure: Tokyo
Destination: London, True_

# Electra model with large discriminator

In [11]:
class HfElectraModel():
    def __init__(self):
        self.model = pipeline("ner", model="dbmdz/electra-large-discriminator-finetuned-conll03-english", grouped_entities=True)

    def extract_flight_details(self, user_request):
        # Initializing base variables
        name = 'Unspecified'
        departure = 'Unspecified'
        destination = 'Unspecified'
        date = 'Unspecified'
        
        ner_results = self.model(user_request)

        for entity in ner_results:
            if entity['entity_group'] == 'PER':
                if name == 'Unspecified':
                    name = entity['word']
                    
            elif entity['entity_group'] in ['LOC']:
                if departure == 'Unspecified':
                    if ' from ' in user_request[:entity['start']]:
                        departure = entity['word']
                    elif ' to ' in user_request[:entity['start']]:
                        destination = entity['word']
                elif destination == 'Unspecified':
                    destination = entity['word']

        matches = datefinder.find_dates(user_request)
        match = next(matches, None)
        date = match.strftime("%d-%m-%Y") if match else 'Unspecified'
            
        return name.title(), departure.title(), destination.title(), date

In [12]:
user_requests, required_data = load_test_data()

In [13]:
model = HfElectraModel()



In [14]:
test_n_samples(model, user_requests, required_data, 5)

Name: Olivia Parker, True_Name: Olivia Parker
Departure: Barcelona, True_Departure: Barcelona
Destination: Amsterdam, True_Destination: Amsterdam
Date: 20-05-2024, True_Date: 20-05-2024
---------------------------------------------
Name: Henry Wright, True_Name: Henry Wright
Departure: Seoul, True_Departure: Seoul
Destination: Sydney, True_Destination: Sydney
Date: 08-09-2024, True_Date: 08-09-2024
---------------------------------------------
Name: Lily Johnson, True_Name: Lily Johnson
Departure: Rome, True_Departure: Rome
Destination: Paris, True_Destination: Paris
Date: 12-07-2024, True_Date: 12-07-2024
---------------------------------------------
Name: Lucas Thompson, True_Name: Lucas Thompson
Departure: New Delhi, True_Departure: New Delhi
Destination: Dubai, True_Destination: Dubai
Date: 05-10-2024, True_Date: 05-10-2024
---------------------------------------------
Name: Isabelle Brown, True_Name: Isabelle Brown
Departure: Tokyo, True_Departure: Tokyo
Destination: London, True_