This work was done by Boshra AL-Sadder and AL-Rahma Sadder.

DA-NERB: Domain-aware named entity recognition model for booking queries

The ABC Dataset is a specialized dataset of reservation texts written in Arabic.

We propose a recognition engine for Arabic booking chatbots (DA-NERB) that can recognize reservation entities when different booking domains are considered by using state-of-the-art solutions to achieve high accuracy.

Also, we built the ABC dataset, as well as collected and labeled the Real dataset.

This model can be configured for seven domains: flights, hotels, cinemas, football matches, cars, restaurants and clinics.

Our work builds on the AraBERTv02-base model, a state-of-the-art model for solving NLP tasks (https://github.com/aub-mind/arabert).

This model was trained on the ABC dataset.

The proposed model achieved accuracy scores of 100% and 96.6% on the ABC dataset and the real dataset, respectively.


In [1]:
# Installing the required libraries (IPython shell commands, executed by the notebook kernel).
# The AraBERT repository is cloned locally because `arabert.preprocess` is imported from it below;
# openpyxl is the engine later passed to pd.read_excel.
!git clone https://github.com/aub-mind/arabert
!pip install openpyxl 
!pip install xlsxwriter
!pip install pytorchtools
!pip install transformers

# importing the required libraries
from arabert.preprocess import ArabertPreprocessor

import transformers
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn

import copy
import numpy as np
import pandas as pd
import xlsxwriter
from datetime import datetime
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit 
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,classification_report,confusion_matrix

# Training hyper-parameters
MAX_SEQ_LEN = 60         # every encoded sentence is truncated/padded to this many token ids
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 100             # upper bound on epochs; early stopping may end training sooner
LEARNING_RATE = 5e-5
WARMUP_RATIO = 0.1       # fraction of the total training steps used for LR warm-up
MAX_GRAD_NORM = 1.0      # gradient-norm clipping threshold

# Early-stopping state
PATIENCE = 5             # stop after this many consecutive epochs without a lower validation loss
min_val_loss = np.inf    # best validation loss so far (`np.Inf` alias no longer exists in NumPy 2.0)
epochs_no_improve = 0    # consecutive epochs without improvement

# DA-NERB is built on the AraBERTv0.2 base checkpoint; the same identifier is used for the
# tokenizer, the preprocessor and (later) the BERT encoder so that they stay consistent.
BASE_MODEL_PATH = 'aubmindlab/bert-base-arabertv02'
# WordPiece tokenizer for the checkpoint; do_lower_case=False leaves the input casing untouched.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BASE_MODEL_PATH,
    do_lower_case=False)

# AraBERT text preprocessor configured for the same checkpoint
# (its preprocess/unpreprocess methods are used when cleaning sentences).
arabert_prep = ArabertPreprocessor(model_name=BASE_MODEL_PATH)


Cloning into 'arabert'...
remote: Enumerating objects: 559, done.[K
remote: Counting objects: 100% (345/345), done.[K
remote: Compressing objects: 100% (247/247), done.[K
remote: Total 559 (delta 184), reused 246 (delta 92), pack-reused 214[K
Receiving objects: 100% (559/559), 9.16 MiB | 14.88 MiB/s, done.
Resolving deltas: 100% (307/307), done.
Collecting openpyxl
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
     |████████████████████████████████| 242 kB 937 kB/s            
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.9
Collecting xlsxwriter
  Downloading XlsxWriter-3.0.2-py3-none-any.whl (149 kB)
     |████████████████████████████████| 149 kB 938 kB/s            
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.0.2
Collecting pytorchtools
  Downloading pytorchtools-0.0

Downloading:   0%|          | 0.00/805k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

In [2]:
# Read the ABC dataset sheet (76,117 samples, 26 entity categories) into a DataFrame.
# Columns are consumed by position later on: 0 = sentence, 1 = tag sequence, 2 = domain id.
columns = pd.read_excel (r'../input/datasetgithub/ABC_Dataset.xlsx', sheet_name='ABC_Dataset' , engine = 'openpyxl')

# Preprocess 
def preprossingFun(Sentences, labels, Domains):
    """Clean the sentences and pair each one with its tags and domain.

    For every sentence the function
      1. removes the characters '(', ')' and '+',
      2. runs the text through ``arabert_prep.preprocess`` followed by
         ``arabert_prep.unpreprocess``,
      3. drops the single space that directly follows each ':' so that the
         colon and the following text form one whitespace-delimited token.

    Args:
        Sentences: sequence of raw sentence strings.
        labels: sequence of space-separated tag strings, aligned with ``Sentences``.
        Domains: sequence of domain identifiers, aligned with ``Sentences``.

    Returns:
        A list with one ``(tokens, tags, domain)`` tuple per sentence, where
        ``tokens`` and ``tags`` are lists obtained with ``str.split()``.

    Raises:
        IndexError: if ``labels`` or ``Domains`` is shorter than ``Sentences``.
    """
    dropped_chars = str.maketrans('', '', '()+')

    Sentences_Prep = []
    for sentence in Sentences:
        cleaned = sentence.translate(dropped_chars)
        normalised = arabert_prep.unpreprocess(arabert_prep.preprocess(cleaned))
        # Same effect as skipping one space after every ':', but safe when ':'
        # is the last character (the index-based version read one past the end).
        Sentences_Prep.append(normalised.replace(': ', ':'))

    print('Sentences_Prep ',len(Sentences_Prep))

    # Index access (rather than zip) keeps a length mismatch loud instead of
    # silently truncating the data.
    return [
        (sentence.split(), labels[i].split(), Domains[i])
        for i, sentence in enumerate(Sentences_Prep)
    ]

# Splitting the ABC Dataset (Train set, Val set, Test set) by row position:
# the first `test_samples` rows are the test set, the next `val_samples` rows the
# validation set, and every remaining row the training set.
train_samples=61654  # documents the expected train size; the train slice below takes "everything left"
test_samples=7612
val_samples=6851
samples = test_samples+val_samples  # index of the first training row

# Column 0 = sentence, column 1 = tag sequence, column 2 = domain id
Sentences =columns.values.T[0].tolist()
labels =columns.values.T[1].tolist()
Domains =columns.values.T[2].tolist()

# Test split (7612 rows)
Sentences_Test =Sentences[:test_samples]
labels_Test =labels[:test_samples]
Domains_Test =Domains[:test_samples]
data_test= preprossingFun(Sentences_Test, labels_Test, Domains_Test) 


# Validation split (6851 rows)
Sentences_Val =Sentences[test_samples: samples]
labels_Val =labels[test_samples: samples]
Domains_Val =Domains[test_samples: samples]
data_val= preprossingFun(Sentences_Val, labels_Val, Domains_Val) 


# Training split (remaining rows; 61654 for the full ABC sheet)
Sentences_Train = Sentences[samples:]
labels_Train =labels[samples:]
Domains_Train =Domains[samples:]
data_train= preprossingFun(Sentences_Train, labels_Train, Domains_Train) 

# Each instance in data_train / data_val / data_test is a tuple of
# (list of tokens, list of tags, domain identifier).


FileNotFoundError: [Errno 2] No such file or directory: '../input/datasetgithub/ABC_Dataset.xlsx'

In [None]:
# Distinct entity tags, in order of first appearance in the test split
# (the ABC dataset has 26 entity categories). The position of a tag in this
# list is the class id used by the model.
# NOTE(review): the list is derived from data_test only, so a tag that occurs
# only in another split would be missing from the label map - confirm.
label_list = list(dict.fromkeys(
    tag for sample in data_test for tag in sample[1]
))

# TOKENIZER.pad_token_id = 17029
# pad_token_label_id = -100


# Map-style dataset that encodes ABC samples as fixed-length PyTorch tensors
class NERDataset:
    """Encode (tokens, tags, domain) samples for the DA-NERB model.

    Each item is tokenised with the module-level ``TOKENIZER``, wrapped in the
    tokenizer's [CLS]/[SEP] tokens and padded (or truncated) to ``MAX_SEQ_LEN``.
    """

    def __init__(self, texts, tags, domains, label_list):
        self.texts = texts        # one list of words per sentence
        self.tags = tags          # one list of tags per sentence, aligned with the words
        self.domains = domains    # one domain id per sentence
        self.label_map = {name: idx for idx, name in enumerate(label_list)}
        # The cross-entropy ignore_index (-100) is used as the label of every
        # position that must not contribute to the loss.
        self.pad_token_label_id = nn.CrossEntropyLoss().ignore_index

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        words = self.texts[item]
        word_tags = self.tags[item]
        domain = self.domains[item]
        ignore_id = self.pad_token_label_id

        pieces = []
        piece_labels = []
        for word, label in zip(words, word_tags):
            sub_tokens = TOKENIZER.tokenize(word)
            if not sub_tokens:
                continue
            pieces += sub_tokens
            # Only the first sub-token of a word carries the real label id;
            # continuation pieces get the ignore id, e.g. two pieces -> [5, -100].
            piece_labels.append(self.label_map[label])
            piece_labels += [ignore_id] * (len(sub_tokens) - 1)

        # Leave room for the special tokens ([CLS] and [SEP]).
        budget = MAX_SEQ_LEN - TOKENIZER.num_special_tokens_to_add()
        if len(pieces) >= budget:
            pieces = pieces[:budget]
            piece_labels = piece_labels[:budget]

        # [CLS] sentence [SEP]; the special tokens are never scored.
        tokens = [TOKENIZER.cls_token, *pieces, TOKENIZER.sep_token]
        label_ids = [ignore_id, *piece_labels, ignore_id]

        input_ids = TOKENIZER.convert_tokens_to_ids(tokens)

        # Attention mask: 1 for real tokens, 0 for padding.
        padding_length = MAX_SEQ_LEN - len(input_ids)
        input_mask = [1] * len(input_ids) + [0] * padding_length
        # Single-sentence input, so every segment id is 0.
        segment_ids = [0] * len(tokens) + [0] * padding_length
        input_ids += [TOKENIZER.pad_token_id] * padding_length
        label_ids += [ignore_id] * padding_length

        # Domain ids in the data start at 1; subtracting 1 gives 0-based indices.
        domain_type = domain - 1

        assert len(input_ids) == MAX_SEQ_LEN
        assert len(input_mask) == MAX_SEQ_LEN
        assert len(segment_ids) == MAX_SEQ_LEN
        assert len(label_ids) == MAX_SEQ_LEN

        encoded = {
            'input_ids': input_ids,
            'input_mask': input_mask,
            'segment_ids': segment_ids,
            'label_ids': label_ids,
            'domain_type': domain_type,
        }
        return {
            key: torch.tensor(value, dtype=torch.long)
            for key, value in encoded.items()
        }

# calculate the loss
def loss_fn(output, target, mask, num_labels):
    """Token-level cross-entropy restricted to attended positions.

    Positions whose ``mask`` value is not 1 have their label replaced by the
    loss function's ``ignore_index`` so they do not contribute; positions
    already labelled with ``ignore_index`` in ``target`` are skipped as well.

    Args:
        output: logits, reshaped here to ``(-1, num_labels)``.
        target: label ids, flattened here to one dimension.
        mask: attention mask, flattened here to one dimension.
        num_labels: number of classes (size of the last logits dimension).

    Returns:
        The scalar loss tensor.
    """
    criterion = nn.CrossEntropyLoss()
    ignored = torch.tensor(criterion.ignore_index).type_as(target)
    is_real_token = mask.view(-1) == 1
    flat_labels = torch.where(is_real_token, target.view(-1), ignored)
    flat_logits = output.view(-1, num_labels)
    return criterion(flat_logits, flat_labels)

# build the model (DA-NERB)
class NERModel(nn.Module):
    """BERT token classifier whose per-token features include a domain embedding.

    The encoder's sequence output is passed through dropout, concatenated at
    every position with a learned embedding of the sentence's domain, and
    projected to ``num_tag`` logits.

    Args:
        num_tag: number of entity classes to predict.
        num_domains: number of distinct domain ids (default 7).
        domain_emb_dim: size of the domain embedding (default 8).
    """

    def __init__(self, num_tag, num_domains=7, domain_emb_dim=8):
        super().__init__()
        self.num_tag = num_tag
        self.bert = transformers.BertModel.from_pretrained(BASE_MODEL_PATH)
        self.bert_drop = nn.Dropout(0.3)
        # Classifier input = encoder hidden size + domain embedding size
        # (768 + 8 = 776 for the base checkpoint), derived rather than hard-coded.
        # The layers are created in the original order (classifier, then
        # embedding) so parameter order and initialisation order are unchanged.
        self.out_tag = nn.Linear(
            self.bert.config.hidden_size + domain_emb_dim, self.num_tag
        )
        self.domain = nn.Embedding(num_domains, domain_emb_dim)

    def forward(self, input_ids, input_mask, segment_ids, label_ids, domain_type):
        """Return ``(logits, loss)`` for one batch.

        Args:
            input_ids, input_mask, segment_ids: encoder inputs.
            label_ids: gold label ids used to compute the loss.
            domain_type: 0-based domain index, one per sentence.
        """
        # With return_dict=False the encoder returns a tuple; only the
        # per-token sequence output is used, the second element is discarded.
        sequence_output, _ = self.bert(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=segment_ids,
            return_dict=False,
        )
        token_repr = self.bert_drop(sequence_output)

        # Repeat the sentence-level domain vector for every token position.
        # The sequence length is read from the input instead of assuming 60.
        domain_repr = self.domain(domain_type).unsqueeze(1).expand(
            -1, token_repr.size(1), -1
        )

        logits = self.out_tag(torch.cat((token_repr, domain_repr), dim=2))
        loss_tag = loss_fn(logits, label_ids, input_mask, self.num_tag)
        return logits, loss_tag


# Reduce the dimensions of the output
# by selecting the classes with the highest score and removing the ignored positions
def align_predictions(predictions, label_ids, labels=None):
    """Convert logits and gold ids into two flat, aligned lists of label names.

    Args:
        predictions: array of scores with the class dimension on axis 2.
        label_ids: array of gold label ids with the same leading two
            dimensions; positions equal to the cross-entropy ``ignore_index``
            (-100) are dropped from both outputs.
        labels: optional sequence mapping class id -> label name. Defaults to
            the module-level ``label_list``.

    Returns:
        ``(preds_list, out_label_list)``: predicted and gold label names for
        every kept position, in row-major order.
    """
    if labels is None:
        labels = label_list
    id_to_label = dict(enumerate(labels))

    # Looked up once instead of building a loss module for every element.
    ignore_index = nn.CrossEntropyLoss().ignore_index

    pred_ids = np.argmax(predictions, axis=2)
    gold_ids = np.asarray(label_ids)

    # Boolean indexing walks the arrays in row-major order, i.e. the same
    # order as a nested batch/sequence loop.
    keep = gold_ids != ignore_index
    out_label_list = [id_to_label[i] for i in gold_ids[keep].tolist()]
    preds_list = [id_to_label[i] for i in pred_ids[keep].tolist()]

    return preds_list, out_label_list
    

# Computing the metrics
def compute_metrics(predictions,label_ids):
    """Print a per-class report and return accuracy plus macro-averaged scores.

    Args:
        predictions: model scores, forwarded to ``align_predictions``.
        label_ids: gold label ids, forwarded to ``align_predictions``.

    Returns:
        Dict with the keys ``accuracy_score``, ``precision``, ``recall`` and
        ``f1`` (the last three macro-averaged over classes).
    """
    y_pred, y_true = align_predictions(predictions, label_ids)
    print(classification_report(y_true, y_pred, digits=4))

    scores = {}
    scores["accuracy_score"] = accuracy_score(y_true, y_pred)
    scores["precision"] = precision_score(y_true, y_pred, average='macro')
    scores["recall"] = recall_score(y_true, y_pred, average='macro')
    scores["f1"] = f1_score(y_true, y_pred, average='macro')
    return scores
    

# validate the model
def eval_fn(data_loader, model, device):
    """Run the model over ``data_loader`` without gradients and score it.

    Args:
        data_loader: iterable of batches; each batch is a dict of tensors whose
            keys match the keyword arguments of ``model`` and include
            ``'label_ids'``.
        model: callable returning ``(logits, loss)`` for a batch.
        device: device the batch tensors are moved to.

    Returns:
        ``(metrics, mean_loss)`` where ``metrics`` is the dict produced by
        ``compute_metrics`` and ``mean_loss`` is the loss averaged over batches.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model.eval()  # switch dropout etc. to evaluation behaviour
    total_loss = 0.0
    logit_batches = []
    label_batches = []

    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            batch = {k: v.to(device) for k, v in data.items()}
            logits, loss = model(**batch)
            total_loss += loss.item()
            # Keep results on the CPU and concatenate once at the end instead
            # of re-copying an ever-growing device tensor on every batch.
            logit_batches.append(logits.cpu())
            label_batches.append(batch['label_ids'].cpu())

    if not logit_batches:
        raise ValueError("data_loader yielded no batches")

    preds = torch.cat(logit_batches, dim=0).numpy()
    labels = torch.cat(label_batches, dim=0).numpy()

    return compute_metrics(preds, labels), total_loss / len(data_loader)


# train the model
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader`` and score it.

    For every batch: zero the gradients, run the forward pass, back-propagate,
    clip the gradient norm to ``MAX_GRAD_NORM``, then step the optimizer and
    the scheduler.

    Args:
        data_loader: iterable of batches; each batch is a dict of tensors whose
            keys match the keyword arguments of ``model`` and include
            ``'label_ids'``.
        model: callable returning ``(logits, loss)`` for a batch.
        optimizer: optimizer updating the model parameters.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler, stepped once per batch.

    Returns:
        ``(metrics, mean_loss)`` where ``metrics`` is the dict produced by
        ``compute_metrics`` on the training predictions and ``mean_loss`` is the
        loss averaged over batches.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model.train()  # enable training behaviour (e.g. dropout)
    total_loss = 0.0
    logit_batches = []
    label_batches = []

    for data in tqdm(data_loader, total=len(data_loader)):
        batch = {k: v.to(device) for k, v in data.items()}

        optimizer.zero_grad()
        logits, loss = model(**batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

        # Detach before storing so the accumulated logits do not keep autograd
        # history alive, and park them on the CPU instead of growing a device
        # tensor with torch.cat on every step.
        logit_batches.append(logits.detach().cpu())
        label_batches.append(batch['label_ids'].cpu())

    if not logit_batches:
        raise ValueError("data_loader yielded no batches")

    preds = torch.cat(logit_batches, dim=0).numpy()
    labels = torch.cat(label_batches, dim=0).numpy()

    return compute_metrics(preds, labels), total_loss / len(data_loader)



# Select the compute device: the CUDA GPU when one is available, otherwise the CPU.
# `device` is used afterwards for the model and for every batch.
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")  

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    # IPython shell command: print the driver/GPU status table
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


In [None]:
# Create object of the dataset class (For training data) 
# Each sample tuple is (tokens, tags, domain), hence the x[0] / x[1] / x[2] projections.
train_dataset = NERDataset(
    texts= [x[0] for x in data_train],
    tags = [x[1] for x in data_train],
    domains = [x[2] for x in data_train],
    label_list = label_list)

# Implementing dataloader on the dataset
# NOTE(review): `shuffle` is not set, so training batches follow the row order of
# the sheet in every epoch - confirm this is intended.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=2 )

# Create object of the dataset class (For validating data)
dev_dataset = NERDataset(
    texts= [x[0] for x in data_val],
    tags = [x[1] for x in data_val],
    domains = [x[2] for x in data_val],
    label_list = label_list)

# Implementing dataloader on the dataset
dev_dataloader = torch.utils.data.DataLoader(
    dataset=dev_dataset,
    batch_size=VALID_BATCH_SIZE,
    num_workers=1 )

# One output class per entry of label_list
num_tag = len(label_list)
model = NERModel(num_tag=num_tag)
model.to(device)
print('MODEL LOADED!')

# Two parameter groups: weight decay 0.01 for everything except parameters whose
# name contains one of the `no_decay` substrings (biases and LayerNorm parameters),
# which get no weight decay.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
  {
      "params": [
          p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
      ],
      "weight_decay": 0.01,
  },
  {
      "params": [
          p for n, p in param_optimizer if any(nd in n for nd in no_decay)
      ],
      "weight_decay": 0.0,
  },
]

# Total optimisation steps assumed by the scheduler if all EPOCHS are run.
# NOTE(review): this truncates samples/batch_size * EPOCHS, whereas the loader yields
# a final partial batch each epoch, so len(train_dataloader) * EPOCHS is slightly
# larger - confirm whether the difference matters.
num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)
print('Number of training steps: ', num_train_steps)
optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)
# Linear warm-up over the first WARMUP_RATIO of the steps, then linear decay
scheduler = get_linear_schedule_with_warmup(
  optimizer, num_warmup_steps=int(WARMUP_RATIO*num_train_steps), num_training_steps=num_train_steps
)


# Training loop with early stopping on the validation loss
training_start_time = datetime.now()
print('training_start_time: ',training_start_time.strftime("%Y-%m-%d %H:%M:%S"))

# Per-epoch history, used afterwards for plotting
Train_acc = []
Valid_acc = []
Train_loss = []
Valid_loss = []

for epoch in range(1, EPOCHS + 1):
    print('epoch :', epoch)
    # One pass over the training data, then one over the validation data
    train_metrics, train_loss = train_fn(train_dataloader, model, optimizer, device, scheduler)
    eval_metrics,eval_loss = eval_fn(dev_dataloader, model, device)  
    
    print(f"Train Loss = {train_loss} Valid Loss = {eval_loss}")
    print("eval_metrics" , eval_metrics)
    print("train_metrics" , train_metrics)
   
    Train_loss.extend([train_loss])
    Valid_loss.extend([eval_loss])
    Train_acc.extend([train_metrics['accuracy_score']])
    Valid_acc.extend([eval_metrics['accuracy_score']])

    # Early stopping: whenever the validation loss reaches a new minimum, reset the
    # counter and checkpoint the current weights to ./model.pt; otherwise count the
    # epoch as "no improvement".
    if eval_loss < min_val_loss: 
        epochs_no_improve = 0
        best_model_state = copy.deepcopy(model.state_dict())
        torch.save(best_model_state, './model.pt')
        min_val_loss = eval_loss
    else:
        epochs_no_improve += 1
    # Stop once PATIENCE consecutive epochs have passed without improvement
    if epochs_no_improve == PATIENCE: 
        print('Early stopping!' )
        break
    print('epochs_no_improve ', epochs_no_improve)
    print('min_val_loss ', min_val_loss)


training_end_time = datetime.now()
print('Training end time: ',training_end_time.strftime("%Y-%m-%d %H:%M:%S"))   

# Wall-clock training duration in minutes
d = training_end_time - training_start_time
print('Training time duration: ', d.total_seconds()/ 60.0, 'mins')

In [None]:
# Plot the per-epoch training and validation loss and save the figure to loss_plot.png
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10,8))
# Epochs are numbered from 1 on the x axis
plt.plot(range(1,len(Train_loss)+1),Train_loss, label='Training Loss')
plt.plot(range(1,len(Valid_loss)+1),Valid_loss,label='Validation Loss')

# Epoch (1-based) with the lowest validation loss, i.e. the epoch whose weights
# were last written to the checkpoint
minposs = Valid_loss.index(min(Valid_loss))+1 
plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')


plt.title('Training and validation loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.grid(True)
plt.show()
fig.savefig('loss_plot.png', bbox_inches='tight')

In [None]:
# Re-create the architecture and restore the checkpoint saved during training
num_tag = len(label_list)
model = NERModel(num_tag=num_tag)
# map_location=device remaps the stored tensors onto the selected device, so a
# checkpoint written on a GPU also loads on a CPU-only machine (and vice versa)
# without editing this cell.
model.load_state_dict(torch.load('./model.pt', map_location=device))
model.to(device)
print('MODEL LOADED!')

In [None]:
# Evaluate the restored model on the ABC test split (data_test)
# Create object of the dataset class (For testing data) 
test_dataset = NERDataset(
    texts= [x[0] for x in data_test],    
    tags = [x[1] for x in data_test],    
    domains = [x[2] for x in data_test],
    label_list = label_list,)

# Implementing dataloader on the dataset
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=VALID_BATCH_SIZE,
    num_workers=1
)

# eval_fn returns (metrics dict, mean loss per batch)
test_metrics,test_loss= eval_fn(test_dataloader, model, device)
print(f"Test Loss = {test_loss}")
print('test_metrics ',test_metrics)

In [None]:
# Load the Real dataset sheet from the same workbook.
# NOTE: this rebinds `columns`, which previously held the ABC sheet.
columns = pd.read_excel (r'../input/datasetgithub/ABC_Dataset.xlsx', sheet_name='Real_Dataset' , engine = 'openpyxl')

# In this sheet the sentence is column 0, the tag sequence column 3 and the
# domain id column 4 (a different layout from the ABC sheet).
Sentences_RealDataset =columns.values.T[0].tolist()
labels_RealDataset =columns.values.T[3].tolist()
Domains_RealDataset =columns.values.T[4].tolist()

# Same cleaning and (tokens, tags, domain) packing as for the ABC splits
RealDataset= preprossingFun(Sentences_RealDataset, labels_RealDataset, Domains_RealDataset)

In [None]:
# Evaluate the model on the Real dataset
# Create object of the dataset class (For the Real Dataset) 
real_dataset = NERDataset(
    texts= [x[0] for x in RealDataset],    
    tags = [x[1] for x in RealDataset],    
    domains = [x[2] for x in RealDataset],
    label_list = label_list)

# Implementing dataloader on the dataset
real_dataloader = torch.utils.data.DataLoader(
    dataset=real_dataset,
    batch_size=VALID_BATCH_SIZE,
    num_workers=1
)

# eval_fn returns (metrics dict, mean loss per batch)
real_data_metrics,real_data_loss= eval_fn(real_dataloader, model, device)
print(f"Real_data Loss = {real_data_loss}")
print('Real_data metrics ',real_data_metrics)


In [None]:
# The post-processing function uses two predefined sets for each domain:
# a set with all categories that may be used in the domain
# and a set with all categories that are required for the domain.

# Categories that belong to each domain
AirlineTicket_Domain_Entity={0,8 ,16, 17, 1, 7, 23, 3 ,5, 2, 12, 10, 9, 4, 6}
Hotel_Domain_Entity={0,16,17,1, 23, 3,5, 12, 10,  9, 4, 6}
Cinema_Domain_Entity={0,20,19,7,3,12,10,9,4,6}
Football_Domain_Entity={0,24,15,13,14,1,7,3,12,10,9,4,6}
Car_Domain_Entity={0,21,22,23,3,5,12,10,4,6}
Restaurant_Domain_Entity={0,18,3,12,10,9,4,6}
Clinic_Domain_Entity={0,11,25,3,12,10,4,6}

# The required categories for each domain
AirlineTicket_Domain_Required_Entity={8,1,7,3,2,9,4}
Hotel_Domain_Required_Entity={16,17,3,5,9,4,6}
Cinema_Domain_Required_Entity={20,19,7,3,9,4}
Football_Domain_Required_Entity={13,14,7,3,9,4}
Car_Domain_Required_Entity={21,22,3,5,4,6}
Restaurant_Domain_Required_Entity={18,3,9,4}
Clinic_Domain_Required_Entity={11,25,3,12,10,4}

# Domain id -> (allowed categories, required categories)
_DOMAIN_ENTITY_SETS = {
    1: (AirlineTicket_Domain_Entity, AirlineTicket_Domain_Required_Entity),
    2: (Hotel_Domain_Entity, Hotel_Domain_Required_Entity),
    3: (Cinema_Domain_Entity, Cinema_Domain_Required_Entity),
    4: (Football_Domain_Entity, Football_Domain_Required_Entity),
    5: (Car_Domain_Entity, Car_Domain_Required_Entity),
    6: (Restaurant_Domain_Entity, Restaurant_Domain_Required_Entity),
    7: (Clinic_Domain_Entity, Clinic_Domain_Required_Entity),
}

# ................Post-Processing Function...........
# Improves the predictions of the DA-NERB model
# by replacing unexpected tags with the 0 (others) tag depending on the domain type.

def Post_Processing (seq,domain):
    """Filter a predicted tag sequence by domain and list the missing required tags.

    Args:
        seq: whitespace-separated string of integer category ids.
        domain: domain id, 1..7 (1 airline ticket, 2 hotel, 3 cinema,
            4 football, 5 car, 6 restaurant, 7 clinic).

    Returns:
        ``(new_seq, Requested_Entities)``:
          * ``new_seq`` - list of tag strings in which every tag that is not
            allowed in the domain has been replaced by ``"0"``;
          * ``Requested_Entities`` - list of required category ids (as strings)
            that do not occur in ``new_seq``.
        For a domain id outside 1..7 both lists are empty.

    Raises:
        ValueError: if a tag in ``seq`` is not an integer literal (known domains only).
    """
    entity_sets = _DOMAIN_ENTITY_SETS.get(domain)
    if entity_sets is None:
        # Unknown domain: nothing is emitted and nothing is requested.
        return [], []
    allowed, required = entity_sets

    new_seq = [tag if int(tag) in allowed else "0" for tag in seq.split()]

    present = set(new_seq)
    Requested_Entities = [
        str(category) for category in required if str(category) not in present
    ]

    return new_seq, Requested_Entities