This notebook takes the CONLL2003 dataset using deepavlov, and creates templates (utterances with placeholders) for a PII synthetic data generator to use in order to create new sentences.

The notebook additionally introduces two new entities: TITLE and ROLE, in order to overcome cases like "UK David Scott called his wife", where the original sentence is "UK Prime Minister Boris Johnson called his wife" as "Prime Minister" was originally tagged as PER in the original dataset. Same logic goes for titles, like Mr., Mrs., Ms.

In [1]:
import pandas as pd
pd.options.display.max_rows = 4000
pd.set_option('display.max_colwidth', -1)
from deeppavlov.dataset_readers.conll2003_reader import Conll2003DatasetReader

In [2]:
import numpy as np

In [3]:
reader = Conll2003DatasetReader()
dataset = reader.read(data_path ="../../data",dataset_name='conll2003')
#Note: make sure you haven't downloaded something else with this function before, 
# as it will not download a new dataset (even if your previous download was for a different dataset)

In [4]:
dataset['train'][12]

(['Only',
  'France',
  'and',
  'Britain',
  'backed',
  'Fischler',
  "'s",
  'proposal',
  '.'],
 ['O', 'B-LOC', 'O', 'B-LOC', 'O', 'B-PER', 'O', 'O', 'O'])

### To pandas + add sentence_idx

In [5]:
new_dataset = [list(zip(a,b)) for a,b in dataset['train']]
df_list = []
sentence_id = 0
for sentence in new_dataset:
   
    df = pd.DataFrame(sentence,columns = ["word","tag"])
    df["sentence_idx"] = sentence_id
    sentence_id+=1
    df_list.append(df)
ner_dataset = pd.concat(df_list)


In [6]:
sentences = ner_dataset.groupby('sentence_idx')['word'].apply(lambda x: " ".join(x))

In [7]:
print(sentences[12])

Only France and Britain backed Fischler 's proposal .


#### Example sentence:

In [8]:
ner_dataset[ner_dataset['sentence_idx']==12]

Unnamed: 0,word,tag,sentence_idx
0,Only,O,12
1,France,B-LOC,12
2,and,O,12
3,Britain,B-LOC,12
4,backed,O,12
5,Fischler,B-PER,12
6,'s,O,12
7,proposal,O,12
8,.,O,12


In [9]:
# Unique entities
ner_dataset['tag'].unique()

array(['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG',
       'I-MISC', 'I-LOC'], dtype=object)

Replace tokenization replacements

In [10]:
ner_dataset['word'] = ner_dataset['word']\
.replace('-LRB-','(')\
.replace('-RRB-',')')\
.replace('-LCB-','(')\
.replace('-RCB-',')')\
.replace('``','"')\
.replace("''",'"')\
.replace('/.','.')

In [11]:
# helper columns:
ner_dataset['prev-word'] = ner_dataset.word.shift(1)
ner_dataset['prev-prev-word'] = ner_dataset['word'].shift(2)
ner_dataset['next-word'] = ner_dataset['word'].shift(-1)
ner_dataset['next-next-word'] = ner_dataset['word'].shift(-2)
ner_dataset['prev-tag'] = ner_dataset['tag'].shift(1)
ner_dataset['next-tag'] = ner_dataset['tag'].shift(-1)

In [64]:
ner_dataset[ner_dataset['sentence_idx']==900]

Unnamed: 0,word,tag,sentence_idx,prev-word,prev-prev-word,next-word,next-next-word,prev-tag,next-tag,metadata
0,But,O,900,.,year,new,coach,O,O,
1,new,O,900,But,.,coach,Rolf,O,O,
2,coach,O,900,new,But,Rolf,Fringer,O,B-PER,
3,Rolf,B-PER,900,coach,new,Fringer,is,O,I-PER,
4,Fringer,I-PER,900,Rolf,coach,is,clearly,B-PER,O,
5,is,O,900,Fringer,Rolf,clearly,a,I-PER,O,
6,clearly,O,900,is,Fringer,a,Knup,O,O,
7,a,O,900,clearly,is,Knup,fan,O,B-PER,
8,Knup,B-PER,900,a,clearly,fan,and,O,O,
9,fan,O,900,Knup,a,and,included,B-PER,O,


In [13]:
np.unique(ner_dataset['tag'])

array(['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG',
       'I-PER', 'O'], dtype=object)

#### Remove unneeded (non PII) entities:

In [14]:
TAGS_TO_IGNORE = ['CARDINAL','FAC','LAW','LANGUAGE','MISC','TIME','DATE','ORDINAL','EVENT','QUANTITY','WORK_OF_ART','MONEY','PRODUCT','PERCENT']
def remote_unwanted_tags(x):
    if len(x)>1 and x[2:] in TAGS_TO_IGNORE:
        return 'O'
    else:
        return x

ner_dataset['tag'] = ner_dataset['tag'].apply(remote_unwanted_tags)
ner_dataset[ner_dataset['sentence_idx']==3]

Unnamed: 0,word,tag,sentence_idx,prev-word,prev-prev-word,next-word,next-next-word,prev-tag,next-tag
0,The,O,3,1996-08-22,BRUSSELS,European,Commission,O,B-ORG
1,European,B-ORG,3,The,1996-08-22,Commission,said,O,I-ORG
2,Commission,I-ORG,3,European,The,said,on,B-ORG,O
3,said,O,3,Commission,European,on,Thursday,I-ORG,O
4,on,O,3,said,Commission,Thursday,it,O,O
5,Thursday,O,3,on,said,it,disagreed,O,O
6,it,O,3,Thursday,on,disagreed,with,O,O
7,disagreed,O,3,it,Thursday,with,German,O,O
8,with,O,3,disagreed,it,German,advice,O,B-MISC
9,German,O,3,with,disagreed,advice,to,O,O


#### Remove PERSON tags if preceding word is 'the' (e.g. the Bush administration)

In [15]:
# removing PERSON tags from sentences with a 'the' preceding the person:

def remove_tag_if_the_person(row):
    if row['prev-word'].lower() == 'the' and row['tag']=='B-PERSON':
        return 'O'
    elif row['prev-prev-word'].lower() == 'the' and row['prev-tag']=='I-PERSON' and row['tag']=='B-PERSON':
        return 'O'
    return row['tag']

ner_dataset['prev-word']=ner_dataset['prev-word'].astype('str')
ner_dataset['prev-prev-word']=ner_dataset['prev-prev-word'].astype('str')
ner_dataset['tag'] = ner_dataset.apply(remove_tag_if_the_person,axis=1)

#### Remove tag from 's (Joe Wilson's cat)

In [16]:
def remove_tag_if_apostraphe_after_tag(row):
    if row['prev-tag'] != 'O' and row['word']=="'s":
        return 'O'
    return row['tag']
ner_dataset['tag'] = ner_dataset.apply(remove_tag_if_the_person,axis=1)

#### Re-tag words from dictionaries (countries, nationalities, roles, titles)

Nationalities and countries:

In [17]:
nationalities = pd.read_csv("../raw_data/nationalities.csv")
nationalities.head()

Unnamed: 0,country,nationality,man,woman,plural
0,algeria,algerian,algerian,algerian,algerians
1,andorra,andorran,andorran,andorran,andorrans
2,angola,angolan,angolan,angolan,angolans
3,argentina,argentinian,argentinian,argentinian,argentinians
4,armenia,armenian,armenian,armenian,armenians


In [18]:
"algeria" in nationalities['country'].values

True

In [19]:

ner_dataset['metadata'] = None

def get_nationality_as_metadata(row):
    if row['word'].lower() in nationalities['country'].values:
        return 'COUNTRY'
    elif row['word'].lower() in nationalities['nationality'].values:
        return 'NATIONALITY'
    elif row['word'].lower() in nationalities['man'].values:
        return 'NATION_MAN'
    elif row['word'].lower() in nationalities['woman'].values:
        return 'NATION_WOMAN'
    elif row['word'].lower() in nationalities['plural'].values:
        return 'NATION_PLURAL'
    return row['metadata']

row = pd.Series({'word':'Frenchwoman','metadata':None})
print("Example: Frenchwoman -> ",get_nationality_as_metadata(row))

def update_tag_based_on_metadata(row):
    if row['metadata'] is not None:
        return "B-"+row['metadata']
    else:
        return row['tag']



Example: Frenchwoman ->  NATION_WOMAN


In [20]:
ner_dataset['metadata'] = ner_dataset.apply(get_nationality_as_metadata, axis=1)


In [21]:
ner_dataset['metadata']

0    None       
1    None       
2    NATIONALITY
3    None       
4    None       
     ...        
1    None       
0    None       
1    None       
2    None       
3    None       
Name: metadata, Length: 203621, dtype: object

#### Titles

In [22]:
MALE_TITLES = ['mr', 'dr', 'professor', 'eng','prof','doctor']
FEMALE_TITLES = ['mrs', 'ms', 'miss', 'dr', 'professor', 'eng', 'prof','doctor']

def get_title_as_metadata(row):
    if row['word'].lower() in MALE_TITLES:
        return 'MALE_TITLE'
    elif row['word'].lower() in FEMALE_TITLES:
        return 'FEMALE_TITLE'
    return row['metadata']


def update_title_tag_if_missing(row):
    if row['word'].lower() in MALE_TITLES and row['tag']=='O':
        return 'B-MALE_TITLE'
    elif row['word'].lower() in FEMALE_TITLES and row['tag']=='O':
        return 'B-FEMALE_TITLE'
    else:
        return row['tag']

ner_dataset['metadata'] = ner_dataset.apply(get_title_as_metadata,axis=1)
ner_dataset['tag'] = ner_dataset.apply(update_title_tag_if_missing,axis=1)

In [23]:
ner_dataset[ner_dataset['sentence_idx']==18]

Unnamed: 0,word,tag,sentence_idx,prev-word,prev-prev-word,next-word,next-next-word,prev-tag,next-tag,metadata
0,Germany,B-LOC,18,.,beef,imported,47600,O,O,COUNTRY
1,imported,O,18,Germany,.,47600,sheep,B-LOC,O,
2,47600,O,18,imported,Germany,sheep,from,O,O,
3,sheep,O,18,47600,imported,from,Britain,O,O,
4,from,O,18,sheep,47600,Britain,last,O,B-LOC,
5,Britain,B-LOC,18,from,sheep,last,year,O,O,COUNTRY
6,last,O,18,Britain,from,year,",",B-LOC,O,
7,year,O,18,last,Britain,",",nearly,O,O,
8,",",O,18,year,last,nearly,half,O,O,
9,nearly,O,18,",",year,half,of,O,O,


### Remove 'the' from 'the NORP' if NORP is not in nationalities list.

In [26]:
def remove_tag_if_the_norp(row):
    if row['prev-word'].lower() == 'the' and row['tag']=='B-NORP' and row['metadata'] is None:
        return 'O'
    elif row['prev-prev-word'].lower() == 'the' and row['prev-tag']=='I-NORP' and row['tag']=='B-NORP' and row['metadata'] is None:
        return 'O'
    return row['tag']
ner_dataset['tag'] = ner_dataset.apply(remove_tag_if_the_norp,axis=1)

### Remove sentences with adjacent different entities (e.g calling from New York Larry King)

In [27]:
ner_dataset['entity'] = ner_dataset['tag'].str[2:]
ner_dataset['next-entity']=ner_dataset['next-tag'].str[2:]
adjacent_idc = (ner_dataset['tag'] != 'O') & (ner_dataset['next-tag'] != 'O') & (ner_dataset['entity'] != ner_dataset['next-entity'])
sentences_to_remove = ner_dataset[adjacent_idc]['sentence_idx'].values
sentences_to_remove

ner_dataset=ner_dataset[~ner_dataset['sentence_idx'].isin(sentences_to_remove)]

#### Update tag for discovered metadata values (eg. nationalities)

In [28]:
ner_dataset['tag'] = ner_dataset.apply(update_tag_based_on_metadata, axis=1)

In [33]:
np.unique(ner_dataset['metadata'][ner_dataset['metadata'].values != None])

array(['COUNTRY', 'FEMALE_TITLE', 'MALE_TITLE', 'NATIONALITY',
       'NATION_MAN', 'NATION_PLURAL'], dtype=object)

In [34]:
np.unique(ner_dataset['tag'])

array(['B-FEMALE_TITLE', 'B-LOC', 'B-MALE_TITLE', 'B-ORG', 'B-PER',
       'I-LOC', 'I-ORG', 'I-PER', 'O'], dtype=object)

### Create templates base on NER dataset
Here we create the actual templates + handle multiple weird cases that should cause the template sentences to be weird. Note that a manual run over the templates dataset is still required after this step.

In [30]:
import re
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["word"].values.tolist(),
                                                        s["tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
        
    @staticmethod    
    def cleanse_template(template, ents):
        # Remove whitespace before certain punctuation marks
        template = re.sub(r'\s([?,:.!](?:|$))+', r'\1', template)
        
        # Remove whitespaces within double quotes
        template = re.sub('\"\s*([^\"]*?)\s*\"', r'"\1"', template)    
        
        # Remove whitespaces within quotes
        template = re.sub("\'\s*([^\']*?)\s*\'", r"'\1'", template)    
        
        # Remove whitespaces within parentheses
        template = re.sub('\(\s*([^\(]*?)\s*\)', r'(\1)', template)    
        
        for ent in ents:
            #Turn PERSON PERSON into PERSON
            duplicates = "[{}] [{}]".format(ent,ent)
            template = template.replace(duplicates,"[{}]".format(ent))
        
        
        # Replace additional weird templates:
        to_replace = {
            "[LOCATION] says" : "[PERSON] says",
            "[LOCATION] said" : "[PERSON] said",
            "[ORGANIZATION] of [ORGANIZATION]" : "[ORGANIZATION]",
            "the [COUNTRY]" : "[COUNTRY]",
            " 's ":"'s",
            "] 's ":"]'s ",
            "] 's,":"]'s,",
            "] 's.":"]'s.",
            " n't" : "n't",
            "/?":"?",
            "%u":"u",
            "%m":"m",
            "%e":"e",  
            "%h":"h",  
            "%a":"a",
            " %":"%",
            " ?":"?",
            " /?":"?",
            " ' .":"'.",
            "[ ":"(",
            " ]":")",
            "[PERSON] -- [PERSON]":"[PERSON]",
            "[COUNTRY] -- [ORGANIZATION]":"[ORGANIZATION]",
            "Jews" : "[NATIONALITY]",
            "Chinese" : "[NATIONALITY]",
            "Dutch" : "[NATIONALITY]",
            "[LOCATION], [LOCATION]":"[LOCATION]",
            "[LOCATION] [ORGANIZATION]":"[ORGANIZATION]"
        }
        
        for weird in to_replace.keys():
            #if weird in template:
            #    print("Weird sentence",template)
            template = template.replace(weird,to_replace[weird])
  
        template = template.replace(" -- "," - ")
        
        #Ignore templates that are incomplete
        if "/-" in template:
            template = ""
            
        #Ignore templates that have numbers after the end or start of the entity
        if len(re.findall(r"\]\s[0-9]",template)) > 0:
            template = ""
            
        if len(re.findall(r"[0-9]\s\[",template)) > 0:
            template = ""
            
        if len(re.findall(r"[0-9].\s\[",template)) > 0:
            template = ""
            
            
        if "[PERSON] ([COUNTRY])" in template:
            template = ""
        if "[PERSON] ([LOCATION])" in template:
            template = ""
            
        if template.count('"') == 1:
            template = template.replace('"','')

        return template
    
    @staticmethod    
    def get_template(grouped,entity_name_replace_dict):
        template = ""
        i=0
        cur_index = 0
        ents = []
        for token in grouped:
            # remove brackets as they interefere with the data generation process
            token_text = token[0].replace("[", "(").replace("]",")")
            token_text = token[0].replace("{", "(").replace("}",")")
            token_tag = token[1]
            token_entity = token_tag[2:] if len(token_tag)>1 else token_tag
            
            if token_entity == 'O':
                template += " " + token_text
            elif 'B-' in token_tag and token_entity not in TAGS_TO_IGNORE:
                #print("found entity: {}".format(token_entity))
                ent = entity_name_replace_dict[token_entity]
                ents.append(ent)
                 
                template += " [" + ent + "]"
            #print("template: ",template)
        
        template = SentenceGetter.cleanse_template(template, ents)
        
        return template.strip()
    
getter = SentenceGetter(ner_dataset)

In [31]:
ENTITIES_DICTIONARY = {"PERSON":"PERSON",
                       "PER":"PERSON",
                       "GPE":"COUNTRY",
                       "NORP":"LOCATION",
                       "LOC":"LOCATION",
                       "ORG":"ORGANIZATION",
                       "MALE_TITLE":"MALE_TITLE",
                       "FEMALE_TITLE":"FEMALE_TITLE",
                       "COUNTRY":"COUNTRY",
                       "NATIONALITY":"NATIONALITY",
                       "NATION_WOMAN":"NATION_WOMAN",
                       "NATION_MAN":"NATION_MAN",
                       "NATION_PLURAL":"NATION_PLURAL"}

sentences = getter.sentences

sent_id = 445

print("original:",sentences[sent_id])
print("template:", getter.get_template(sentences[sent_id],entity_name_replace_dict=ENTITIES_DICTIONARY))

original: [('I.', 'B-PER'), ('Salisbury', 'I-PER'), ('not', 'O'), ('out', 'O'), ('1', 'O')]
template: [PERSON] not out 1


In [32]:
all_templates = [getter.get_template(sentence,entity_name_replace_dict=ENTITIES_DICTIONARY) for sentence in sentences]

In [33]:
print("original length of templates: {}".format(len(all_templates)))
all_templates = list(set(all_templates))
print("length after duplicates removal: {}".format(len(all_templates)))

original length of templates: 13775
length after duplicates removal: 8566


Save templates to file:

In [34]:
with open("../raw_data/conll_based_templates.txt","w+",encoding='utf-8') as f:
    for template in all_templates:
        f.write("%s\n" % template)        