Compile gazetteer for use in beamforming.

Sources:

    - Legasee metadata tags
    
    - collections from ww2db.com
    
    - "Shipnames.txt", collated from Wikipedia

In [1]:
import pandas as pd
import numpy as np
import re
import os
import json

# Used to convert numbers to words, e.g. 8 o'clock -> eight o'clock, 1944 -> nineteen forty four
import inflect
# Inflect is more flexible, but doesn't create ordinals as words - use num2words for that
from num2words import num2words
# Roman numerals
import roman

In [36]:
ROOTPATH = '/H_Drive/srv/studat/cdt/team2/data'

OUTPATH = os.path.expanduser("~")+ROOTPATH+'/external'

# Specific items (used to initialise the list)
FIXED_ITEMS = [
    'World War One',
    'World War Two',
    'Service',
    'D Day',
    'Major General',
    'Curtis LeMay',
    'Stratheden',
    'sub',
    'submarine',
    'leave',
    'R and R',
    'SAS',
    '13th Lancashire Parachute Battalion',
    'Nuremberg',
    'Wrens',
    'WRNS',
    'wren',
    'waaf',
]

In [3]:
META_PATH = os.path.expanduser("~")+ROOTPATH+'/legasee/metadata/'

# Particular items dropped from metadata lists
META_DROP = [
    'Description',
    'Camp / Accom',
    'Co',
    'Basic / Initial',
    'Life',
    'Mil Actions',
    'Pre Service',
    'WWII',
    'WWI',
    'D-Day Preperation',
    'Maj. Gen. Curtis LeMay',
    'P&amp;O Stratheden',
    'Personal',
    'Ship / Sub',
    'News / Views',
    'Leave / R&amp;R',
    'Daily Routine / Jobs',
    'Casualty / Death',
    'ship (Catapult',
    'CAM ship (Catapult',
    '13th (Lancashire) Parachute Battalion',
    'Wac?aw Struszy?sk',
    'Places',
    'V',
]

# Stripped as tailing elements
META_STRIP = [
    '\(\s*SF\s*\)$',
    '\(\s*[Ss]tone [Ff]rigate\s*\)$',
    ]

In [4]:
## Legasee metadata
meta_df = pd.read_csv(META_PATH+'master_metadata.csv', converters={'Priority Words': eval, 'Name Words' : eval})

# Remove Test items
train_meta = meta_df[~pd.Series(meta_df.Allocation == "Test")]

# Flatten, de-dupe
meta_list = set([t for subl in train_meta['Priority Words'] for t in subl])

# Remove specified items
meta_list = [x for x in meta_list if x not in META_DROP]

# Handle slashes, brackets!
## Slashes - if between spaces, split into two items ('Electrical Fitter / Engineer'). If not, replace with spaces ('9/11 Attacks')
meta_list = [re.sub(r"(?<!\s)\/(?!\s)",' ',x) for x in meta_list ]

meta_l2 = []
for x in meta_list:
    meta_l2.extend(re.split("\s+\/\s+",x))

## Brackets -'(High Speed Launch)', 'CAM ship (Catapult' , 'Berlin (Tegel)', 'HMS Excalibur (SF)'
for s in META_STRIP:
    meta_l2 = [ re.sub(s,'',x) for x in meta_l2 ] 
    
meta_list = []
for x in meta_l2:
    meta_list.extend(re.split("[()]",x))

# &amp; -> and
meta_list = [ re.sub(r"&amp;",' and ',x) for x in meta_list ]

# Sqn
meta_list = [ re.sub(r"\s*[Ss]qn.?",' Squadron ',x) for x in meta_list ]

# Cut weapon measurements ("6inch mortar" -> "mortar")
meta_list = [ re.sub("\d+(\.\d+)?\s*\-?(mm|cm|in\.?|inch)(\s+|$)",' ',x) for x in meta_list ]

----

In [5]:
## ww2db
WW2DB_PATH = os.path.expanduser("~")+ROOTPATH+'/external/ww2db/text_only'

# List of subdirectories, .txt files from which will be processed
WW2DB_DIRS = ['aircraft', 'events', 'ships', 'weapons']

# Removed
WW2DB_STRIP = ['and Other Cities$', 'and Other Trials Against Germany$', 'and Other Trials Against Japan$' ]

# Subbed for spaces - parentheses, "No. 5" etc, measurements - 13.2 mm / cm / inch / -inch
WW2DB_CUT = ["[()]" , "[Nn]o\.?\s+\d+", "\d+(\.\d+)?\s*\-?(mm|cm|in\.?|inch)(\s+|$)" ]

In [6]:
ww2db_list = []

for d in WW2DB_DIRS:
    for _fn in os.scandir(WW2DB_PATH+'/'+d):
        # Ignore subfolders, non .txt files any whose name begins with "."
        if _fn.is_file() and not _fn.name.startswith('.') and _fn.name[-4:] == '.txt':
            with open(WW2DB_PATH+'/'+d+'/'+_fn.name,'r') as tfile:
                for l in tfile.readlines():
                    ww2db_list.extend([l.rstrip('\n').strip()])
                    
# Remove trailing items (from events)
for s in WW2DB_STRIP:
    ww2db_list = [ re.sub(s,'',x,flags=re.IGNORECASE) for x in ww2db_list ] 

In [7]:
# Some events formatted as "Operation X and Operation Y" or "Operations X and Y", "Battles of X and Y" - split into two
def op_splitter(instr):
    patt_opXopY = re.compile(r"[Oo]peration\s+(?P<opX>.+)\s+(and|[&,])\s+[Oo]peration\s+(?P<opY>.+)")
    patt_opsXY  = re.compile(r"[Oo]perations\s+(?P<opX>.+)\s+(and|[&,])\s+(?P<opY>.+)")
    patt_batXbatY = re.compile(r"[Bb]attle\s+of\s+(?P<opX>.+)\s+(and|[&,])\s+[Bb]attle\s+of\s+(?P<opY>.+)")
    patt_batsXY = re.compile(r"[Bb]attles\s+of\s+(?P<opX>.+)\s+(and|[&,])\s+(?P<opY>.+)")
    
    m = re.fullmatch(patt_opXopY,instr)
    if m:
        return [ 'Operation '+m.group("opX") , 'Operation '+m.group("opY") ]
    
    n = re.fullmatch(patt_opsXY,instr)
    if n:
        return [ 'Operation '+n.group("opX") , 'Operation '+n.group("opY") ]
    
    b = re.fullmatch(patt_batXbatY,instr)
    if b:
        return [ 'Battle of '+b.group("opX") , 'Battle of '+b.group("opY") ]
    
    c = re.fullmatch(patt_batsXY,instr)
    if c:
        return [ 'Battle of '+c.group("opX") , 'Battle of '+c.group("opY") ]
    
    return [ instr ]

# Some munitions have nicknames, e.g "Browning Automatic Rifle M1918 'BAR'" - split these. 
# Some also end up as e.g. "'Sticky bomb'" - remove apostrophes
def nick_splitter(instr):
    patt_nick = re.compile(r"\s*(?P<prenick>[^']+)\s+\'+(?P<nick>[^']+)\'+\s*")
    patt_quoted = re.compile(r"\s*\'+(?P<nick>[^']+)\'+\s*")
    
    n = re.fullmatch(patt_nick,instr)
    if n:
        return [ n.group("prenick") , n.group("nick")  ]
    
    q = re.fullmatch(patt_quoted,instr)
    if q:
        return [ q.group("nick") ]
    
    return [ instr ]
    

In [8]:
# Operation / battle splits
ww2db_l2 = []
for x in ww2db_list:
    ww2db_l2.extend(op_splitter(x))

# Split on campaign phases ('Normandy Campaign, Phase 1', 'New Guinea-Papua Campaign, Phase 1, Bismarck Islands', )
ww2db_list = []
for x in ww2db_l2:
    ww2db_list.extend( re.split(",*\s+[Pp]hase\s+\d+,*(\s+|$)",x))
    
# Split on / (not requiring space) - for e.g. aircraft
ww2db_l2 = []
for x in ww2db_list:
    ww2db_l2.extend( re.split("\/",x))
    
# Remove cut elements (sub for spaces)
for c in WW2DB_CUT:
    ww2db_l2 = [ re.sub(c,'',x) for x in ww2db_l2 ]
    
# Nickname splits
ww2db_list = []
for x in ww2db_l2:
    ww2db_list.extend(nick_splitter(x))

----

In [9]:
## Wiki ship names
SHIP_PATH = os.path.expanduser("~")+ROOTPATH+'/external'

SHIP_STRIP = [
    # "List of ships named  X"
    "^[Ll]ist\s+of\s+ships\s+named\s+",
    
    # Parentheticals (mainly years but some year and class)
    "\(.{,24}\)",
]

In [10]:
ship_list = []

with open(SHIP_PATH+'/'+'Shipnames.txt','r') as tfile:
    for l in tfile.readlines():
        ship_list.extend([l.rstrip('\n').strip()])
        
# Remove strip items
for s in SHIP_STRIP:
    ship_list = [ re.sub(s,'',x) for x in ship_list ]

----

In [11]:
## Collate - append each to initial list
full_list = FIXED_ITEMS
full_list.extend(meta_list)
full_list.extend(ww2db_list)
full_list.extend(ship_list)

In [12]:
def deromanise(instr):
    # Convert roman numerals to integers.
    # Only applying to ones made up of characters IVX - other instances (C, D, L, M) appear to be better remaining as letters.
    # Additionally, not applying where the group of chars is the first word in the string - 'X class Midget Submarine', 'I 15' remain as letters
    outstr = instr
    
    patt_rom = re.compile(r"(?<=\s)(?P<roman>[IVX]+)\b")
    
    r = re.search(patt_rom,instr)
    if r:
        try:
            outstr = outstr[:r.start("roman")] + str(roman.fromRoman(r.group("roman"))) + outstr[r.end("roman"):]
        except:
            pass
    
    return outstr
    
def m_roman(instr):
    # Instances where complete string is e.g. MI , MIV - insert space between M and the rest prior to deromanisation
    if re.fullmatch(r"M[IV]+",instr):
        return 'M '+instr[1:]
    return instr

In [13]:
def expand_initials(instr):
    # Capital letters and (optionally) numbers, contiguously. Need at least one letter.
    patt_initials = re.compile(r"(([A-Z0-9])+[A-Z]([A-Z0-9])*|([A-Z0-9])*[A-Z]([A-Z0-9])+)")
    
    tout = ''
    p_end = 0

    a = re.finditer(patt_initials,instr)

    # Process each matching group in the string - could have multiple initialisms to expand
    for m in a:
        # Retain everything between previous match and this one
        tout+=instr[p_end:m.start()]

        # Append this match text, letters gaining pre/post spaces
        for c in m.group(0):
            if c.isdigit():
                tout+=c
            else:
                tout+=' '
                tout+=c
                tout+=' '

        p_end = m.end()
        
    # Add back in anything after the last match
    tout+=instr[p_end:]
        
    if len(tout): return tout
    return instr

In [14]:
def DigitsToWords(instr, target=re.compile(r"\b[,.\d]+\b"), **ntw_opts):
    """
    Use inflect library's number_to_words functionality to substitute digits for corresponding strings.
    E.g. 8 o'clock -> eight o'clock.
    Note that some instances may warrant distinct treatment, e.g. $1,000 might have desired output "one thousand dollars", 
     "1944" may want to be "nineteen fourty-four".
    To enable this, only substrings matching the `target` compiled regex pattern are processed. By default, this is any
     "word" (per standard RegEx word boundaries) consisting of digits, commas and decimal points.

    **kwargs are passed through to the number_to_words function - see docs at https://pypi.org/project/inflect/
    Note that passing the groups parameter introduces additional commas.
    """
    ie = inflect.engine()

    for m in re.finditer(target, instr):
        repl = ie.number_to_words(m[0], **ntw_opts)
        instr = instr.replace(m[0],repl,1)
    return instr


def OrdinalsToWords(instr, target=re.compile(r"\b(?P<numpart>[,.\d]+)(st|nd|rd|th)\b")):
    """
    Use num2words library's functionality to substitute ordinals for corresponding strings.
    E.g. 22nd -> twenty-second.
    https://pypi.org/project/num2words/

    If modifying the target regex, note that the group label 'numpart' is required 
     (for the numeric section which is retained and converted to ordinal words).
    """

    for m in re.finditer(target, instr):
        repl = num2words(m['numpart'], ordinal=True)
        # Want to replace entire match, not just the digits (or we get "secondnd")
        instr = instr.replace(m[0],repl,1)
    return instr

In [25]:
## Cleanup

# Specific substitutions applied to full list. Keys are regexes.
FULL_SWAP = {
    "P\.L\.U\.T\.O\.?" : 'pluto',
    # St / St. - expand as Saint (no instances of "street" observed)
    "St.?\s+" : "Saint ",
    # Sub ampersands
    "&" : ' and ',
    
    r"[Nn]o\s*1\b" : "Number one",
    r"[Nn]o\s*2\b" : "Number two",
    
    'George V' : 'George the fifth',
    'Edward VII' : 'Edward the seventh',
    'Charles II' : 'Charles the second',
    'Charles V' : 'Charles the fifth',
    'Christian VII' : 'Christian the seventh',
    'George III' : 'George the third',
    
    '\bV\-?1\b' : 'v one',
    '\bV\-?2\b' : 'v two',
    
    'Oflag X C' : 'Oflag x c',
    'Do X' : 'Do x',
    
    'Meteor NF14s' : 'Meteor NF 14',
    'Station 53a' : 'Station 53 a',
    'G7es Type V' : 'G7 ES type five',
    'Mauser Kar98k' : 'Mauser Kar 98 k',
    
    # Adjustments prior to acronym expansion
    r"PLA's Pilotage Service" : "PLA Pilotage Service",
    'PPSh 41' : 'PPS h 41',
    'AKAK' : 'ack ack',
    'HNoMY' : 'H No M Y',
    
    # accented letters (umlauts, etc) - not supported by our ASR system
    'ü' : 'u',
    'ä' : 'a',
    'ö' : 'o',
    'ï' : 'i',
 
    'é' : 'e',
    'É' : 'E',
    'è' : 'e',
    'Ç' : 'C',
    'ç' : 'c',
    'à' : 'a',
    
    'ł' : 'l', # In 'Błyskawica'; sorry Dawid!
    'ó' : 'o',
    
    'ø' : 'o',
    
    'á' : 'a',
        }

for k,v in FULL_SWAP.items():
    full_list = [ re.sub(k,v,x) for x in full_list ]

# punctuation fixing
## backtick to apostrophe ( "John O`Groats" )
full_list = [ re.sub('`',"'",x) for x in full_list ]

## O' -> "O ", d' -> "de " , l' -> "le " ?
## Apostrophes allowed in ASR vocab - no need to remove

## dash -> space
full_list = [ re.sub('\-'," ",x) for x in full_list ]

## commas
full_list = [ re.sub(','," ",x) for x in full_list ]

## decimal points -> space
full_list = [ re.sub('\.'," ",x) for x in full_list ]

## quotes
full_list = [ re.sub('"'," ",x) for x in full_list ]

# Instances where complete string is e.g. MI , MIV - insert space between M and the rest prior to deromanisation
full_list = [ m_roman(x) for x in full_list ]

# roman ordinals and cardinals
# "HMS King George V", 'Stalag XVIII A',  'Stalag Luft I', 'Operation Supercharge II', 'G7es Type V', ''Type XXVII-class Midget Submarine', ...
# ordinals handled as special cases in specific replacement above
full_list = [ deromanise(x) for x in full_list ]

# drop leading "the"
full_list = [ re.sub(r"^[Tt]he\s+"," ",x) for x in full_list]

# Acronym expansion - "HMS" -> "H M S". Needs to allow numbers (M41, P2Y...)
full_list = [ expand_initials(x) for x in full_list ]

# ordinals, digits to words. Same processing as evaluation transformations:
## Year processing - convert 200x to "two thousand and x"
full_list = [ DigitsToWords(x, target = re.compile(r"\b200\d{1}\b"), group=0) for x in full_list ]
## Other years (from 1700 on) - convert e.g. 1984 to "nineteen, eighty-four"
## NB: place prior to removal of punctuation as introduces new commas
full_list = [ DigitsToWords(x, target = re.compile(r"\b(17|18|19|20)\d{2}\b"), group=2, zero='oh') for x in full_list ]
## Ordinals - "1st" -> "first"
full_list = [ OrdinalsToWords(x) for x in full_list ]
## Other numbers (as standalone words); longer ones as list of numbers, 1-2 digits as a single group
full_list = [ DigitsToWords(x, target = re.compile(r"\b\d{3,}\b"), group=1, zero='oh') for x in full_list ]
full_list = [ DigitsToWords(x, zero='oh') for x in full_list ]

# repeat commas and dash -> space - new ones introduced by processing of years to words
full_list = [ re.sub(','," ",x) for x in full_list ]
full_list = [ re.sub('\-'," ",x) for x in full_list ]

# lower case
full_list = [ x.lower() for x in full_list ]

# compress spaces, strip lead/trail
full_list = [ re.sub("\s+"," ",x).strip() for x in full_list ]

# remove empty items
full_list = [ x for x in full_list if len(x) ]

# de-dupe
dedup_list = list(set(full_list))

In [42]:
## Output - JSON array
jsonStr = json.dumps(dedup_list)

with open(OUTPATH+'/gazetteer.json','w+') as ofile:
    ofile.write(jsonStr)

## Also .txt
with open(OUTPATH+'/gazetteer.txt','w+') as ofile:
    ofile.write('\n'.join(dedup_list))

In [17]:
check = [x for x in full_list if re.search(r"[^a-zA-Z\'\s]",x)]

check

[]

In [26]:
len(dedup_list)

10278

In [27]:
dedup_list

['howitzer',
 'glatton',
 'charity',
 'tabberer',
 'santa maria',
 'mosquidobit',
 'deale castle',
 'stoker',
 'seventh armoured division',
 'weather',
 'pioneer corp',
 'h m s alaunia',
 'elfreda',
 'periwig',
 'treaty of berlin',
 'thirty first field medical station',
 'three division signals',
 'little charity',
 'loch eil',
 'p b y catalina',
 'u boat',
 'kildorry',
 'j s one',
 'loch fannich',
 'pondicherry',
 'horsa',
 'lewes',
 'penetang',
 'colt m nineteen eleven a one',
 'battle group',
 'biggleswade',
 'calgarian',
 'new territories',
 'tullibee',
 'wasperton',
 'seaton barracks',
 'rodney',
 'venomous',
 'crossbow',
 'r a f tempsford',
 'lardner',
 'ardrossan',
 'haydon',
 'bullfrog',
 'loyal exploit',
 'petulant',
 'r a f blandford',
 'fifth royal inniskilling dragoon guards',
 'skagerrak strait',
 'sea nymph',
 'veteran',
 'alverton',
 'j w fifty eight',
 'blickling',
 'andalsnes',
 'hairong class protected cruiser',
 'loch torridon',
 'atomic bombing of hiroshima and naga