In [None]:
# !python -m spacy download en_core_web_sm
# !pip install geotext
# !pip install spacy

## Import and test Spacy

In [1]:
import spacy
import random
from geotext import GeoText
nlp = spacy.load('en_core_web_sm')
  
sentence = "Apple is looking at buying U.K. startup for $1 billion"
  
doc = nlp(sentence)
  
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


## Load travel dataset

In [2]:
path_data_travel_questions = '5000TravelQuestionsDataset.csv'

In [95]:
import csv

with open(path_data_travel_questions, newline='',encoding='latin-1') as f:
    reader = csv.reader(f)
    data = list(reader)
data_travel_questions = [i[0] for i in data]

In [96]:
content_string = "\n".join(data_travel_questions)

## Analyse dataset, extract other locations

### Load replacement data

In [8]:
#For demoing
subset_size = 100
data_travel_questions = data_travel_questions[0:subset_size]

In [54]:
cities = ['Kigali','Gisenyi','Ruhengeri','Butare','Muhanga','Byumba','Cyangugu','Nyanza','Bugarama','Kayonza','Rwamagana','Nyamata','Ruhango','Gikongoro','Nyagatare','Busogo','Kibuye','Kibungo','Rubengera']
regions = ["Kigali district", "Northern Rwanda", "Western Rwanda", "Eastern Rwanda", "Southern Rwanda"]
rwandan_food = ["ibihaza", "ugali", "rolex", "chapati", "brochettes"]
monuments = ["Nyamata Church", "Ntarama Church", "Camp Kigali Memorial", "Karisoke Research Centre"]

rivers = ["Rusizi River","Sebeya River","Koko River","Rubyiro River","Ruhwa River","Akagera River","Nyabarongo River","Mukungwa River","Akanyaru River","Rukara River","Mbirurume River"]
lakes = ["Lake Kivu","Lake Ihema","Lake Rweru","Lake Rwanyakizinga","Lake Hago","Lake Mpanga","Lake Ruhondo","Lake Muhazi","Lake Cyohoha","Lake Kivumba","Lake Burera","Lake Mugesera","Lake Sake","Lake Mihindi","Lake Mpanga"]
mountains = ["Mount Karisimbi","Mount Gahinga","Mount Sabyinyo","Mount Bisoke","Mount Huye","Mount Nyiragongo","Mount Muhabura"]
parks_and_forests = ["Akagera National Park","Volcanoes National Park","Bwindi Impenetrable Forest","Nyungwe National Park","Cyamudongo Forest"]
islands = ["Bushongo Island","Cyuza Island","Idwi Island","Ihoka Island","Ika Island","Nkombo Island","Munanira Island","Ishovu Island","Ite Island","Kwankoro Island","Sharita Island","Gombo Island","Iwawa Island","Iwinja Island","Kigwa Island","Kikaya Island","Mishungo Island","Bugarura Island"]
airports = ["Kigali International Airport","Kamembe Airport","Gisenyi Airport","Bugesera International Airport"]
falls = ["Rusumo Waterfalls"]
hotels = ["Cleo Lake Kivu Hotel", "Five Volcanoes Boutique Hotel", "Kivu Marina Bay Hotel", "Rutete Eco Lodge", "Virunga Inn Resort & Spa", "Ingagi Park View Lodge", "Radisson Blu Hotel", "Kivu Paradis Resort", "Four Points by Sheraton Kigali", "Kigali Marriott Hotel"]

# other_cities = ["Marrakech","Praslin","Skukuza","La Digue","Cape Point","capetown","Cape Town","Frankfurt","Cancun","Capetown","Cologne city","Copacabana","Francesca","Madina","Marrakech","Mazatlan","Mobay","Monterosso","Monteverde","Nessebar","Oamaru","Old Town","Osta Antica","Paestum","Paleochora","Patagonia","Pathumwan","Patong","Pavlopetri","Punta","Queesntown","Rabaul","Rhodes Town","Samana","Segesta","Shinagawa","Sicily","Sidari","Sohar","Sokcho","Sosua","Sperlong","St Girons","Sukhumvit","Taba","Thumrait","Tibet","Umbria","Vernazza","Villasimius","Whistler","Ximending","chungju","sharm el sheik"]
# other_islands = ["kurumba","Majorco","North Island","Masirsouth islandah","Minorca","Milos","Moorea","Nami islands","Phi Phi","Phu Quoc","Savai'i","Savaii","Tahiti","Tsarabanjina","Upolo","Zante","malolo","moorea","south island","upolu"]
# other_parks = ["Kruger","Naejangsan","Tortuguero","Tsavo","Yellowstone","masai mara","Masai Mara"]
# other_countries = ["UAE", "UK","korea","northern ireland","Tahiti", "scotland", "south korea", "California", "Scandinavia", "germany", "canada", "USA", "pakistan", "UAE", "thailand", "argentina", "Ohio State", "Korea", "Tibet", "Sri lanka", "French Polynesia", "korea", "U.S.", "Mauritius", "Vanuatu", "Jordan"]

In [41]:
#Load NER's to substitute
other_cities = [l.strip() for l in open('map-data/map-cities.txt', 'r')]
other_islands = [l.strip() for l in open('map-data/map-islands.txt', 'r')]
other_parks = [l.strip() for l in open('map-data/map-parks.txt', 'r')]
other_countries = [l.strip() for l in open('map-data/map-countries.txt', 'r')]
other_regions = [l.strip() for l in open('map-data/map-regions.txt', 'r')]
other_demonyms = [l.strip() for l in open('map-data/map-demonyms.txt', 'r')]
other_food = [l.strip() for l in open('map-data/map-food.txt', 'r')]
other_monuments = [l.strip() for l in open('map-data/map-monuments.txt', 'r')]

all_other_locs = other_cities + other_islands + other_parks + other_countries +other_regions + other_demonyms + other_food + other_monuments


In [77]:
other_loc_places = set()
skipped = set()
for sentence in data_travel_questions:
    doc = nlp(sentence)
    for ent in doc.ents:   
        if ent.label_ in ['LOC', 'ORG', 'EVENT', 'FAC', 'GPE'] and ent.text not in all_other_locs:
            other_loc_places.add(ent.text)
        elif ent.label_ in ['PERSON', 'PRODUCT', 'WORK_OF_ART'] and ent.text not in all_other_locs:
            skipped.add((ent.text, ent.label_))

print('other_loc_places', len(other_loc_places))
print('skipped', len(skipped))

other_loc_places 1458
skipped 494


In [78]:
other_loc_places.remove("Airport")

KeyError: 'Airport'

## Substitution

In [24]:
def loc_substitute_fn(place):
    place = place.lower()
    location = ""
    if "island" in place or "isle" in place:
        location = random.choice(islands)
    elif "Mount" in place or "Mt." in place:
        location = random.choice(mountains)
    elif "River" in place:
        location = random.choice(rivers)
    elif "Park" in place:
        location = random.choice(parks_and_forests)
    elif "Lake" in place:
        location = random.choice(lakes)
    elif "airport" in place:
        location = random.choice(airports)
    elif "falls" in place:
        location = random.choice(falls)
    elif "hotel" in place or "Hotel" in place or "resort" in place or "Resort" in place:
        location = random.choice(hotels)
    return location

def city_substitute_fn(city,frequency):
    if frequency >= 10:
        substitute_city = "Kigali"
    else:
        substitute_city = random.choice(rwandan_cities)
    return substitute_city

In [74]:
def substitute_text(content_string, debug=False):
    for original_place in other_loc_places:
        substitute_place = loc_substitute_fn(original_place)
        if substitute_place:
            content_string = content_string.replace(original_place,substitute_place)
    if debug: print(content_string)
    for monument in other_monuments:
        content_string = content_string.replace(monument,random.choice(monuments))
    if debug: print(content_string)
    for region in other_regions:
        content_string = content_string.replace(region,random.choice(regions))
    if debug: print(content_string)
    for food in other_food:
        content_string = content_string.replace(food,random.choice(rwandan_food))
    if debug: print(content_string)
    for island in other_islands:
        content_string = content_string.replace(island,random.choice(islands))
    if debug: print(content_string)
    for park in other_parks:
        content_string = content_string.replace(park,random.choice(parks_and_forests))
    if debug: print(content_string)
    for demonym in other_demonyms:
        content_string = content_string.replace(demonym,"Rwandan")
    if debug: print(content_string)
    for city in other_cities:
        content_string = content_string.replace(city,random.choice(cities))
    if debug: print(content_string)
    for country in other_countries:
        content_string = content_string.replace(country,"Rwanda")    
    if debug: print(content_string)
    return content_string

In [97]:
substituted_content_string = substitute_text(content_string)
with open("5000_travel_sentences_data_substituted.txt","w") as f:
    # for line in substituted_data_travel_content:
    f.write(substituted_content_string)

In [94]:
#Test
substitute_text("What are the hotels close to Belfast International Airport?", True)

What are the hotels close to Bugesera International Airport?
What are the hotels close to Bugesera International Airport?
What are the hotels close to Bugesera International Airport?
What are the hotels close to Bugesera International Airport?
What are the hotels close to Bugesera International Airport?
What are the hotels close to Bugesera International Airport?
What are the hotels close to Bugesera International Airport?
What are the hotels close to Bugesera International Airport?
What are the hotels close to Bugesera International Airport?


'What are the hotels close to Bugesera International Airport?'