In [1]:
pip install names-dataset -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.4/58.4 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for names-dataset (setup.py) ... [?25l[?25hdone


In [2]:
import os
from typing import Tuple, List

from names_dataset import NameDataset, NameWrapper

In [None]:
'''
Burundi:            BI
Botswana:           BW
Burkina Faso:       BF
Cameroon:           CM
Djibouti:           DJ
Ghana:              GH
Namibia:            NA
Nigeria:            NG
South Africa:       ZA
'''

'\nBurundi:            BI\nBotswana:           BW\nBurkina Faso:       BF\nCameroon:           CM\nDjibouti:           DJ\nGhana:              GH\nNamibia:            NA\nNigeria:            NG\nSouth Africa:       ZA\n'

In [34]:
# Shared NameDataset instance, created lazily on first use. Building a
# NameDataset is expensive (the names-dataset README warns about load time and
# memory), so it is constructed once instead of once per call.
_NAME_DATASET = None


def _get_name_dataset():
  '''Return the shared NameDataset, constructing it on the first call.'''
  global _NAME_DATASET
  if _NAME_DATASET is None:
    _NAME_DATASET = NameDataset()
  return _NAME_DATASET


def get_country_names(country_code: str, n_names: int = 1000, dataset=None) -> Tuple[List[str], str]:
  '''
  Source the most common local first names for a country from names_dataset.

  Parameters
  ----------
  country_code: str
          alpha-2 code of the country, passed to the dataset as `country_alpha2`
          and used as the key into its result.
  n_names: int, optional
          value passed as `n` to `get_top_names`. The male and female lists are
          concatenated, so up to 2 * n_names names can be returned.
  dataset: optional
          an object exposing `get_top_names(n=..., country_alpha2=...)`. When
          omitted, a single shared NameDataset is created on first use and
          reused by later calls.

  Returns
  -------
  Tuple[List[str], str]
          the names (male list followed by female list) and the country code
          that was passed in.

  Raises
  ------
  KeyError
          if the dataset result has no entry for `country_code`.
  '''
  nd = dataset if dataset is not None else _get_name_dataset()

  top_names = nd.get_top_names(n=n_names, country_alpha2=country_code)
  if country_code not in top_names:
    raise KeyError(
        'names_dataset returned no entry for country code {!r}'.format(country_code)
    )

  by_gender = top_names[country_code]
  # A missing gender list is treated as empty rather than failing the lookup.
  names = list(by_gender.get('M', [])) + list(by_gender.get('F', []))
  return names, country_code




import unicodedata
import string

# Whitelist of characters that survive unicode_to_ascii: the ASCII letters
# plus space, period, comma and semicolon.
all_letters = string.ascii_letters + " .,;"

def unicode_to_ascii(s: str) -> str:
  '''
  Reduce a unicode string to the plain characters listed in `all_letters`.

  The string is NFD-normalised so accented letters split into a base letter
  and combining marks; the marks (category 'Mn') are discarded, as is any
  remaining character that is not in `all_letters`.

  Parameters
  ----------
  s: str
    the name to convert.

  Returns
  -------
  str
    the name with accents removed and non-whitelisted characters dropped.
  '''
  kept = []
  for ch in unicodedata.normalize('NFD', s):
    if unicodedata.category(ch) == 'Mn':
      continue  # combining accent mark
    if ch not in all_letters:
      continue  # digit, apostrophe, non-Latin letter, ...
    kept.append(ch)
  return ''.join(kept)




def write_list_to_file(
    names: List[str],
    country_code: str,
    output_dir: str = '/content/drive/MyDrive/Colab_Notebooks/So_help_me_God/common_countries_name/',
) -> None:
  '''
  Clean a list of names and write them, one per line, to `<country_code>.txt`.

  Each name is passed through unicode_to_ascii() and stripped of surrounding
  whitespace. Names of two characters or fewer are dropped, and duplicates are
  removed while keeping the order in which names first appear, so the file
  content is the same on every run.

  Parameters
  ----------
  names: List[str]
      a list containing strings(names), e.g. the first element returned by
      get_country_names().
  country_code: str
      a short code, representing the country name; used as the file stem.
  output_dir: str, optional
      directory the file is written into. Defaults to the notebook's Google
      Drive folder. The directory is not created; it must already exist.

  Returns
  -------
  None
  '''
  cleaned = (unicode_to_ascii(name).strip() for name in names)

  # dict.fromkeys de-duplicates while preserving first-seen order, unlike
  # set(), whose iteration order for strings can change between runs.
  unique_names = list(dict.fromkeys(name for name in cleaned if len(name) > 2))

  # The `with` block closes the file; no explicit close() is needed.
  out_path = os.path.join(output_dir, country_code + '.txt')
  with open(out_path, 'w', encoding='utf-8') as f:
    f.writelines(name + '\n' for name in unique_names)

In [35]:
def filter_data(data):
  '''
  Make every name unique across the whole list[dict[list]] structure.

  Lists are visited in order; the first occurrence of a name is kept and every
  later occurrence (in the same list or in any later list) is removed.

  Parameters
  ----------
  data:
      a list of dicts whose values are lists of hashable items, e.g.
      [{'NG': [...names...]}, {'GH': [...names...]}].

  Returns
  -------
  the same `data` object; its lists are filtered in place.
  '''
  seen = set()

  for country_dict in data:
    for names in country_dict.values():
      unique = []
      for name in names:
        if name not in seen:
          seen.add(name)
          unique.append(name)
      # Slice-assign so existing references to this list see the filtered
      # content, without removing items from a list while iterating over it
      # (which makes the loop skip the element after each removal).
      names[:] = unique
  return data

In [37]:
# ISO 3166-1 alpha-2 codes of the countries to collect names for.
# Cameroon is 'CM'; the value used previously, 'CA', is Canada's code.
# Cameroon's output is therefore written to CM.txt, and any CA.txt left over
# from an earlier run is stale.
country_code = ['BI', 'BW', 'BF', 'CM', 'DJ', 'GH', 'NA', 'NG', 'ZA']

# Value passed as n_names to get_country_names() for every country.
N_NAMES = 400

# One single-entry {code: names} dict per country, in the order listed above.
list_files_dict = []

for country in country_code:
  names, code = get_country_names(country, N_NAMES)

  # create dict
  files_dict = {code: names}
  list_files_dict.append(files_dict)

# Remove names shared between countries before anything is written.
filtered_data = filter_data(list_files_dict)

for country_data in filtered_data:
  for code, names in country_data.items():
    write_list_to_file(names, code)