In [20]:
from WikidataTreeBuilderSPARQL import WikidataTreeQuery
import pandas as pd
import simplejson as json
from datetime import datetime
import re

In [21]:
tree._debug = True

In [22]:
tree = WikidataTreeQuery(lookup_claims=["P297", "P31", "P279"])

In [23]:
%%time
# Run the tree query starting from root node Q6256 and keep the result in `flare`.
# NOTE(review): Q6256 is presumably the Wikidata "country" item (the visited_nodes
# column shown further down starts every path with "country") - confirm.
# The cell is timed because the recorded run took several seconds of wall time.
flare = tree.from_root("Q6256")

CPU times: user 552 ms, sys: 20 ms, total: 572 ms
Wall time: 6.54 s


In [24]:
# Save the raw tree as indented JSON.
# json.dump writes text (str), so the file must be opened in text mode: the
# binary "wb+" mode makes the write fail with a TypeError on Python 3.
# With the default ensure_ascii=True the output is pure ASCII, so no explicit
# encoding is needed.
with open("flareCountries.json", "w") as f:
    json.dump(flare, f, indent=4)

In [25]:
# Pass the tree through add_labels and keep the returned value under the same name.
# NOTE(review): presumably this attaches human-readable labels to each node - confirm.
# NOTE(review): because `flare` is rebound to its own transformation, re-running this
# cell feeds the already-processed tree back into add_labels; verify that is harmless.
flare = tree.add_labels(flare)

In [27]:
# Save the tree returned by add_labels as indented JSON.
# Text mode ("w") is required because json.dump writes str; the binary "wb+"
# mode raises a TypeError on Python 3.
with open("flareCountriesNamed.json", "w") as f:
    json.dump(flare, f, indent=4)

In [28]:
# Get the query results as a DataFrame.
# NOTE(review): get_pretty_DF takes no arguments, so it presumably reads state that
# the earlier from_root / add_labels calls left on `tree` - confirm.
df = tree.get_pretty_DF()

In [33]:
# Row count of the full result table. Only the last expression of a cell is
# rendered, so a bare `df` line ahead of it would be a no-op.
len(df)

2887

In [34]:
# Keep only the countries that have an ISO 3166-1 code. This excludes ancient countries.
iso_codes = df['P297_ISO_3166_1_alpha_2_code']
df_isoonly = df[iso_codes != ""]
len(df_isoonly)

216

In [36]:
# Display the ISO-filtered table for visual inspection.
df_isoonly

Unnamed: 0,altLabel_en,altLabel_fr,description_en,description_fr,entity,label_en,label_fr,P279_subclass_of,P297_ISO_3166_1_alpha_2_code,P31_instance_of,visited_nodes
658,"(SFRY, SFR Yugoslavia)","(Yougoslavie de Tito, RFS Yougoslavie, Républi...",socialist republic in Southeast Europe between...,ancien État,Q83286,Socialist Federal Republic of Yugoslavia,République fédérative socialiste de Yougoslavie,,YU,"(former country, sovereign state)","((country, former country),)"
1395,"(🇵🇸, Palestine, Palestinian state, PA, State o...","(Palestine, État palestinien)",country in the Middle East,État des Palestiniens,Q219060,State of Palestine,État de Palestine,,PS,"(country, state with limited recognition, state)","((country, state with limited recognition), (c..."
2551,"(Republic of Chile, 🇨🇱, República de Chile, Re...",,republic in South America,pays d'Amérique du Sud,Q298,Chile,Chili,,CL,"(sovereign state, member state of the United N...","((country,),)"
2552,"(🇺🇿, Republic of Uzbekistan, uz)",,republic in Central Asia,pays d'Asie centrale,Q265,Uzbekistan,Ouzbékistan,,UZ,"(sovereign state, member state of the United N...","((country,), (country, landlocked country))"
2553,"(SG, Republic of Singapore, sg, 🇸🇬)",,republic in Southeast Asia,ville-État d'Asie,Q334,Singapore,Singapour,,SG,"(port city, island nation, member state of the...","((country, island nation), (country,))"
2555,"(li, Principality of Liechtenstein, FL, LI, 🇱🇮)",,principality in Central Europe,principauté d'Europe,Q347,Liechtenstein,Liechtenstein,,LI,"(principality, unitary state, state in the Con...","((country, landlocked country), (country,))"
2617,"(🇿🇦, Republic of South Africa, za, SA, RSA)",,republic in Southern Africa,pays d'Afrique,Q258,South Africa,Afrique du Sud,,ZA,"(sovereign state, member state of the United N...","((country,),)"
2632,"(ALG, People’s Democratic Republic of Algeria,...","(dz, République algérienne démocratique et pop...",country in North Africa,pays d'Afrique du Nord,Q262,Algeria,Algérie,,DZ,"(sovereign state, member state of the United N...","((country,),)"
2710,"(🇧🇭, Kingdom of Bahrain, bh)",Royaume de Bahreïn,constitutional monarchy in Southwest Asia,pays du Moyen-Orient,Q398,Bahrain,Bahreïn,,BH,"(sovereign state, member state of the United N...","((country,), (country, island nation))"
2758,"(British North America, ca, Dominion of Canada...",CA,country in North America,pays d'Amérique du Nord,Q16,Canada,Canada,,CA,"(sovereign state, member state of the United N...","((country,),)"


In [42]:
# Lookup table filled by the following cell: country name or alias -> ISO 3166-1 alpha-2 code.
output = {}

In [44]:
def _iter_labels(value):
    """Yield the non-empty strings held in a label cell (a lone string or a sequence of strings)."""
    if isinstance(value, str):
        # A lone alias arrives as a bare string. It must be wrapped in a real
        # one-element tuple - `(value)` is just `value` - otherwise the loop
        # below would walk it character by character and register every single
        # character as a country name.
        value = (value,)
    elif not isinstance(value, (tuple, list, set, frozenset)):
        # Missing values (None, NaN) carry no labels.
        return
    for label in value:
        # Skip empty strings so that "" never becomes a lookup key.
        if isinstance(label, str) and label:
            yield label


# Register every English/French label and alias of each country under its ISO code.
for _, row in df_isoonly.iterrows():
    code_iso = row["P297_ISO_3166_1_alpha_2_code"]

    # Assignment order matters: on a name collision the later column (and the
    # later row) overwrites the earlier one.
    for column in ("label_en", "label_fr", "altLabel_en", "altLabel_fr"):
        for label in _iter_labels(row[column]):
            output[label] = code_iso

In [46]:
# Persist the name -> ISO code lookup table that country2iso reads back.
# Text mode ("w") is required because json.dump writes str; the binary "wb+"
# mode raises a TypeError on Python 3. With the default ensure_ascii=True the
# accented and emoji aliases are escaped, so the file content is pure ASCII.
with open("ISO-3166-1.json", "w") as f:
    json.dump(output, f)

In [155]:
from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance as ndld
from nltk import ngrams

def bigram_ndld(x, y):
    """Return the lowest normalized Damerau-Levenshtein distance between ``x`` and any word n-gram of ``y``.

    ``x`` is split on whitespace into n words; ``y`` is cut into every run of n
    consecutive words and each run, re-joined with single spaces, is compared
    to ``x``. Despite the function's name, n is the word count of ``x`` and is
    not fixed to 2.

    Parameters
    ----------
    x : str
        The query string.
    y : str
        The reference string in which a match is searched.

    Returns
    -------
    float
        The smallest distance found, or 1.0 when there is nothing to compare:
        ``x`` contains no words, or ``y`` yields no n-gram of the required size.
    """
    n = len(x.split())
    if n == 0:
        # An empty or whitespace-only query has no words to match; asking for
        # 0-grams is meaningless, so report the worst possible distance.
        return 1.0

    candidates = [" ".join(gram) for gram in ngrams(y.split(), n)]
    if not candidates:
        return 1.0

    # Only the best match is needed, so take the minimum rather than sorting.
    return min(ndld(candidate, x) for candidate in candidates)
    

def country2iso(x, isonames=None):
    """Return the ISO 3166-1 two-letter code of the country whose name or alias best matches ``x``.

    The input is compared (case-insensitively, via ``bigram_ndld``) to every
    key of the name -> code mapping. The key with the lowest distance wins;
    ties are broken in favour of the key whose length relative to the query
    is smallest.

    The ISO-3166-1.json file is based on Wikidata results. E.g., by running the
    WikidataTreeBuilderSPARQL from_root function from node Q6256, asking for
    property P297 (ISO-3166), and getting the names and aliases from the
    resulting dataframe.

    Parameters
    ----------
    x : str
        Country name or alias to look up. Surrounding whitespace is ignored.
    isonames : dict, optional
        Preloaded mapping of name/alias -> ISO code. When omitted, the mapping
        is read from "ISO-3166-1.json" in the working directory on every call;
        pass it explicitly to avoid re-reading the file in bulk lookups.

    Returns
    -------
    str or None
        The code of the closest entry, or None when ``x`` is not a string, is
        empty/whitespace-only, or the mapping is empty.

    Example:

    > country2iso("France")
    > 'FR'
    > country2iso("Deutschland")
    > 'DE'
    > country2iso("L'hexagone")
    > 'FR'
    > country2iso("Land of thousand lakes")
    > 'FI'
    > country2iso("Nihon-koku")
    > 'JP'
    > country2iso("Land of the rising sun")
    > 'JP'
    """
    if not isinstance(x, str):
        # Missing values (None, NaN, ...) cannot be matched.
        return None

    q = x.strip().lower()
    if not q:
        return None

    if isonames is None:
        # Read-only access is enough; "rb+" would also demand write permission.
        with open("ISO-3166-1.json", "rb") as f:
            isonames = json.load(f)

    if not isonames:
        return None

    def rank(key):
        # Primary criterion: distance to the query. Tie-breaker: prefer the
        # key that is shortest relative to the query.
        return (bigram_ndld(q, key.lower()), len(key) / len(q))

    return isonames[min(isonames, key=rank)]


In [167]:
# Sanity check: a non-English country name should still resolve to its ISO code.
country2iso("Deutschland")

'DE'

In [158]:
# Export the ISO-filtered table to an Excel workbook in the working directory.
df_isoonly.to_excel("tableCountries.xlsx")

In [31]:
# Log the wall-clock time at which the notebook finished running.
end_time = datetime.now()
print("Program end at " + str(end_time))

Program end at 2017-09-27 11:13:39.045709
