In [1]:
import wptools
from SPARQLWrapper import SPARQLWrapper2, SPARQLWrapper, JSON
import unicodedata
import pandas as pd
import numpy as np
import json
import requests
import time

In [2]:
def get_insee(commune):
    """
    returns the insee code of the commune
    """
    commune_ = unicodedata.normalize('NFD', commune.lower()).encode('ascii', 'ignore').decode("utf-8")
    url = 'https://geo.api.gouv.fr/communes?nom={c}&fields=nom,code&format=json&geometry=centre'\
              .format(c=commune_)
    exists = len(json.loads(requests.get(url).text))
    if exists>0 :
        codes = json.loads(requests.get(url).text)
        result = [code["code"] for code in codes \
                if  unicodedata.normalize('NFD', code["nom"].lower()).encode('ascii', 'ignore').decode("utf-8") == commune_]
        if len(result)>0 :
            return result
        
        return -1
    return -1

In [11]:
def dbPedia_get_wplabel(insee):
    d = SPARQLWrapper2("http://fr.dbpedia.org/sparql")

    d.setQuery(
    """
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>

    SELECT distinct ?label
    WHERE
    {
      ?ville rdfs:label ?label.
      ?ville dbo:country  <http://fr.dbpedia.org/resource/France>.
      ?ville <http://fr.dbpedia.org/property/nom> ?insee.
      filter(regex(?insee,\"^""" +insee+ """\", "i") && lang(?label) = 'fr')
      
    }
    """
    ) 
    communes = set()
    res = d.query().bindings
    for result in res:
        communes.add(result["label"].value)
        
    return communes

#### Getting the wikipedia label for each commune

In [12]:
# labels = {}
# for commune in communes:
#     insee = get_insee(commune)
#     if insee!=-1:
#         result = set()
#         for insee_ in insee:
#             label = dbPedia_get_wplabel(insee_)
#             result = result.union(label)
        
#         if(len(result)>0):
#             labels[commune] = list(result)
#         print(commune, result)
#         print("________________________________________________")
        
# with open("Data/wk_labels2.json", "w") as fp:
#     json.dump(labels,fp)

#from file
json_file = open("Data/wk_labels2.json")
labels = json.load(json_file)

### for each commune label, get the infobox of wikipedia page and extract infobox["gentilé"] 

In [87]:
gentiles = {}
for label in labels :
    wks = labels[label]
    result = set()
    for wk in wks :
        try:
            page = wptools.page(wk, lang='fr')
            parse = page.get_parse()
            infobox = parse.data["infobox"]
            if infobox is not None and "gentilé" in infobox:
                g = infobox["gentilé"]
                result.add(g)
        except:
            print("page does not exist")
            
        if len(result)>0:
            gentiles[label] = list(result)
        print(wks, result)
        
with open("Data/infobox_extracted_demonyms___.json", "w") as fp:
    json.dump(gentiles,fp)

In [64]:
page = wptools.page(wikibase='Q21979655', lang='fr').get()
page.data["infobox"]

www.wikidata.org (wikidata) Q21979655
www.wikidata.org (labels) Q1007427|P910|P242|P17|Q1007523|P1082|Q...
fr.wikipedia.org (query) Val_de_Virvée
fr.wikipedia.org (parse) 9646962
fr.wikipedia.org (restbase) /page/summary/Val de Virvée
fr.wikipedia.org (imageinfo) File:Aubie-et-Espessas Mairie.JPG
Val de Virvée (fr) data
{
  assessments: <dict(1)> Communes de France
  claims: <dict(25)> P17, P31, P571, P374, P2046, P1365, P625, P13...
  description: commune française du département de la Gironde
  exhtml: <str(395)> <p><b>Val de Virvée</b> est, depuis le <time ...
  exrest: <str(294)> Val de Virvée est, depuis le 1er janvier 2016...
  extext: <str(313)> **Val de Virvée** est, depuis le 1er janvier ...
  extract: <str(1641)> <p class="mw-empty-elt"></p><p><b>Val de Vi...
  image: <list(6)> {'file': 'File:Aubie-et-Espessas Mairie.JPG', '...
  infobox: <dict(25)> nom, image, légende, région, département, ar...
  iwlinks: <list(1)> https://fr.wiktionary.org/wiki/Val_de_Virv%C3%A9e
  label: 

{'nom': 'Val de Virvée',
 'image': 'Aubie-et-Espessas Mairie.JPG',
 'légende': 'La mairie.',
 'région': '[[Nouvelle-Aquitaine]]',
 'département': '[[Gironde (département)|Gironde]]',
 'arrondissement': '[[Arrondissement de Blaye|Blaye]]',
 'canton': '[[Canton du Nord-Gironde]]',
 'circonscription législative': '[[Onzième circonscription de la Gironde|Onzième circonscription]]',
 'insee': '33018',
 'cp': '33240',
 'maire': 'Christophe Martial',
 'mandat maire': '[[Élections municipales de 2020 en Gironde|2020]]-2026',
 'intercomm': '[[Communauté de communes du Grand Cubzaguais]]',
 'latitude': '45.02',
 'longitude': '-0.405555555556',
 'alt mini': '13',
 'alt maxi': '64',
 'superficie': '20.77',
 'type': 'Commune rurale',
 'unité urbaine': '[[Unité urbaine de Bordeaux|Bordeaux]] <br><small>([[banlieue]])</small>',
 "aire d'attraction": "[[Aire d'attraction de Bordeaux|Bordeaux]] <br><small>(commune de la couronne)</small>",
 'population': '{{Population de France/dernière_pop}}',
 'année

In [151]:
def wikidata_get_wplabel(name):
    d = SPARQLWrapper("https://query.wikidata.org/sparql", agent="hubeauBot/1.0 (https://github.com/BRGM/hubeau)")
    d.setQuery(
    """
    SELECT distinct ?ville
    WHERE
    { 
      ?ville rdfs:label ?label.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr" }.
      ?ville wdt:P17 wd:Q142.
      ?ville wdt:P1448 ?insee.     
      filter(regex(?insee,"^"""+name+"""$", "i")).      
      
    }
    """
    ) 
    communes = set()
    d.setReturnFormat(JSON)
    try:
        res = d.query().convert()
        print(res)
        for result in res["results"]["bindings"]:
            communes.add(result["ville"]["value"].split("/")[-1])
        return list(communes)
    
    except KeyboardInterrupt:
        print("interrupted")
        res = d.query().convert()
        print(res)
        for result in res["results"]["bindings"]:
            communes.add(result["ville"]["value"].split("/")[-1])
        return list(communes)
    
    except:
        print("error")
        time.sleep(120)
        res = d.query().convert()
        print(res)
        for result in res["results"]["bindings"]:
            communes.add(result["ville"]["value"].split("/")[-1])
        return list(communes)
                
    

In [248]:
wikidata_get_wplabel("île-de-batz")

{'head': {'vars': ['ville']}, 'results': {'bindings': [{'ville': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q421927'}}]}}


['Q421927']

In [210]:
df=pd.read_csv("Data/commune2021.csv")
communes_ = np.array(df["LIBELLE"])
communes, counts = np.unique(communes_, return_counts=True)
exists = json.load(open("Data/wikidata_ids1.json"))
exists2 = json.load(open("Data/wikidata_ids2.json"))
exists3 = json.load(open("Data/wikidata_ids3.json"))
exists4 = json.load(open("Data/wikidata_ids4.json"))
exists5 = json.load(open("Data/wikidata_ids5.json"))
exists6 = json.load(open("Data/wikidata_ids6.json"))
exists7 = json.load(open("Data/wikidata_ids7.json"))
exists8 = json.load(open("Data/wikidata_ids8.json"))
exists9 = json.load(open("Data/wikidata_ids9.json"))
exists10 = json.load(open("Data/wikidata_ids10.json"))
exists11 = json.load(open("Data/wikidata_ids11.json"))
exists12 = json.load(open("Data/wikidata_ids12.json"))
exists13 = json.load(open("Data/wikidata_ids13.json"))
exists14 = json.load(open("Data/wikidata_ids14.json"))
exists15 = json.load(open("Data/wikidata_ids15.json"))
exists16 = json.load(open("Data/wikidata_ids16.json"))
exists17 = json.load(open("Data/wikidata_ids17.json"))

In [211]:
def cond(commune):
    if commune not in exists and commune not in exists2 and commune not in exists3 and commune not in exists4 \
    and commune not in exists5 and commune not in exists6 and commune not in exists7 and commune not in exists8 \
    and commune not in exists9 and commune not in exists10 and commune not in exists11 and commune not in exists12\
    and commune not in exists13 and commune not in exists14 and commune not in exists15 and commune not in exists16\
    and commune not in exists17:
        return True
    return False


In [220]:
import re
communes2 = []
for commune in communes:
    if cond(commune):
        print(commune)
        c = re.sub("\s\(.*\)", "", commune)
        c = re.sub("\s[0-9]+(e|er) Arrondissement", "", c)
        print(c)
        communes2.append(c)
        print("________________________________________________")

communes2 = list(set(communes2))
print(communes2)

Allemagne-en-Provence
Allemagne-en-Provence
________________________________________________
Bors (Canton de Charente-Sud)
Bors
________________________________________________
Bors (Canton de Tude-et-Lavalette)
Bors
________________________________________________
Castillon (Canton d'Arthez-de-Béarn)
Castillon
________________________________________________
Castillon (Canton de Lembeye)
Castillon
________________________________________________
Château-Chinon (Campagne)
Château-Chinon
________________________________________________
Château-Chinon (Ville)
Château-Chinon
________________________________________________
Hattonchâtel
Hattonchâtel
________________________________________________
Hautecourt-lès-Broville
Hautecourt-lès-Broville
________________________________________________
Longefoy
Longefoy
________________________________________________
Lyon 1er Arrondissement
Lyon
________________________________________________
Lyon 2e Arrondissement
Lyon
___________________________

In [221]:
wikidata_ids = {}
k = 0
for commune in communes2:
    if cond(commune):
        k=k+1
        ids = wikidata_get_wplabel(commune)      
        if(len(ids)>0):
            wikidata_ids[commune] = ids
        print(commune, ids)
        print("________________________________________________")
        
print(k)   

{'head': {'vars': ['ville']}, 'results': {'bindings': []}}
Prangey []
________________________________________________
{'head': {'vars': ['ville']}, 'results': {'bindings': []}}
Marbotte []
________________________________________________
{'head': {'vars': ['ville']}, 'results': {'bindings': []}}
Château-Chinon []
________________________________________________
{'head': {'vars': ['ville']}, 'results': {'bindings': []}}
Hautecourt-lès-Broville []
________________________________________________
{'head': {'vars': ['ville']}, 'results': {'bindings': []}}
Hattonchâtel []
________________________________________________
{'head': {'vars': ['ville']}, 'results': {'bindings': []}}
Longefoy []
________________________________________________
{'head': {'vars': ['ville']}, 'results': {'bindings': []}}
Moulins-en-Bessin []
________________________________________________
{'head': {'vars': ['ville']}, 'results': {'bindings': []}}
Tessens []
________________________________________________
{'head':

In [222]:
wikidata_ids

{}

In [201]:
with open("Data/wikidata_ids17.json", "w") as fp:
    json.dump(wikidata_ids,fp)

In [None]:
ids = ....
gentiles = {}
for label, ids in wikidata_ids.items() :
    result = set()
    for id in ids :
        try:
            page = wptools.page(wikibase=id, lang='fr').get()
            infobox = page.data["infobox"]
            if infobox is not None and "gentilé" in infobox:
                g = infobox["gentilé"]
                result.add(g)
        except:
            print("page does not exist")
          
    if len(result)>0:
        gentiles[label] = list(result)
        
    print(label, result)
        
with open("Data/wikidata_infobox_extracted_demonyms.json", "w") as fp:
    json.dump(gentiles,fp)

### Preprocess extracted text

In [232]:
exists = json.load(open("Data/wikidata_infobox_extracted_demonyms1.json"))
exists2 = json.load(open("Data/wikidata_infobox_extracted_demonyms2.json"))
exists3 = json.load(open("Data/wikidata_infobox_extracted_demonyms3.json"))
exists4 = json.load(open("Data/wikidata_infobox_extracted_demonyms4.json"))

In [225]:
len(exists)+ len(exists2)+len(exists3)+len(exists4)

23569

In [226]:
l = [exists, exists2 , exists3 , exists4]

In [227]:
gentiles = union = {k:v for d in l for k,v in d.items() }

In [229]:
with open("Data/wikidata_infobox_extracted_demonyms.json", "w") as fp:
    json.dump(gentiles,fp)

In [230]:
gentiles = json.load(open("Data/wikidata_infobox_extracted_demonyms.json"))

In [231]:
len(gentiles)

23569

In [245]:
import re

gentiles_ = {}
for com in gentiles:
    list_g = gentiles[com]
    result = set()
    print(com, list_g)
    
    for g in list_g:
        g1 = re.sub("<.*>", ",", g)
        g2 = re.sub("\sou\s|\set\s|;|/", ",", g1)
        g3 = re.sub("\(.*\)", "", g2)
        split_g = re.split("\s?,,?\s?", g3)
        print(split_g)
        for split_g_ in split_g:  
            result.add(split_g_)
    print("________________________________________________________________")
    result = list(result)
    gentiles_[com] = result   

Aast ['Aastais, Aastaises']
['Aastais', 'Aastaises']
________________________________________________________________
Abainville ['Abainvillois, Abainvilloises']
['Abainvillois', 'Abainvilloises']
________________________________________________________________
Abancourt ['Abancourtois, Abancourtoises']
['Abancourtois', 'Abancourtoises']
________________________________________________________________
Abaucourt ['Abaucourtois, Abaucourtoises']
['Abaucourtois', 'Abaucourtoises']
________________________________________________________________
Abaucourt-Hautecourt ['Abaucourtois, Abaucourtoises']
['Abaucourtois', 'Abaucourtoises']
________________________________________________________________
Abbans-Dessous ['Abbanais, Abbanaises']
['Abbanais', 'Abbanaises']
________________________________________________________________
Abbans-Dessus ['Abbanais, Abbanaises']
['Abbanais', 'Abbanaises']
________________________________________________________________
Abbaretz ['Abbarois, Abbaroises']
[

Autremencourt ['Autremencourtois']
['Autremencourtois']
________________________________________________________________
Autreppes ['Autreppois']
['Autreppois']
________________________________________________________________
Autretot ['Autretotais']
['Autretotais']
________________________________________________________________
Autreville ['Alteravillois, Alteravilloises', 'Autrevillois']
['Alteravillois', 'Alteravilloises']
['Autrevillois']
________________________________________________________________
Autreville-sur-Moselle ['Autrevillois']
['Autrevillois']
________________________________________________________________
Autreville-sur-la-Renne ['Autrevillois - Autrevilloises']
['Autrevillois - Autrevilloises']
________________________________________________________________
Autrey ['Altériciens, Altériciennes']
['Altériciens', 'Altériciennes']
________________________________________________________________
Autrey-lès-Gray ['Attuariens']
['Attuariens']
__________________________

['Blausascois']
________________________________________________________________
Blauvac ['Blauvacois, Blauvacoises']
['Blauvacois', 'Blauvacoises']
________________________________________________________________
Blay ['Blaviens']
['Blaviens']
________________________________________________________________
Blaymont ['Blaymontais']
['Blaymontais']
________________________________________________________________
Blaziert ['Blaziertois, Blaziertoise']
['Blaziertois', 'Blaziertoise']
________________________________________________________________
Blendecques ['Blendecquois']
['Blendecquois']
________________________________________________________________
Blennes ['Blennois']
['Blennois']
________________________________________________________________
Blesle ['Bleslois(es)']
['Bleslois']
________________________________________________________________
Blesme ['Blesmois, Blesmoises']
['Blesmois', 'Blesmoises']
________________________________________________________________
Blesmes ['Bl

Cabris ['Cabriencs']
['Cabriencs']
________________________________________________________________
Cabrières ['Cabriérois', 'Cabrièrois, Cabrièroises']
['Cabriérois']
['Cabrièrois', 'Cabrièroises']
________________________________________________________________
Cabrières-d'Aigues ['Cabriérains, Cabriéraines']
['Cabriérains', 'Cabriéraines']
________________________________________________________________
Cabrières-d'Avignon ['Cabriérois, Cabriéroises']
['Cabriérois', 'Cabriéroises']
________________________________________________________________
Cabriès ['Cabries(s)iens']
['Cabriesiens']
________________________________________________________________
Cachan ['Cachanais']
['Cachanais']
________________________________________________________________
Cachy ['Cachysiens']
['Cachysiens']
________________________________________________________________
Cadalen ['Cadalénois']
['Cadalénois']
________________________________________________________________
Cadarcet ['Cadarcetois']
['Cadarc

Châtelais ['Châtelaisiens']
['Châtelaisiens']
________________________________________________________________
Châtelaudren ['Châtelaudrinais']
['Châtelaudrinais']
________________________________________________________________
Châtelblanc ['Castelblanciens, Castelblanciennes']
['Castelblanciens', 'Castelblanciennes']
________________________________________________________________
Châtelneuf ['Castelnevins']
['Castelnevins']
________________________________________________________________
Châtelperron ['Châtelperronais, Châtelperronaises']
['Châtelperronais', 'Châtelperronaises']
________________________________________________________________
Châtelus ['Châteludaires', 'Chatellussiens']
['Châteludaires']
['Chatellussiens']
________________________________________________________________
Châtelus-Malvaleix ['Castelluciens, Castelluciennes']
['Castelluciens', 'Castelluciennes']
________________________________________________________________
Châtelus-le-Marcheix ['Castelmarchois']
['C

Céaux ['Céaussiens']
['Céaussiens']
________________________________________________________________
Cébazan ['Cébazanais, Cébazanaises']
['Cébazanais', 'Cébazanaises']
________________________________________________________________
Cébazat ['Cébazaires']
['Cébazaires']
________________________________________________________________
Cénac ['Cénacais']
['Cénacais']
________________________________________________________________
Cépet ['Cépetois, Cépetoises']
['Cépetois', 'Cépetoises']
________________________________________________________________
Cépie ['Cépinois']
['Cépinois']
________________________________________________________________
Cérans-Foulletourte ['Céranais-Foulletourtois']
['Céranais-Foulletourtois']
________________________________________________________________
Cérences ['Cérençais']
['Cérençais']
________________________________________________________________
Céreste ['Cérestain']
['Cérestain']
________________________________________________________________
Cé

Faverolles-en-Berry ['Faverollais']
['Faverollais']
________________________________________________________________
Faverolles-et-Coëmy ['Faverollais, Faverollaises']
['Faverollais', 'Faverollaises']
________________________________________________________________
Faverolles-la-Campagne ['Faverollais']
['Faverollais']
________________________________________________________________
Faverolles-sur-Cher ['Faverollais']
['Faverollais']
________________________________________________________________
Favières ['Favièrois', 'Fabériens', 'Faviérois']
['Favièrois']
['Fabériens']
['Faviérois']
________________________________________________________________
Favresse ['Favressois, Favressoises']
['Favressois', 'Favressoises']
________________________________________________________________
Fay ['Fayard', 'Fayens']
['Fayard']
['Fayens']
________________________________________________________________
Fay-aux-Loges ['Fayciens']
['Fayciens']
_______________________________________________________

________________________________________________________________
Grosley-sur-Risle ['Grosleyen']
['Grosleyen']
________________________________________________________________
Grosmagny ['Grosmagniens']
['Grosmagniens']
________________________________________________________________
Grosne ['Grosnois', 'Charnaysiens , Charnayoux , Chornoutis , Chournayons']
['Grosnois']
['Charnaysiens', 'Charnayoux', 'Chornoutis', 'Chournayons']
________________________________________________________________
Grospierres ['Grospierrois, Grospierroises']
['Grospierrois', 'Grospierroises']
________________________________________________________________
Grosrouvre ['Grosrouvrois']
['Grosrouvrois']
________________________________________________________________
Grossa ['Grossitains']
['Grossitains']
________________________________________________________________
Grosseto-Prugna ['Grossétais']
['Grossétais']
________________________________________________________________
Grossouvre ['Grossouvrois']
['G

________________________________________________________________
La Capelle-lès-Boulogne ['Capellois']
['Capellois']
________________________________________________________________
La Cassagne ['Cassagnais']
['Cassagnais']
________________________________________________________________
La Cassaigne ['Lacassaignois, Lacassaignoises']
['Lacassaignois', 'Lacassaignoises']
________________________________________________________________
La Caunette ['Caunettois']
['Caunettois']
________________________________________________________________
La Cavalerie ['Cavalerien(ne)']
['Cavalerien']
________________________________________________________________
La Celle ['Cellois, Celloises']
['Cellois', 'Celloises']
________________________________________________________________
La Celle-Guenand ['Cellois-guénandais']
['Cellois-guénandais']
________________________________________________________________
La Celle-Saint-Avant ['Cellois']
['Cellois']
_______________________________________________

Le Cergne ['Cergnerots, Cergnerottes']
['Cergnerots', 'Cergnerottes']
________________________________________________________________
Le Chaffal ['Chaffalois, Chaffaloises']
['Chaffalois', 'Chaffaloises']
________________________________________________________________
Le Chaffaut-Saint-Jurson ['Chaffaudiers']
['Chaffaudiers']
________________________________________________________________
Le Chalon ['Chalonnais, Chalonnaises']
['Chalonnais', 'Chalonnaises']
________________________________________________________________
Le Chambon ['Chambonniers']
['Chambonniers']
________________________________________________________________
Le Chambon-Feugerolles ['Chambonnaires']
['Chambonnaires']
________________________________________________________________
Le Champ-Saint-Père ['Pérois']
['Pérois']
________________________________________________________________
Le Champ-près-Froges ['Champiots']
['Champiots']
________________________________________________________________
Le Change ['Cha

['Longchampains']
________________________________________________________________
Longcochon ['Couchetards']
['Couchetards']
________________________________________________________________
Longeau-Percey ['Aqualonguiens, Aqualonguiennes']
['Aqualonguiens', 'Aqualonguiennes']
________________________________________________________________
Longeault ['Longaquéens']
['Longaquéens']
________________________________________________________________
Longeaux ['Longovicien']
['Longovicien']
________________________________________________________________
Longeville ['Longevillois, Longevilloises']
['Longevillois', 'Longevilloises']
________________________________________________________________
Longeville-en-Barrois ['longevillois, Longevilloises']
['longevillois', 'Longevilloises']
________________________________________________________________
Longeville-lès-Metz ['Longevillois']
['Longevillois']
________________________________________________________________
Longeville-lès-Saint-Avold

Merpins ['Merpinois']
['Merpinois']
________________________________________________________________
Merrey-sur-Arce ['Mérrotins, Mérrotines']
['Mérrotins', 'Mérrotines']
________________________________________________________________
Merris ['Merrisiens, Merrisiennes']
['Merrisiens', 'Merrisiennes']
________________________________________________________________
Merry-sur-Yonne ['Médéricien']
['Médéricien']
________________________________________________________________
Mers-les-Bains ['Mersois, Mersoise']
['Mersois', 'Mersoise']
________________________________________________________________
Mers-sur-Indre ['Mersiens']
['Mersiens']
________________________________________________________________
Merschweiller ['Merschweillois <br> Merschweillerois']
['Merschweillois', 'Merschweillerois']
________________________________________________________________
Merten ['Mertenois, Mertenoises']
['Mertenois', 'Mertenoises']
________________________________________________________________
Me

Mézel ['Mézeliens']
['Mézeliens']
________________________________________________________________
Mézens ['Mézensols']
['Mézensols']
________________________________________________________________
Mézeray ['Mézeréen']
['Mézeréen']
________________________________________________________________
Mézerolles ['Mézerollois']
['Mézerollois']
________________________________________________________________
Mézerville ['Mézervillais']
['Mézervillais']
________________________________________________________________
Mézidon-Canon ['Mézidonnais et Canonais']
['Mézidonnais', 'Canonais']
________________________________________________________________
Mézilles ['Mézillois']
['Mézillois']
________________________________________________________________
Mézin ['Mézinais']
['Mézinais']
________________________________________________________________
Méziré ['Mézirois']
['Mézirois']
________________________________________________________________
Mézières-en-Brenne ['Macériens']
['Macériens']
_____

Passy ['Passycois', 'Passerands']
['Passycois']
['Passerands']
________________________________________________________________
Passy-Grigny ['Passiats-Grigniats, Passiates-Grigniates']
['Passiats-Grigniats', 'Passiates-Grigniates']
________________________________________________________________
Pastricciola ['Pastricciolais']
['Pastricciolais']
________________________________________________________________
Patay ['Patichons']
['Patichons']
________________________________________________________________
Patrimonio ['Patrimoniens']
['Patrimoniens']
________________________________________________________________
Paucourt ['Paucourtois']
['Paucourtois']
________________________________________________________________
Paudy ['Paludiens / Paludiennes']
['Paludiens', 'Paludiennes']
________________________________________________________________
Pauillac ['Pauillacais']
['Pauillacais']
________________________________________________________________
Paule ['Paulois']
['Paulois']
_______

Renazé ['Renazéen']
['Renazéen']
________________________________________________________________
Rencurel ['Rencurelois-e-s']
['Rencurelois-e-s']
________________________________________________________________
Renescure ['Renescurois']
['Renescurois']
________________________________________________________________
Rennepont ['Rennepontais, Rennepontaises']
['Rennepontais', 'Rennepontaises']
________________________________________________________________
Rennes-les-Bains ['Rennois']
['Rennois']
________________________________________________________________
Rennes-sur-Loue ['Renniaux']
['Renniaux']
________________________________________________________________
Renneval ['Rennevalois(es)']
['Rennevalois']
________________________________________________________________
Renneville ['Rennevillois, Rennevilloises', 'Rennevillois']
['Rennevillois', 'Rennevilloises']
['Rennevillois']
________________________________________________________________
Renwez ['Renwézien et Renwézienne']
['

['Saint-Félixéens']
________________________________________________________________
Saint-Félix-de-Sorgues ['Saint-Félicien(ne)']
['Saint-Félicien']
________________________________________________________________
Saint-Félix-de-Tournegat ['Saint-Félixans']
['Saint-Félixans']
________________________________________________________________
Saint-Félix-de-Villadeix ['Saint-Féliciens']
['Saint-Féliciens']
________________________________________________________________
Saint-Félix-de-l'Héras ['Félissois']
['Félissois']
________________________________________________________________
Saint-Gal ['Saint-Galiens']
['Saint-Galiens']
________________________________________________________________
Saint-Galmier ['Baldomériens']
['Baldomériens']
________________________________________________________________
Saint-Gatien-des-Bois ['Saint-Gatiennais']
['Saint-Gatiennais']
________________________________________________________________
Saint-Gaudent ['Saint-Gaudentais']
['Saint-Gaudentais']
__

________________________________________________________________
Saint-Pierre-lès-Elbeuf ['Saint-Pierrais / Pierrotins']
['Saint-Pierrais', 'Pierrotins']
________________________________________________________________
Saint-Pierre-lès-Franqueville ['Saint-Pierrois(es)']
['Saint-Pierrois']
________________________________________________________________
Saint-Pierre-lès-Nemours ['Saint-Pierrois']
['Saint-Pierrois']
________________________________________________________________
Saint-Pierre-sur-Dives ['Pétruvien']
['Pétruvien']
________________________________________________________________
Saint-Pierre-sur-Dropt ['Saint-Pierrois']
['Saint-Pierrois']
________________________________________________________________
Saint-Pierre-sur-Erve ['Saint-Pierrois']
['Saint-Pierrois']
________________________________________________________________
Saint-Pierre-sur-Orthe ['Pétrucien']
['Pétrucien']
________________________________________________________________
Saint-Pierre-Église ['Saint-Pierr

["''Septfontois''"]
________________________________________________________________
Septmoncel ['Septmoncelands']
['Septmoncelands']
________________________________________________________________
Septmoncel les Molunes ['Septmoncelands-Molunois']
['Septmoncelands-Molunois']
________________________________________________________________
Septmonts ['Septmontois ou Septmontais']
['Septmontois', 'Septmontais']
________________________________________________________________
Septvaux ['Septvalien(ne)s']
['Septvaliens']
________________________________________________________________
Septèmes-les-Vallons ['Septémois']
['Septémois']
________________________________________________________________
Sepvigny ['Bacaoués']
['Bacaoués']
________________________________________________________________
Sepx ['Sepxois']
['Sepxois']
________________________________________________________________
Sequedin ['Sequedinois']
['Sequedinois']
_____________________________________________________________

Valpuiseaux ['Valpuisiens']
['Valpuisiens']
________________________________________________________________
Valros ['Valrossiens']
['Valrossiens']
________________________________________________________________
Valroufié ['Valroufiérois']
['Valroufiérois']
________________________________________________________________
Valréas ['Valréassiens']
['Valréassiens']
________________________________________________________________
Vals ['Valsois, Valsoises']
['Valsois', 'Valsoises']
________________________________________________________________
Vals-les-Bains ['Valsois']
['Valsois']
________________________________________________________________
Vals-près-le-Puy ['Valladiers']
['Valladiers']
________________________________________________________________
Valserhône ['Valserhônois et Valserhônoises']
['Valserhônois', 'Valserhônoises']
________________________________________________________________
Valserres ['Valserrois']
['Valserrois']
_________________________________________________

['Zinswillerois', 'Zinswilleroises']
________________________________________________________________
Zittersheim ['Zittersheimois, Zittersheimoises']
['Zittersheimois', 'Zittersheimoises']
________________________________________________________________
Zommange ['Zommmangeois, Zommangeoises']
['Zommmangeois', 'Zommangeoises']
________________________________________________________________
Zonza ['Zonzais, Zonzaises']
['Zonzais', 'Zonzaises']
________________________________________________________________
Zoteux ['Zotelois, Zoteloises']
['Zotelois', 'Zoteloises']
________________________________________________________________
Zouafques ['Zouafquois, Zouafquoises']
['Zouafquois', 'Zouafquoises']
________________________________________________________________
Zoufftgen ['Zoufftgenois, Zoufftgenoises']
['Zoufftgenois', 'Zoufftgenoises']
________________________________________________________________
Zoza ['Zozais, Zozaises']
['Zozais', 'Zozaises']
___________________________________

In [247]:
with open("Data/wikidata_infobox_extracted_demonyms_processed.json", "w") as fp:
    json.dump(gentiles_,fp)

In [26]:
wikidata_ids1 = json.load(open("Data/trash/wikidata_ids17.json"))
wikidata_ids2 = json.load(open("Data/trash/wikidata_ids_(1-16).json"))
wikidata_ids = {k:v for d in [wikidata_ids1, wikidata_ids2] for k,v in d.items() }

In [28]:
wikidata_ids["Précy-sur-Marne"]

['Q1455977']

In [30]:
page = wptools.page('Précy-sur-Marne', lang="fr")
x = page.get_restbase('/page/html/')
y = page.get_parse()
print(y.data["infobox"])

fr.wikipedia.org (restbase) /page/html/Précy-sur-Marne
Précy-sur-Marne (fr) data
{
  html: <str(297289)> <!DOCTYPE html><html prefix="dc: http://purl...
  requests: <list(1)> restbase
}
fr.wikipedia.org (parse) Précy-sur-Marne
fr.wikipedia.org (imageinfo) File:Précy-sur-Marne mairie.jpg


{'nom': 'Précy-sur-Marne', 'image': 'Précy-sur-Marne mairie.jpg', 'légende': 'La mairie.', 'blason': 'Blason Précy-sur-Marne.svg', 'légende blason': '#Héraldique', 'région': '[[Île-de-France]]', 'département': '[[Seine-et-Marne]]<br />([[Melun]])', 'arrondissement': '[[Arrondissement de Meaux|Meaux]]', 'canton': '[[Canton de Claye-Souilly]]', 'circonscription législative': '[[Septième circonscription de Seine-et-Marne|Septième circonscription]]', 'insee': '77376', 'cp': '77410', 'maire': 'Christine Lyna Augry', 'mandat maire': '[[Élections municipales françaises de 2020|2020]]-2026', 'intercomm': '[[communauté de communes Plaines et monts de France]]', 'longitude': '2.7750', 'latitude': '48.9306', 'alt mini': '39', 'alt maxi': '83', 'superficie': '4.81', 'type': 'Commune rurale', "aire d'attraction": 'Paris <br><small>(commune de la couronne)</small>', 'population': '{{Population de France/dernière_pop}}', 'année_pop': '{{Population de France/dernière_année_Infobox}}', 'gentilé': 'Préc

Précy-sur-Marne (fr) data
{
  html: <str(297289)> <!DOCTYPE html><html prefix="dc: http://purl...
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Précy-sur...
  infobox: <dict(27)> nom, image, légende, blason, légende blason,...
  iwlinks: <list(1)> https://fr.wiktionary.org/wiki/Pr%C3%A9cy-sur...
  pageid: 192186
  parsetree: <str(61789)> <root><comment>&lt;!--Sans accent pour u...
  requests: <list(3)> restbase, parse, imageinfo
  title: Précy-sur-Marne
  wikibase: Q1455977
  wikidata_url: https://www.wikidata.org/wiki/Q1455977
  wikitext: <str(41304)> <!--Sans accent pour un tri alphabétique ...
}


In [13]:
def get_wikipedia_url_from_wikidata_id(wikidata_id, lang='fr', debug=False):
    import requests
    from requests import utils

    url = (
        'https://www.wikidata.org/w/api.php'
        '?action=wbgetentities'
        '&props=sitelinks/urls'
        f'&ids={wikidata_id}'
        '&format=json')
    json_response = requests.get(url).json()
    if debug: print(wikidata_id, url, json_response) 

    entities = json_response.get('entities')    
    if entities:
        entity = entities.get(wikidata_id)
        if entity:
            sitelinks = entity.get('sitelinks')
            if sitelinks:
                if lang:
                    # filter only the specified language
                    sitelink = sitelinks.get(f'{lang}wiki')
                    if sitelink:
                        wiki_url = sitelink.get('url')
                        if wiki_url:
                            return requests.utils.unquote(wiki_url)
                else:
                    # return all of the urls
                    wiki_urls = {}
                    for key, sitelink in sitelinks.items():
                        wiki_url = sitelink.get('url')
                        if wiki_url:
                            wiki_urls[key] = requests.utils.unquote(wiki_url)
                    return wiki_urls
    return None  

In [29]:
get_wikipedia_url_from_wikidata_id("Q1455977")

'https://fr.wikipedia.org/wiki/Précy-sur-Marne'

In [31]:
wikidata_ids1 = json.load(open("Data/trash/wikidata_ids17.json"))
wikidata_ids2 = json.load(open("Data/trash/wikidata_ids_(1-16).json"))

In [32]:
wikidata_ids = {k:v for d in [wikidata_ids2, wikidata_ids1] for k,v in d.items() }


In [33]:
wikidata_ids

{'Aast': ['Q198483'],
 'Abainville': ['Q59411'],
 'Abancourt': ['Q1046381', 'Q1098767'],
 'Abaucourt': ['Q269866'],
 'Abaucourt-Hautecourt': ['Q228231'],
 'Abbans-Dessous': ['Q305853'],
 'Abbans-Dessus': ['Q305854'],
 'Abbaretz': ['Q1001990'],
 'Abbecourt': ['Q1346665'],
 'Abbenans': ['Q306217'],
 'Abbeville': ['Q700354', 'Q28520'],
 'Abbeville-Saint-Lucien': ['Q1106555'],
 'Abbécourt': ['Q237418'],
 'Abbéville-la-Rivière': ['Q249366'],
 'Abbéville-lès-Conflans': ['Q1087158'],
 'Abbévillers': ['Q306958'],
 'Abeilhan': ['Q200654'],
 'Abelcourt': ['Q318570'],
 'Abergement-Saint-Jean': ['Q82957453'],
 'Abergement-la-Ronce': ['Q657807'],
 'Abergement-le-Grand': ['Q1067551'],
 'Abergement-le-Petit': ['Q1069018'],
 'Abergement-lès-Thésy': ['Q319615'],
 'Abidos': ['Q856780'],
 'Abilly': ['Q1069923'],
 'Abitain': ['Q844039'],
 'Abjat-sur-Bandiat': ['Q321807'],
 'Ablain-Saint-Nazaire': ['Q318255'],
 'Ablaincourt-Pressoir': ['Q28526'],
 'Ablainzevelle': ['Q288543'],
 'Ablancourt': ['Q762205'],
 