In [12]:
def extract_wikidata_id(url):
    """Return the text after the last ``/`` in ``url``.

    A value containing no ``/`` is returned unchanged. Falsy input
    (``None`` or an empty string) yields ``None``.
    """
    if url:
        _, _, tail = url.rpartition("/")
        return tail
    return None

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

def parse_group_concat(value):
    """Split a ``|``-separated string into a list of its parts.

    ``None`` and the empty string both produce an empty list.
    """
    return [] if value is None or value == "" else value.split("|")

def extract_year(date_string):
    """Extract the year from a date string such as ``'+1887-01-28T00:00:00Z'``.

    A leading ``+`` is ignored. A leading ``-`` marks a negative year, so
    ``'-0500-01-01T00:00:00Z'`` yields ``-500`` (the value is returned as
    written in the string, without any calendar adjustment).

    Args:
        date_string: The date text, or a falsy value.

    Returns:
        The year as an ``int`` (so results can be sorted), or ``None`` when
        the input is falsy or does not start with an integer year.
    """
    if not date_string:
        return None

    text = date_string.strip("+")

    # A leading '-' is a sign, not a field separator: peel it off before
    # splitting, otherwise the first segment would be empty.
    negative = text.startswith("-")
    if negative:
        text = text[1:]

    year_str = text.split("-")[0]

    try:
        year = int(year_str)
    except ValueError:
        return None

    return -year if negative else year


def get_first_and_last_year(dates):
    """Return ``(earliest_year, latest_year)`` for a list of date strings.

    Every truthy entry is passed to ``extract_year``; entries for which it
    returns ``None`` are discarded. ``(None, None)`` is returned when
    ``dates`` is empty or when no entry yields a year.
    """
    if not dates:
        return None, None

    # Convert lazily, then keep only the entries that produced a year.
    parsed = (extract_year(raw) for raw in dates if raw)
    valid_years = [year for year in parsed if year is not None]

    if valid_years:
        # Chronological bounds: smallest and largest year.
        return min(valid_years), max(valid_years)
    return None, None


def get_first_or_none(values):
    """Return the first element of ``values``, or ``None`` if ``values`` is empty or falsy."""
    return values[0] if values else None

def get_first_or_none_list(values):
    """Return a one-element list holding the first item of ``values``.

    When ``values`` is empty or falsy the result is ``[None]``.
    """
    head = values[0] if values else None
    return [head]

def get_monument_data(qid):
    """Fetch a fixed set of properties for one Wikidata item.

    Sends a single SPARQL query to ``https://query.wikidata.org/sparql``
    that collects, for the given item, the values of P571 (bound to
    ``?inception``), P31 (``?instanceOf``), P18 (``?image``), P17
    (``?country``), P131 (``?city``), P625 (``?coords``, split into
    latitude/longitude) and P84 (``?architect``). Multiple values of one
    property are joined with ``|`` by the query and split again here.

    Args:
        qid: Wikidata item identifier, ``"Q"`` followed by ASCII digits
            (for example ``"Q243"``).

    Returns:
        A 9-tuple, in this order:
        ``first_year`` and ``last_year`` (smallest and largest inception
        year as ``int``, or ``None``), ``instance_of_list`` (list of ids),
        ``first_image`` (string or ``None``), ``first_country`` and
        ``first_city`` (each a one-element list, ``[None]`` when absent),
        ``first_lat`` and ``first_lon`` (``float`` or ``None``) and
        ``architect_list`` (list of ids).

    Raises:
        ValueError: if ``qid`` is not a well-formed QID. The identifier is
            interpolated into the query text, so anything else is rejected
            before a request is made.
    """
    if not (
        isinstance(qid, str)
        and qid.startswith("Q")
        and qid[1:].isascii()
        and qid[1:].isdigit()
    ):
        raise ValueError(f"Invalid Wikidata QID: {qid!r}")

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    query = f"""
    SELECT
        (GROUP_CONCAT(DISTINCT STR(?inception); SEPARATOR="|") AS ?inception)
        (GROUP_CONCAT(DISTINCT STR(?instanceOf); SEPARATOR="|") AS ?instanceOf)
        (GROUP_CONCAT(DISTINCT STR(?image); SEPARATOR="|") AS ?image)
        (GROUP_CONCAT(DISTINCT STR(?country); SEPARATOR="|") AS ?country)
        (GROUP_CONCAT(DISTINCT STR(?city); SEPARATOR="|") AS ?city)
        (GROUP_CONCAT(DISTINCT STR(?lat); SEPARATOR="|") AS ?lat)
        (GROUP_CONCAT(DISTINCT STR(?lon); SEPARATOR="|") AS ?lon)
        (GROUP_CONCAT(DISTINCT STR(?architect); SEPARATOR="|") AS ?architect)
    WHERE {{
      VALUES ?item {{ wd:{qid} }}

      OPTIONAL {{ ?item wdt:P571 ?inception. }}
      OPTIONAL {{ ?item wdt:P31 ?instanceOf. }}
      OPTIONAL {{ ?item wdt:P18 ?image. }}
      OPTIONAL {{ ?item wdt:P17 ?country. }}
      OPTIONAL {{ ?item wdt:P131 ?city. }}

      OPTIONAL {{
        ?item wdt:P625 ?coords.
        BIND(geof:latitude(?coords) AS ?lat)
        BIND(geof:longitude(?coords) AS ?lon)
      }}

      OPTIONAL {{ ?item wdt:P84 ?architect. }}
    }}
    GROUP BY ?item
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # An empty result set is treated like a row with no bound variables, so
    # the caller gets the same "nothing found" tuple instead of an IndexError.
    bindings = results["results"]["bindings"]
    row = bindings[0] if bindings else {}

    def values_of(variable):
        """Return the non-empty '|'-separated parts bound to ``variable``."""
        joined = row.get(variable, {}).get("value")
        return [part for part in parse_group_concat(joined) if part]

    def ids_of(variable):
        """Return the trailing id of every entity URL bound to ``variable``."""
        return [extract_wikidata_id(part) for part in values_of(variable)]

    # Inception values are date strings, not entity URLs, so they are passed
    # to the year parser as-is.
    first_year, last_year = get_first_and_last_year(values_of("inception"))

    image_list = values_of("image")

    # NOTE(review): lat and lon are aggregated independently by the query, so
    # for an item with several P625 values the first latitude and the first
    # longitude are not guaranteed to come from the same coordinate - confirm
    # whether that matters for the data set.
    lat_list = values_of("lat")
    lon_list = values_of("lon")

    return (
        first_year,                                  # earliest inception year
        last_year,                                   # latest inception year
        ids_of("instanceOf"),
        get_first_or_none(image_list),
        get_first_or_none_list(ids_of("country")),
        get_first_or_none_list(ids_of("city")),
        float(lat_list[0]) if lat_list else None,    # float
        float(lon_list[0]) if lon_list else None,    # float
        ids_of("architect"),
    )


# Example: Eiffel Tower
first_year, last_year, instance_of, image, countries, cities, lat, lon, architects = get_monument_data("Q243")

# One print call, one line per field; the emitted text is the same as
# printing each label/value pair separately.
print(
    f"Ann√©e inception (d√©but) : {first_year}",
    f"Ann√©e inception (fin) : {last_year}",
    f"Instance of : {instance_of}",
    f"Image : {image}",
    f"Latitude : {lat}",
    f"Longitude : {lon}",
    f"Countries : {countries}",
    f"Cities : {cities}",
    f"Architects : {architects}",
    sep="\n",
)


Année inception (début) : 1887
Année inception (fin) : 1889
Instance of : ['Q2319498', 'Q1440476', 'Q1440300', 'Q570116']
Image : http://commons.wikimedia.org/wiki/Special:FilePath/Tour%20Eiffel%20Wikimedia%20Commons.jpg
Latitude : 48.858296
Longitude : 2.294479
Countries : ['Q142']
Cities : ['Q259463']
Architects : ['Q778243']


In [None]:
import csv

def wikidata_to_xml_ids_or_qid(qid_list, csv_path):
    """Translate Wikidata QIDs to XML ids using a CSV lookup table.

    The file at ``csv_path`` is read with the default ``csv.DictReader``
    dialect and must provide ``wikidata_qid`` and ``xml_id`` columns. A QID
    that is absent from the table is kept as-is in the result. Per the
    original author's note, this variant is meant for persons and places.
    """
    with open(csv_path, newline="", encoding="utf-8") as table:
        # Lookup: wikidata_qid -> xml_id (a later row overrides an earlier one).
        qid_to_xml = {
            record["wikidata_qid"]: record["xml_id"]
            for record in csv.DictReader(table)
        }

    # Not found -> keep the QID itself.
    return [qid_to_xml.get(item, item) for item in qid_list]

import csv

def wikidata_to_xml_ids_or_none(qid_list, csv_path):
    """Return one French label for the given QIDs, or ``None``.

    Per the original author's note, this variant is meant for typologies
    and techniques. The file at ``csv_path`` is read as a ``;``-delimited
    CSV with ``wikidata_qid`` and ``label_fr`` columns.

    Args:
        qid_list: A list of QIDs, or a single QID string. ``None`` is
            treated as an empty list.
        csv_path: Path of the lookup CSV.

    Returns:
        The alphabetically first distinct, non-blank label (surrounding
        whitespace removed) among the rows whose QID is requested, or
        ``None`` when there is none.
    """
    if isinstance(qid_list, str):
        qid_list = [qid_list]

    # Build the membership set once instead of scanning the list per row.
    wanted = set(qid_list or ())

    labels_set = set()

    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f, delimiter=";"):
            if row.get("wikidata_qid") not in wanted:
                continue

            # Strip BEFORE testing for emptiness: a whitespace-only label
            # would otherwise be stored as "" and sort ahead of real labels.
            label = (row.get("label_fr") or "").strip()
            if label:
                labels_set.add(label)

    # A single, deterministic label: the smallest in sort order.
    return min(labels_set) if labels_set else None


# Show the values fetched earlier, one field per line.
print(
    f"Ann√©e inception (d√©but) : {first_year}",
    f"Ann√©e inception (fin) : {last_year}",
    f"Instance of : {instance_of}",
    f"Image : {image}",
    f"Latitude : {lat}",
    f"Longitude : {lon}",
    f"Countries : {countries}",
    f"Cities : {cities}",
    f"Architects : {architects}",
    sep="\n",
)


Année inception (début) : 1887
Année inception (fin) : 1889
Instance of : ['Q2319498', 'Q1440476', 'Q1440300', 'Q570116']
Image : http://commons.wikimedia.org/wiki/Special:FilePath/Tour%20Eiffel%20Wikimedia%20Commons.jpg
Latitude : 48.858296
Longitude : 2.294479
Countries : ['Q142']
Cities : ['Q259463']
Architects : ['Q778243']


# Partie 2 constitution du CSV

In [16]:
import csv

# Module-level placeholder; nothing in the visible code reads it (the
# function below takes its own ``input_csv`` parameter).
input_csv = ""

def monuments_wikidata_to_csv(input_csv, output_csv):
    """Write one line of Wikidata properties per monument listed in a CSV.

    ``input_csv`` is read with the default ``csv.DictReader`` dialect; each
    row's ``url_wikidata`` cell is reduced to an id by
    ``extract_wikidata_id``, and rows where that id is empty are skipped.
    For every remaining id, ``get_monument_data`` is called and its values
    are written after the id to ``output_csv``, a ``;``-delimited file that
    starts with a header line.
    """
    # Header of the output CSV.
    header = [
        "qid",
        "first_year",
        "last_year",
        "instance_of",
        "image",
        "country",
        "city",
        "latitude",
        "longitude",
        "architects",
    ]

    with open(input_csv, newline="", encoding="utf-8") as source:
        with open(output_csv, "w", newline="", encoding="utf-8") as target:
            writer = csv.writer(target, delimiter=";")
            writer.writerow(header)

            for entry in csv.DictReader(source):
                qid = extract_wikidata_id(entry.get("url_wikidata"))
                if qid:
                    # The result tuple is already in header order.
                    writer.writerow([qid, *get_monument_data(qid)])

# Run the export: read ex_wikidata.csv, write ex_wikidata2.csv.
monuments_wikidata_to_csv(input_csv="ex_wikidata.csv", output_csv="ex_wikidata2.csv")

In [20]:
import csv
import ast
from collections import Counter
from SPARQLWrapper import SPARQLWrapper, JSON

def compter_instance_of(csv_path, colonne="instance_of"):
    """Count how often each value appears in a list-valued CSV column.

    The file is read as a ``;``-delimited CSV. Each non-empty cell of
    ``colonne`` is expected to hold the text form of a Python list (for
    example ``"['Q1', 'Q2']"``) and is parsed with ``ast.literal_eval``;
    cells that fail to parse are ignored. Returns a ``Counter`` keyed by
    the list elements.
    """
    tally = Counter()

    with open(csv_path, newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle, delimiter=";"):
            cell = record.get(colonne)
            if cell:
                # Turn the string "['Q1', 'Q2']" back into a real list.
                try:
                    parsed = ast.literal_eval(cell)
                except (ValueError, SyntaxError):
                    parsed = ()  # unparsable cell: contributes nothing

                for item in parsed:
                    tally[item] += 1

    return tally

# Count how often each instance_of value appears in the exported CSV.
counts = compter_instance_of("ex_wikidata2.csv")

# Look up the French label of every counted QID (all of them, not just the
# ten most frequent) and write QID, label and count to a new CSV.

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

with open("instance_of_counts.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=";")
    writer.writerow(["wikidata_qid", "label_fr", "count"])

    for qid, nb in counts.most_common():  # ALL QIDs, most frequent first
        # One request per QID: fetch a single French rdfs:label.
        query = f"""
        SELECT ?label WHERE {{
          wd:{qid} rdfs:label ?label .
          FILTER (lang(?label) = "fr")
        }}
        LIMIT 1
        """

        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)

        # Best effort: any failure (request error, no French label in the
        # result) leaves the label empty rather than aborting the export.
        try:
            results = sparql.query().convert()
            label = results["results"]["bindings"][0]["label"]["value"]
        except Exception:
            label = ""

        writer.writerow([qid, label, nb])


In [25]:
import csv

# Copy instance_of_counts.csv to labels_typology.csv without its "count"
# column, keeping the ";" delimiter.
with open("instance_of_counts.csv", newline="", encoding="utf-8") as f, \
     open("labels_typology.csv", "w", newline="", encoding="utf-8") as o:

    reader = csv.reader(f, delimiter=";")
    writer = csv.writer(o, delimiter=";")

    for row in reader:
        # Drop the third column (index 2). Slicing, unlike ``del row[2]``,
        # does not raise on rows with fewer than three fields (for example
        # a blank line); such rows are copied through unchanged.
        writer.writerow(row[:2] + row[3:])