Récupération des données sur des êtres vivants

Données que nous récupérons :
- Nom 
- Nom scientifique
- Image
- Famille 
- Lieu géographique

In [None]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

In [None]:
# SPARQL endpoint that the query below is sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# Select up to 1000 distinct rows describing taxa (items that are an instance
# of Q16521) which have an English label, an image (P18), a scientific name
# (P225), a P171 value and a P9714 value. The wikibase:label service fills in
# the English ?familyLabel / ?locationLabel variables.
# NOTE(review): on Wikidata P171 is "parent taxon", so ?family is the direct
# parent and not necessarily a taxon of family rank — confirm this is intended.
# NOTE(review): P9714 is presumably "taxon range" — verify against Wikidata.
# A taxon with several images/parents/locations yields one row per combination.
query = """SELECT DISTINCT ?taxon ?name ?image ?sciName ?familyLabel ?locationLabel
WHERE {
  ?taxon wdt:P31 wd:Q16521;
          rdfs:label ?name;
          wdt:P18 ?image;
          wdt:P225 ?sciName;
          wdt:P171 ?family;
          wdt:P9714 ?location.
  FILTER(lang(?name) = "en")
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT 1000"""


def get_results(endpoint_url, query):
    """Send ``query`` to the SPARQL endpoint and return the converted response.

    The request identifies itself with a ``WDQS-example Python/<major>.<minor>``
    user agent built from the running interpreter's version, and asks for the
    JSON return format.

    Args:
        endpoint_url: URL of the SPARQL endpoint.
        query: SPARQL query text.

    Returns:
        Whatever ``SPARQLWrapper.query().convert()`` produces for the JSON
        return format.
    """
    major_minor = tuple(sys.version_info[:2])
    client = SPARQLWrapper(
        endpoint_url,
        agent="WDQS-example Python/%s.%s" % major_minor,
    )
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


array = []
results = get_results(endpoint_url, query)

# Keys of each SPARQL binding to keep, in the column order used downstream:
# name, image, scientific name, family, location.
binding_keys = ("name", "image", "sciName", "familyLabel", "locationLabel")

# Flatten every binding into a tuple of the plain values of those keys.
array.extend(
    tuple(binding[key]["value"] for key in binding_keys)
    for binding in results["results"]["bindings"]
)

In [None]:
# Column names, in the same order as the tuples stored in `array`.
column_names = ["name", "image", "scientific_name", "family", "location"]

# Build the frame, then request the same "<U200" dtype for every column.
dataframe = pd.DataFrame(array, columns=column_names)
dataframe = dataframe.astype(dtype=dict.fromkeys(column_names, "<U200"))
dataframe

Téléchargement des images 

In [None]:
import requests
import shutil
import os
import json

def download_and_create_json(dataframe):
    """Download every image listed in ``dataframe`` and write a JSON file for each.

    For each row, the image at ``row["image"]`` is fetched with
    ``download_image``. When that call returns 200, the file
    ``metadata/<image file stem>.json`` is written with the row's ``name``,
    ``scientific_name``, ``family`` and ``location`` values.

    Rows whose image file name (last path component of the URL) is longer
    than 50 characters are skipped before any download is attempted. Each
    distinct URL is requested at most once per call; when several rows share
    an image they also share one JSON file, and the last such row wins.

    Args:
        dataframe: Object exposing ``iterrows()`` whose rows can be indexed
            with the keys ``image``, ``name``, ``scientific_name``,
            ``family`` and ``location`` (e.g. a pandas DataFrame).

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("metadata", exist_ok=True)

    # Result of download_image per URL, so duplicate rows (same image,
    # different location) do not trigger the same download again.
    status_by_url = {}

    for _, row in dataframe.iterrows():
        image_url = row["image"]
        image_filename = os.path.basename(image_url)

        # Reject over-long names up front instead of after the download.
        if len(image_filename) > 50:
            continue

        if image_url not in status_by_url:
            status_by_url[image_url] = download_image(image_url)
        if status_by_url[image_url] != 200:
            continue

        json_filename = os.path.splitext(image_filename)[0] + ".json"
        json_path = os.path.join("metadata", json_filename)
        json_data = {
            "name": row["name"],
            "scientific_name": row["scientific_name"],
            "family": row["family"],
            "location": row["location"],
        }
        with open(json_path, "w", encoding="utf-8") as json_file:
            json.dump(json_data, json_file, indent=4)

    
    


def download_image(url, timeout=30):
    """Download ``url`` into the local ``images`` directory.

    The target path is ``images/<last path component of url>``. URLs whose
    target path (directory prefix included) is longer than 50 characters are
    skipped without issuing any network request.

    Args:
        url: Address of the image to fetch.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` so a
            stalled server cannot block the caller forever.

    Returns:
        The HTTP status code of the response, or ``None`` when the URL was
        skipped because its target path is too long. The file is only written
        when the status code is 200.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("images", exist_ok=True)

    filename = os.path.join("images", os.path.basename(url))
    # Decide before the request: no point contacting the server for a file
    # that would be discarded anyway.
    if len(filename) > 50:
        return None

    headers = {"User-Agent": "Mozilla/5.0"}
    # The context manager closes the streamed response on every path, which
    # releases the underlying connection.
    with requests.get(
        url,
        allow_redirects=True,
        headers=headers,
        stream=True,
        timeout=timeout,
    ) as response:
        if response.status_code == 200:
            print(filename)
            with open(filename, "wb") as image:
                # Let urllib3 undo gzip/deflate transfer encoding while copying.
                response.raw.decode_content = True
                shutil.copyfileobj(response.raw, image)
        return response.status_code

In [None]:
# Download the images referenced by `dataframe` and write the matching JSON
# metadata files (performs one HTTP request per image, so this cell is slow).
download_and_create_json(dataframe)