Récupération des données sur des êtres vivant 

Données que nous récupérons :
- Nom 
- Nom scientifique
- Image
- Famille 
- Lieu géographique

In [2]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

In [3]:
endpoint_url = "https://query.wikidata.org/sparql"

# Get taxons
query = """SELECT DISTINCT ?taxon ?name ?image ?sciName ?familyLabel ?locationLabel
WHERE {
  ?taxon wdt:P31 wd:Q16521;
          rdfs:label ?name;
          wdt:P18 ?image;
          wdt:P225 ?sciName;
          wdt:P171 ?family;
          wdt:P9714 ?location.
  FILTER(lang(?name) = "en")
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT 1000"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (
        sys.version_info[0],
        sys.version_info[1],
    )
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


array = []
results = get_results(endpoint_url, query)

for result in results["results"]["bindings"]:
    array.append(
        (
            result["name"]["value"],
            result["image"]["value"],
            result["sciName"]["value"],
            result["familyLabel"]["value"],
            result["locationLabel"]["value"],
        )
    )

In [4]:
dataframe = pd.DataFrame(array, columns=["name", "image", "scientific_name","family", "location"])

dataframe = dataframe.astype(
    dtype={"name" : "<U200", "image" : "<U200", "scientific_name" : "<U200","family" : "<U200", "location" : "<U200"}
)
dataframe

Unnamed: 0,name,image,scientific_name,family,location
0,Encephalartos concinnus,http://commons.wikimedia.org/wiki/Special:File...,Encephalartos concinnus,Encephalartos,Africa
1,Encephalartos cupidus,http://commons.wikimedia.org/wiki/Special:File...,Encephalartos cupidus,Encephalartos,Africa
2,Encephalartos cycadifolius,http://commons.wikimedia.org/wiki/Special:File...,Encephalartos cycadifolius,Encephalartos,Africa
3,Microcos paniculata,http://commons.wikimedia.org/wiki/Special:File...,Microcos paniculata,Microcos,Africa
4,Stoebe passerinoides,http://commons.wikimedia.org/wiki/Special:File...,Stoebe passerinoides,Stoebe,Africa
5,Encephalartos aemulans,http://commons.wikimedia.org/wiki/Special:File...,Encephalartos aemulans,Encephalartos,Africa
6,Encephalartos barteri,http://commons.wikimedia.org/wiki/Special:File...,Encephalartos barteri,Encephalartos,Africa
7,Encephalartos bubalinus,http://commons.wikimedia.org/wiki/Special:File...,Encephalartos bubalinus,Encephalartos,Africa
8,Encephalartos aplanatus,http://commons.wikimedia.org/wiki/Special:File...,Encephalartos aplanatus,Encephalartos,Africa
9,Encephalartos equatorialis,http://commons.wikimedia.org/wiki/Special:File...,Encephalartos equatorialis,Encephalartos,Africa


Téléchargement des images 

In [5]:
import requests
import shutil
import os
import json

def download_and_create_json(dataframe):
    if not os.path.exists("metadata"):
        os.mkdir("metadata")
    print(dataframe)
    for index, row in dataframe.iterrows():
        image_url = row["image"]
        response_code = download_image(image_url)
        if response_code == 200:
            image_filename = os.path.basename(image_url)
            if len(image_filename) > 50:
                continue
            json_filename = os.path.splitext(image_filename)[0] + ".json"
            json_path = os.path.join("metadata", json_filename)
            json_data = {
                "name": row["name"],
                "scientific_name": row["scientific_name"],
                "family": row["family"],
                "location": row["location"]
            }
            with open(json_path, "w") as json_file:
                json.dump(json_data, json_file, indent=4)

    
    


def download_image(url):
    if not os.path.exists("images"):
        os.mkdir("images")
    headers = {"User-Agent": "Mozilla/5.0"}
    request = requests.get(url, allow_redirects=True, headers=headers, stream=True)
    if request.status_code == 200:
        filename = os.path.join("images", os.path.basename(url))
        print(filename)
        
        if len(filename) > 50:
            return
        
        with open(filename, "wb") as image:
            request.raw.decode_content = True
            shutil.copyfileobj(request.raw, image)
    return request.status_code

In [6]:
download_and_create_json(dataframe)

                              name   
0          Encephalartos concinnus  \
1            Encephalartos cupidus   
2       Encephalartos cycadifolius   
3              Microcos paniculata   
4             Stoebe passerinoides   
5           Encephalartos aemulans   
6            Encephalartos barteri   
7          Encephalartos bubalinus   
8          Encephalartos aplanatus   
9       Encephalartos equatorialis   
10   Encephalartos eugene-maraisii   
11       Encephalartos dolomiticus   
12           Encephalartos kisambo   
13           Encephalartos lanatus   
14          Encephalartos inopinus   
15           Encephalartos cerinus   
16            Encephalartos caffer   
17  Encephalartos chimanimaniensis   
18           Encephalartos humilis   
19       Encephalartos ghellinckii   
20          Encephalartos hirsutus   
21       Encephalartos laevifolius   
22         Encephalartos latifrons   
23         Encephalartos msinganus   
24        Encephalartos manikensis   
25      Ence

KeyboardInterrupt: 