Pour enregistrer les données sur son espace de stockage S3, exécuter dans un terminal :
- BUCKET_PERSONNEL="amanseur"
- mc cp data/raw/data.csv s3/${BUCKET_PERSONNEL}/ensae-reproductibilite/data/raw/data.csv

In [1]:
import s3fs

# Bucket name and key prefix of the folder whose content is listed below.
MY_BUCKET = "amanseur"
CHEMIN = "ensae-reproductibilite/data/raw"

# s3fs client configured with the MinIO endpoint URL used throughout this notebook.
minio_options = {"endpoint_url": "https://minio.lab.sspcloud.fr"}
fs = s3fs.S3FileSystem(client_kwargs=minio_options)

# Last expression of the cell: the listing is shown as the cell output.
fs.ls(f"s3://{MY_BUCKET}/{CHEMIN}")

['amanseur/ensae-reproductibilite/data/raw/data.csv']

In [2]:
# Object key of the CSV file, relative to the bucket root.
CHEMIN_FICHIER = "ensae-reproductibilite/data/raw/data.csv"
# Bucket that holds the file; the reading cells combine both into an s3:// URL.
MY_BUCKET = "amanseur"

In [3]:
import s3fs
import pandas as pd

# s3fs client configured with the MinIO endpoint URL.
minio_options = {"endpoint_url": "https://minio.lab.sspcloud.fr"}
fs = s3fs.S3FileSystem(client_kwargs=minio_options)

# Full s3:// URL built from the constants MY_BUCKET and CHEMIN_FICHIER,
# which must already be defined in the kernel when this cell runs.
csv_url = f"s3://{MY_BUCKET}/{CHEMIN_FICHIER}"

# Open the remote object as a file-like handle and let pandas parse it as CSV.
with fs.open(csv_url) as remote_file:
    df = pd.read_csv(remote_file)

# Last expression of the cell: display the resulting DataFrame.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
import s3fs
from pyarrow import csv

# s3fs client configured with the MinIO endpoint URL.
minio_options = {"endpoint_url": "https://minio.lab.sspcloud.fr"}
fs = s3fs.S3FileSystem(client_kwargs=minio_options)

# Full s3:// URL built from the constants MY_BUCKET and CHEMIN_FICHIER,
# which must already be defined in the kernel when this cell runs.
csv_url = f"s3://{MY_BUCKET}/{CHEMIN_FICHIER}"

# Same file as the pandas cell, parsed here with pyarrow's CSV reader.
with fs.open(csv_url) as remote_file:
    df = csv.read_csv(remote_file)

# Last expression of the cell: display what the reader returned.
df

pyarrow.Table
PassengerId: int64
Survived: int64
Pclass: int64
Name: string
Sex: string
Age: double
SibSp: int64
Parch: int64
Ticket: string
Fare: double
Cabin: string
Embarked: string
----
PassengerId: [[1,2,3,4,5,...,887,888,889,890,891]]
Survived: [[0,1,1,1,0,...,0,1,0,1,0]]
Pclass: [[3,1,3,1,3,...,2,1,3,1,3]]
Name: [["Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Thayer)","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry",...,"Montvila, Rev. Juozas","Graham, Miss. Margaret Edith","Johnston, Miss. Catherine Helen "Carrie"","Behr, Mr. Karl Howell","Dooley, Mr. Patrick"]]
Sex: [["male","female","female","female","male",...,"male","female","female","male","male"]]
Age: [[22,38,26,35,35,...,27,19,null,26,32]]
SibSp: [[1,1,0,1,0,...,0,0,1,0,0]]
Parch: [[0,0,0,0,0,...,0,0,2,0,0]]
Ticket: [["A/5 21171","PC 17599","STON/O2. 3101282","113803","373450",...,"211536","112053","W./C. 6607","111369","370376"]]
Fare: [[7.25,71

In [5]:
import os
import duckdb

# In-memory DuckDB database: nothing is written to disk.
con = duckdb.connect(database=":memory:")

# S3 secret statement built from the AWS_* environment variables.
# A missing variable raises KeyError here, before anything is sent to DuckDB.
create_secret_sql = f"""
CREATE SECRET secret (
    TYPE S3,
    KEY_ID '{os.environ["AWS_ACCESS_KEY_ID"]}',
    SECRET '{os.environ["AWS_SECRET_ACCESS_KEY"]}',
    ENDPOINT 'minio.lab.sspcloud.fr',
    SESSION_TOKEN '{os.environ["AWS_SESSION_TOKEN"]}',
    REGION 'us-east-1',
    URL_STYLE 'path',
    SCOPE 's3://{MY_BUCKET}/'
);
"""
con.execute(create_secret_sql)

# Query reading the CSV object designated by MY_BUCKET / CHEMIN_FICHIER.
query_definition = f"SELECT * FROM read_csv('s3://{MY_BUCKET}/{CHEMIN_FICHIER}')"
df = con.sql(query_definition)

# Last expression of the cell: display the query result.
df

┌─────────────┬──────────┬────────┬─────────────────────────────────────────────────────┬─────────┬────────┬───────┬───────┬──────────────────┬─────────┬─────────┬──────────┐
│ PassengerId │ Survived │ Pclass │                        Name                         │   Sex   │  Age   │ SibSp │ Parch │      Ticket      │  Fare   │  Cabin  │ Embarked │
│    int64    │  int64   │ int64  │                       varchar                       │ varchar │ double │ int64 │ int64 │     varchar      │ double  │ varchar │ varchar  │
├─────────────┼──────────┼────────┼─────────────────────────────────────────────────────┼─────────┼────────┼───────┼───────┼──────────────────┼─────────┼─────────┼──────────┤
│           1 │        0 │      3 │ Braund, Mr. Owen Harris                             │ male    │   22.0 │     1 │     0 │ A/5 21171        │    7.25 │ NULL    │ S        │
│           2 │        1 │      1 │ Cumings, Mrs. John Bradley (Florence Briggs Thayer) │ female  │   38.0 │     1 │     0 │ 

Exemple avec des données Parquet :
Exécuter ceci dans le terminal : BUCKET_PERSONNEL="nom_utilisateur_sspcloud"

curl -o rp.parquet "https://minio.lab.sspcloud.fr/projet-formation/bonnes-pratiques/data/REGION=11/part-0.parquet"

mc cp rp.parquet s3/${BUCKET_PERSONNEL}/ensae-reproductibilite/data/example/rp.parquet

rm rp.parquet

In [7]:
# Object key of the example Parquet file, relative to the bucket root.
CHEMIN_FICHIER = "ensae-reproductibilite/data/example/rp.parquet"
# Bucket that holds the file; the reading cells combine both into an s3:// URL.
MY_BUCKET = "amanseur"

In [8]:
import s3fs
import pandas as pd

# s3fs client configured with the MinIO endpoint URL.
minio_options = {"endpoint_url": "https://minio.lab.sspcloud.fr"}
fs = s3fs.S3FileSystem(client_kwargs=minio_options)

# Full s3:// URL built from the constants MY_BUCKET and CHEMIN_FICHIER,
# which must already be defined in the kernel when this cell runs.
parquet_url = f"s3://{MY_BUCKET}/{CHEMIN_FICHIER}"

# pandas reads the Parquet file directly through the s3fs filesystem object.
df = pd.read_parquet(parquet_url, filesystem=fs)

# Last expression of the cell: display the resulting DataFrame.
df

Unnamed: 0,CANTVILLE,NUMMI,ACHLR,AEMMR,AGED,AGER20,AGEREV,AGEREVQ,ANAI,ANEMR,...,TP,TRANS,TRIRIS,TYPC,TYPFC,TYPL,TYPMC,TYPMR,VOIT,WC
0,7701,7508,4,7,84,80,83,80,1932,05,...,Z,Z,770191,2,Z,1,4,41,2,Z
1,7701,7509,4,9,45,54,44,40,1977,02,...,Z,Z,770381,3,2,2,4,42,1,Z
2,7701,7509,4,9,6,5,5,5,2016,02,...,Z,Z,770381,3,2,2,4,42,1,Z
3,7701,7509,4,9,19,19,18,15,2003,02,...,Z,Z,770381,3,2,2,4,42,1,Z
4,7701,7509,4,9,53,54,52,50,1969,02,...,1,5,770381,3,2,2,4,42,1,Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4370408,9401,517,4,9,32,39,32,30,1987,01,...,1,6,940041,3,2,2,4,41,1,Z
4370409,9401,517,4,9,26,29,25,25,1993,01,...,1,6,940041,3,2,2,4,41,1,Z
4370410,9401,518,2,9,50,54,49,45,1972,03,...,1,6,940041,1,2,1,4,41,1,Z
4370411,9401,518,2,9,48,54,47,45,1974,03,...,2,5,940041,1,2,1,4,41,1,Z


In [9]:
import os
import duckdb

# In-memory DuckDB database: nothing is written to disk.
con = duckdb.connect(database=":memory:")

# S3 secret statement built from the AWS_* environment variables.
# A missing variable raises KeyError here, before anything is sent to DuckDB.
create_secret_sql = f"""
CREATE SECRET secret (
    TYPE S3,
    KEY_ID '{os.environ["AWS_ACCESS_KEY_ID"]}',
    SECRET '{os.environ["AWS_SECRET_ACCESS_KEY"]}',
    ENDPOINT 'minio.lab.sspcloud.fr',
    SESSION_TOKEN '{os.environ["AWS_SESSION_TOKEN"]}',
    REGION 'us-east-1',
    URL_STYLE 'path',
    SCOPE 's3://{MY_BUCKET}/'
);
"""
con.execute(create_secret_sql)

# Query reading the Parquet object designated by MY_BUCKET / CHEMIN_FICHIER.
query_definition = f"SELECT * FROM read_parquet('s3://{MY_BUCKET}/{CHEMIN_FICHIER}')"
df = con.sql(query_definition)

# Last expression of the cell: display the query result.
df

┌───────────┬─────────┬─────────┬─────────┬───────┬─────────┬────────┬─────────┬───────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬───────────────────┬─────────┬───────────┬─────────┬─────────┬─────────┬──────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ CANTVILLE │  NUMMI  │  ACHLR  │  AEMMR  │ AGED  │ AGER20  │ AGEREV │ AGEREVQ │ ANAI  │  ANEMR  │  APAF   │   ARM  