https://drive.google.com/file/d/1Kf_ZraDClLgH7VjwRBKQbo8tFEXldu_u/view?usp=drive_link

In [3]:
import polars as pl
import pandas as pd
import time
import psutil
import os
import pyarrow

In [5]:
# Chargement d'un fichier parquet
parquet_file = r"yellow_tripdata_2022-01.parquet"

# Fonction pour mesurer l'utilisation de la mémoire
def memory_usage():
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 ** 2)  # Conversion en Mo

# Comparaison du temps de chargement et de la mémoire utilisée

# Mesurer les performances avec Pandas
start_time = time.time()
mem_before = memory_usage()
df_pandas = pd.read_parquet(parquet_file)
time_pandas = time.time() - start_time
mem_after = memory_usage()
mem_usage_pandas = mem_after - mem_before

# Mesurer les performances avec Polars
start_time = time.time()
mem_before = memory_usage()
df_polars = pl.read_parquet(parquet_file)
time_polars = time.time() - start_time
mem_after = memory_usage()
mem_usage_polars = mem_after - mem_before

# Afficher les résultats
print(f"Pandas - Temps de chargement : {time_pandas:.4f} secondes")
print(f"Pandas - Utilisation de la mémoire : {mem_usage_pandas:.2f} Mo\n")

print(f"Polars - Temps de chargement : {time_polars:.4f} secondes")
print(f"Polars - Utilisation de la mémoire : {mem_usage_polars:.2f} Mo\n")

# Comparaison des performances sur une opération simple, par exemple, la somme d'une colonne
start_time = time.time()
sum_pandas = df_pandas["total_amount"].sum()
time_sum_pandas = time.time() - start_time

start_time = time.time()
sum_polars = df_polars["total_amount"].sum()
time_sum_polars = time.time() - start_time

print(f"Pandas - Temps pour somme de 'total_amount' : {time_sum_pandas:.4f} secondes")
print(f"Polars - Temps pour somme de 'total_amount' : {time_sum_polars:.4f} secondes")

Pandas - Temps de chargement : 4.8827 secondes
Pandas - Utilisation de la mémoire : 729.88 Mo

Polars - Temps de chargement : 1.6175 secondes
Polars - Utilisation de la mémoire : 74.49 Mo

Pandas - Temps pour somme de 'total_amount' : 0.0123 secondes
Polars - Temps pour somme de 'total_amount' : 0.0015 secondes


In [6]:
# 1. Lecture de données depuis un fichier Parquet

df = pl.read_parquet(parquet_file)
df.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [7]:
df.schema

Schema([('VendorID', Int64),
        ('tpep_pickup_datetime', Datetime(time_unit='ns', time_zone=None)),
        ('tpep_dropoff_datetime', Datetime(time_unit='ns', time_zone=None)),
        ('passenger_count', Float64),
        ('trip_distance', Float64),
        ('RatecodeID', Float64),
        ('store_and_fwd_flag', String),
        ('PULocationID', Int64),
        ('DOLocationID', Int64),
        ('payment_type', Int64),
        ('fare_amount', Float64),
        ('extra', Float64),
        ('mta_tax', Float64),
        ('tip_amount', Float64),
        ('tolls_amount', Float64),
        ('improvement_surcharge', Float64),
        ('total_amount', Float64),
        ('congestion_surcharge', Float64),
        ('airport_fee', Float64)])

In [8]:
# 2. Sélection de colonnes spécifiques

df_selected = df.select(["VendorID", "trip_distance", "total_amount"])
df_selected.head()

VendorID,trip_distance,total_amount
i64,f64,f64
1,3.8,21.95
1,2.1,13.3
2,0.97,10.56
2,1.09,11.8
2,4.3,30.3


In [9]:
df.select(
    pl.col(['passenger_count','trip_distance']),
    pl.col('passenger_count').alias('Passenger Count'),
    pl.col(pl.Int64)
).head(5)

passenger_count,trip_distance,Passenger Count,VendorID,PULocationID,DOLocationID,payment_type
f64,f64,f64,i64,i64,i64,i64
2.0,3.8,2.0,1,142,236,1
1.0,2.1,1.0,1,236,42,1
1.0,0.97,1.0,2,166,166,1
1.0,1.09,1.0,2,114,68,2
1.0,4.3,1.0,2,68,163,1


In [10]:
# 3. Filtrage des lignes (équivalent de WHERE en SQL)

df_filtered = df.filter(pl.col("total_amount") > 50)
df_filtered.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-01 00:40:15,2022-01-01 01:09:48,1.0,10.3,1.0,"""N""",138,161,1,33.0,3.0,0.5,13.0,6.55,0.3,56.35,2.5,0.0
2,2022-01-01 18:04:06,2022-01-01 18:40:52,1.0,9.7,1.0,"""N""",138,48,1,34.5,0.5,0.5,9.22,6.55,0.3,55.32,2.5,1.25
1,2022-01-01 00:10:27,2022-01-01 00:41:59,1.0,17.1,2.0,"""N""",132,170,1,52.0,3.75,0.5,15.75,6.55,0.3,78.85,2.5,1.25
2,2022-01-01 00:31:06,2022-01-01 00:58:26,3.0,19.14,2.0,"""N""",132,263,1,52.0,0.0,0.5,12.37,6.55,0.3,75.47,2.5,1.25
2,2022-01-01 00:56:26,2022-01-01 01:25:09,1.0,18.81,2.0,"""N""",132,148,1,52.0,0.0,0.5,11.31,0.0,0.3,67.86,2.5,1.25


In [11]:
df.filter(
    pl.col('trip_distance') <= 100,
    pl.col('total_amount').is_not_null()
).select(
    pl.col('trip_distance','total_amount')
).sort('trip_distance',descending=True).head(5)

trip_distance,total_amount
f64,f64
99.46,206.85
99.2,20.75
98.9,16.8
98.4,18.35
98.15,303.1


In [12]:
# 4. Ajout de nouvelles colonnes

df_with_new_col = df.with_columns(
    (pl.col("tip_amount") / pl.col("total_amount") * 100).alias("tip_percentage")
    )
df_with_new_col.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,tip_percentage
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,16.628702
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,30.075188
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,16.666667
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,0.0
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,9.90099


In [13]:
# 5. Groupement et agrégation

df_grouped = df.group_by("VendorID").agg(pl.col("trip_distance").sum().alias("total_distance"))
df_grouped.head()


VendorID,total_distance
i64,f64
6,45929.08
5,536.8
1,2217400.0
2,10974000.0


In [14]:
# 6. Tri des données

df_sorted = df.sort("total_amount", descending=True)
df_sorted.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-07 11:36:43,2022-01-07 11:47:12,1.0,3.3,1.0,"""N""",107,140,4,401092.32,2.5,0.5,0.0,0.0,0.3,401095.62,2.5,0.0
2,2022-01-29 02:23:46,2022-01-29 02:32:06,1.0,1.32,1.0,"""N""",79,249,1,7.0,0.5,0.5,888.88,0.0,0.3,899.68,2.5,0.0
2,2022-01-20 17:53:58,2022-01-20 17:54:04,1.0,0.0,5.0,"""N""",163,230,1,720.0,0.0,0.0,0.0,0.0,0.3,722.8,2.5,0.0
2,2022-01-02 23:23:27,2022-01-03 03:54:44,5.0,254.88,3.0,"""N""",154,264,2,668.0,0.5,0.0,0.0,18.3,0.3,688.35,0.0,1.25
1,2022-01-23 12:07:11,2022-01-23 16:14:46,1.0,257.7,5.0,"""N""",132,265,1,650.0,1.25,0.0,0.0,31.55,0.3,683.1,0.0,1.25


In [16]:
# 7. Écriture de données dans un fichier Parquet

output_file = r"Nouveau.parquet"  
df_sorted.write_parquet(output_file)
print(f"Données triées écrites dans : {output_file}")

Données triées écrites dans : Nouveau.parquet


In [17]:
# 8. Utilisation du LazyFrame

lazy_df = df.lazy()
result = (
    lazy_df
    .filter(pl.col("total_amount") > 50)
    .group_by("VendorID")
    .agg(pl.col("trip_distance").sum().alias("total_distance"))
    .sort("total_distance", descending=True)
)
print(result)
final_df = result.collect()  # Évaluation de la requête lazyframe
final_df


naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SORT BY [col("total_distance")]
  AGGREGATE
  	[col("trip_distance").sum().alias("total_distance")] BY [col("VendorID")] FROM
    FILTER [(col("total_amount")) > (50.0)] FROM
      DF ["VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count"]; PROJECT */19 COLUMNS; SELECTION: None


VendorID,total_distance
i64,f64
2,1681500.0
1,570949.8
6,19700.48
5,492.03


In [18]:
# 9. Utilisation avancée de LazyFrame et optimisation des requêtes

lazy_df = df.lazy()
result_lazy = (
    lazy_df
    .filter(pl.col("total_amount") > 50)
    .group_by("VendorID")
    .agg([
        pl.col("trip_distance").sum().alias("total_distance"),
        pl.col("tip_amount").mean().alias("average_tip"),
    ])
    .sort("total_distance", descending=True)
)

# Optimisations possibles via `collect()` ou `fetch()` pour un échantillon
print(result_lazy.fetch())
result_lazy.collect()


shape: (2, 3)
┌──────────┬────────────────┬─────────────┐
│ VendorID ┆ total_distance ┆ average_tip │
│ ---      ┆ ---            ┆ ---         │
│ i64      ┆ f64            ┆ f64         │
╞══════════╪════════════════╪═════════════╡
│ 2        ┆ 4990.39        ┆ 8.871524    │
│ 1        ┆ 1942.5         ┆ 9.18        │
└──────────┴────────────────┴─────────────┘


  print(result_lazy.fetch())


VendorID,total_distance,average_tip
i64,f64,f64
2,1681500.0,8.841613
1,570949.8,7.78816
6,19700.48,0.0
5,492.03,2.427586


In [19]:
# 10. Fusion de DataFrames

df_additional = pl.DataFrame({
    "VendorID": [1, 2],
    "vendor_name": ["Vendor A", "Vendor B"]
})
df_merged = df.join(df_additional, on="VendorID", how="left")
df_merged.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,vendor_name
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,"""Vendor A"""
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,"""Vendor A"""
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,"""Vendor B"""
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,"""Vendor B"""
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,"""Vendor B"""


In [20]:
# 11. Utilisation des fenêtres (window functions)

df_with_window = df.with_columns(
    pl.col("total_amount").cum_sum().over("VendorID").alias("cumulative_sum_total_amount")
)
df_with_window.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,cumulative_sum_total_amount
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,21.95
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,35.25
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,10.56
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,22.36
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,52.66


In [21]:
# 12. Jointure avec condition multiple (complex join)

print("Jointure avec condition multiple (ex. sur 'VendorID' et 'payment_type')")

df_additional = pl.DataFrame({
    "VendorID": [1, 2],
    "payment_type": [1, 2],
    "discount": [0.1, 0.15]
})
df_joined = df.join(df_additional, on=["VendorID", "payment_type"], how="left")
df_joined.head()

Jointure avec condition multiple (ex. sur 'VendorID' et 'payment_type')


VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,discount
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,0.1
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,0.1
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,0.15
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,


In [22]:
# 13. Pivot de données (pivot/reshape)

print("Pivot de données (ex. pivot sur 'VendorID' et somme de 'total_amount')")

df_pivot = df.pivot(
    values="total_amount",
    index="VendorID",
    on="payment_type",
    aggregate_function="sum"
)

df_pivot


Pivot de données (ex. pivot sur 'VendorID' et somme de 'total_amount')


VendorID,1,2,4,3,5,0
i64,f64,f64,f64,f64,f64,f64
1,11102000.0,2304500.0,452188.01,129282.16,11.8,303494.16
2,25891000.0,5798500.0,-123957.87,-38391.83,,1199500.0
6,,,,,,212038.93
5,,,,,,2159.92


In [23]:
# 14. Filtrage conditionnel avancé

df_filtered_advanced = df.filter(
    (pl.col("trip_distance") > 2.0) & (pl.col("total_amount") > 20)
)
df_filtered_advanced.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0
1,2022-01-01 00:40:15,2022-01-01 01:09:48,1.0,10.3,1.0,"""N""",138,161,1,33.0,3.0,0.5,13.0,6.55,0.3,56.35,2.5,0.0
2,2022-01-01 00:20:50,2022-01-01 00:34:58,1.0,5.07,1.0,"""N""",233,87,1,17.0,0.5,0.5,5.2,0.0,0.3,26.0,2.5,0.0
1,2022-01-01 00:33:52,2022-01-01 00:47:28,3.0,4.2,1.0,"""N""",148,141,1,14.0,2.5,0.5,3.45,0.0,0.3,20.75,2.5,0.0


In [24]:
df_complex = df.with_columns([
    (pl.col("tip_amount") / pl.col("total_amount") * 100).alias("tip_percentage"),
    pl.when(pl.col("trip_distance") > 10)
      .then(pl.lit("long_trip"))
      .otherwise(pl.lit("short_trip"))
      .alias("trip_category")
])

df_complex.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,tip_percentage,trip_category
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,16.628702,"""short_trip"""
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,30.075188,"""short_trip"""
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,16.666667,"""short_trip"""
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,0.0,"""short_trip"""
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,9.90099,"""short_trip"""


In [25]:
# 15. Calculs complexes avec des expressions multiples

df_complex = df.with_columns([
    (pl.col("tip_amount") / pl.col("total_amount") * 100).alias("tip_percentage"),
    pl.when(pl.col("trip_distance") > 10)
      .then(pl.lit("long_trip"))
      .otherwise(pl.lit("short_trip"))
      .alias("trip_category")
])

df_complex.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,tip_percentage,trip_category
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,16.628702,"""short_trip"""
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,30.075188,"""short_trip"""
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,16.666667,"""short_trip"""
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,0.0,"""short_trip"""
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,9.90099,"""short_trip"""


In [26]:
# 16. Manipulation de colonnes avec `melt` (dénormalisation)
print("Manipulation de colonnes avec `melt` (ex. denormalisation de 'VendorID' et 'total_amount')")
df_melted = df.melt(
    id_vars="VendorID",
    value_vars=["total_amount", "tip_amount"],
    variable_name="amount_type",
    value_name="amount"
)
df_melted.head()

Manipulation de colonnes avec `melt` (ex. denormalisation de 'VendorID' et 'total_amount')


  df_melted = df.melt(


VendorID,amount_type,amount
i64,str,f64
1,"""total_amount""",21.95
1,"""total_amount""",13.3
2,"""total_amount""",10.56
2,"""total_amount""",11.8
2,"""total_amount""",30.3


#### Exemple 

In [27]:
df.filter(
    pl.col('trip_distance') <= 100,
    pl.col('total_amount').is_not_null()
).select(
    pl.col('trip_distance','total_amount')
).sort('trip_distance',descending=True).head(5)

trip_distance,total_amount
f64,f64
99.46,206.85
99.2,20.75
98.9,16.8
98.4,18.35
98.15,303.1


In [28]:
df.filter(
    pl.col('trip_distance') <= 100,
    (pl.col('tpep_dropoff_datetime')-pl.col('tpep_pickup_datetime')).dt.total_minutes() <=45,
    pl.col('total_amount').is_not_null()
).select(
    pl.col('passenger_count').max().alias('Max passenger count'),
    pl.col('passenger_count').mean().alias('Avg passenger per trip'),
    pl.col('trip_distance').mean().alias('Avg Trip_distance'),
    (pl.col('tpep_dropoff_datetime')-pl.col('tpep_pickup_datetime')).mean().alias('Avg travel time'),
    (pl.col('tpep_dropoff_datetime')-pl.col('tpep_pickup_datetime')).median().alias('Median')
)

Max passenger count,Avg passenger per trip,Avg Trip_distance,Avg travel time,Median
f64,f64,f64,duration[ns],duration[ns]
9.0,1.388825,2.940509,12m 8s 373439203ns,10m 3s


In [29]:
df = df.with_columns(
    ((pl.col('tpep_dropoff_datetime')-pl.col('tpep_pickup_datetime')).dt.total_minutes()).alias("total_minutes")
)

df.head(5)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,total_minutes
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,17
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,8
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,8
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,10
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,37


In [30]:
df.filter(
    pl.col('trip_distance') <= 100,
    pl.col('total_minutes') <=45,
    pl.col('total_amount').is_not_null()
).select(
    pl.col('passenger_count').max().alias('Max passenger count'),
    pl.col('passenger_count').mean().alias('Avg passenger per trip'),
    pl.col('trip_distance').mean().alias('Avg Trip_distance'),
    pl.col('total_minutes').mean().alias('Avg travel time'),
    pl.col('total_minutes').median().alias('Median')
)

Max passenger count,Avg passenger per trip,Avg Trip_distance,Avg travel time,Median
f64,f64,f64,f64,f64
9.0,1.388825,2.940509,11.655341,10.0


In [31]:
df.select(
    pl.col('tpep_pickup_datetime').dt.strftime('%A').alias('day_name'),
    pl.col('total_minutes')
).group_by('day_name').agg(
    avg_total_time=pl.col('total_minutes').mean(),
    median_total_time= pl.col('total_minutes').median()
)

day_name,avg_total_time,median_total_time
str,f64,f64
"""Saturday""",13.529008,9.0
"""Tuesday""",13.452088,10.0
"""Monday""",13.532508,10.0
"""Friday""",14.339303,10.0
"""Wednesday""",13.51629,10.0
"""Thursday""",13.995267,10.0
"""Sunday""",13.700802,9.0
