In [1]:
# DECLARATIONS #
################

import pandas as _pd
import dask.bag as _DaskBag
import dask.dataframe as _DaskDataFrame

import json

import folium
import geopandas as _gpd

from shapely.geometry import Polygon
from shapely.geometry import Point
from math import radians, cos, sin, asin, sqrt

import math
import numpy as _np

from datetime import datetime
import time

In [2]:
# READING THE JSON FILE WITH ZONES #
####################################

# 2024-01-10 - Cell execution time : 1.933 seconds

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

def is_valid_polygon(coords):
    try:
        polygon = Polygon(coords)
        return polygon.is_valid
    except:
        return False

# File read.
dask_zones_bag = _DaskBag.read_text('zones.json').map(json.loads)

# Polygons conversions
zones_polygons = dask_zones_bag.map(lambda x: x['zones']).flatten()
zones_polygons = zones_polygons.map(lambda zone: [(point['lng'],point['lat']) for point in zone['polygon']])
zones_polygons = zones_polygons.map(lambda polygon: {"polygon": polygon})
zones_dask_df = zones_polygons.to_dataframe()

# Validation
zones_dask_df['is_valid'] = zones_dask_df['polygon'].apply(is_valid_polygon, meta=('is_valid', 'bool'))

count_valid = zones_dask_df[zones_dask_df['is_valid']].shape[0].compute()
count_invalid = zones_dask_df[~zones_dask_df['is_valid']].shape[0].compute()

print(f"Valid polygon count : {count_valid}")
print(f"Invalid polygon count : {count_invalid}")

# Keep only valid polygon
zones_dask_df = zones_dask_df[zones_dask_df['is_valid']].compute()

# New fields
zones_dask_df['zone_id'] = zones_dask_df.index + 1
zones_dask_df['zone_id'] = zones_dask_df['zone_id'].astype(int)

print(zones_dask_df.head())

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")  

Current Time : 2024-01-13 12:25:45
Valid polygon count : 51
Invalid polygon count : 0
                                             polygon  is_valid  zone_id
0  [(-77.28648789023627, -11.784234676000215), (-...      True        1
1  [(-76.82596521434296, -12.479157220410556), (-...      True        2
2  [(-76.64127121444726, -12.201069396462607), (-...      True        3
3  [(-76.7336521407365, -12.205585634694005), (-7...      True        4
4  [(-76.7336521407365, -12.116068939612953), (-7...      True        5
Current Time : 2024-01-13 12:25:47

Cell execution time : 1.924 seconds


In [3]:
# LOAD THE GPS DRIVERS CSV FILE #
#################################

# 2024-01-10 - Cell execution time : 1.427 seconds

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

drivers_gps_dask_df = _DaskDataFrame.read_csv('drivers.csv')
print(drivers_gps_dask_df.head())

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")

Current Time : 2024-01-13 12:25:47
     driver                 timestamp   latitude  longitude
0  c473205b  2017-08-31T12:24:25.000Z -12.106778 -76.998078
1  a0f3b4e1  2017-08-31T12:24:26.000Z -12.103913 -76.963727
2  1236f9fe  2017-08-31T12:24:26.000Z -12.133777 -77.004266
3  ae4a06a2  2017-08-31T12:24:26.000Z -12.085963 -76.987582
4  ab7a6c63  2017-08-31T12:24:26.000Z -12.072973 -77.061448
Current Time : 2024-01-13 12:25:48

Cell execution time : 1.087 seconds


In [4]:
# HOW BIG IS THE DRIVERS GPS FILE ??? #
#######################################

# 2024-01-10 - Cell execution time : 4.676 seconds

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

total_rows = drivers_gps_dask_df.shape[0].compute()
unique_drivers = drivers_gps_dask_df['driver'].nunique().compute()
print(f"Total Rows : {total_rows}")
print(f"Unique Drivers : {unique_drivers}")

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")

Current Time : 2024-01-13 12:25:48
Total Rows : 4079631
Unique Drivers : 14652
Current Time : 2024-01-13 12:25:53

Cell execution time : 4.507 seconds


In [5]:
# DATA TYPE, CLEANING & SORTING #
#################################

# 2024-01-04 - Cell execution time : 32.924 seconds

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

drivers_gps_dask_df['timestamp'] = _DaskDataFrame.to_datetime(drivers_gps_dask_df['timestamp'])
drivers_gps_dask_df['latitude'] = drivers_gps_dask_df['latitude'].astype(float)
drivers_gps_dask_df['longitude'] = drivers_gps_dask_df['longitude'].astype(float)
drivers_gps_dask_df = drivers_gps_dask_df.dropna()

drivers_gps_dask_df['zone_id'] = 0
drivers_gps_dask_df['zone_id'] = drivers_gps_dask_df['zone_id'].astype(int)

drivers_gps_dask_df = drivers_gps_dask_df.sort_values(by=['driver', 'timestamp'])
print(drivers_gps_dask_df.head())

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")

Current Time : 2024-01-13 12:25:53
          driver                 timestamp   latitude  longitude  zone_id
607830  000aa4d2 2017-08-31 14:50:31+00:00 -11.958991 -76.987914        0
423918  000aa4d2 2017-08-31 14:50:49+00:00 -11.958511 -76.987508        0
838040  000aa4d2 2017-08-31 14:51:01+00:00 -11.958629 -76.987368        0
605397  000aa4d2 2017-08-31 14:51:13+00:00 -11.958646 -76.987370        0
837365  000aa4d2 2017-08-31 14:51:37+00:00 -11.958791 -76.987198        0
Current Time : 2024-01-13 12:26:25

Cell execution time : 32.019 seconds


In [6]:
# NEW DATAFRAME WITH UNIQUE DRIVERS #
#####################################

# 2024-01-04 - Cell execution time : 74.851 seconds

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

grouped = drivers_gps_dask_df.groupby('driver')

# Calculer la première et la dernière date pour chaque conducteur
drivers_dask_df = grouped.agg({'timestamp': ['min', 'max']})

# Aplatir les colonnes (après l'agrégation, les colonnes seront multi-indexées)
drivers_dask_df.columns = ['first_date', 'last_date']

# Réinitialiser l'index pour rendre 'driver' à nouveau une colonne
drivers_dask_df = drivers_dask_df.reset_index()

drivers_dask_df = drivers_dask_df.sort_values(by=['driver'])

# Saving
drivers_dask_df.to_parquet('drivers_dask_df.parquet')
drivers_pandas_df = drivers_dask_df.compute()
drivers_pandas_df.to_parquet('drivers_pandas_df.parquet')

#print(drivers_dask_df.head())
#print(drivers_dask_df.describe().compute())

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")

Current Time : 2024-01-13 12:26:25
Current Time : 2024-01-13 12:27:11

Cell execution time : 46.361 seconds


In [7]:
# CREATING A GEO DATA FRAME WITH GEO PANDA #
############################################

# 2024-01-10 - Cell execution time : 0.026 seconds

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

zones_dask_df['geometry'] = zones_dask_df['polygon'].apply(lambda x: Polygon(x))

zones_geodataframe = _gpd.GeoDataFrame(zones_dask_df, geometry='geometry')

zones_geodataframe.set_crs(epsg=4326, inplace=True)

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")  

Current Time : 2024-01-13 12:27:11
Current Time : 2024-01-13 12:27:11

Cell execution time : 0.011 seconds


In [8]:
# CREATING A FIRST MAP WITH A RANDOM DRIVER PATH #
##################################################

# 2024-01-10 - Cell execution time : 25.993 seconds

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Compute centroid to center the map
zones_geodataframe['centroid'] = zones_geodataframe['geometry'].centroid
mean_latitude = zones_geodataframe['centroid'].y.mean()
mean_longitude = zones_geodataframe['centroid'].x.mean()

print(f"Mean Latitude : {mean_latitude}, Mean Longitude : {mean_longitude}")

# Select a specific driver
specific_driver_df = drivers_gps_dask_df[(drivers_gps_dask_df['driver'] == 'fff76584')].compute()
specific_driver_df = specific_driver_df.sort_values(by='timestamp')
start_location = specific_driver_df.iloc[0][['latitude', 'longitude']]

# Create map object
oMap = folium.Map(location=[mean_latitude, mean_longitude], zoom_start=12)

# Draw zones
for _, row in zones_geodataframe.iterrows():
    simpl_geo = row['geometry'].simplify(tolerance=0.001, preserve_topology=True)
    folium.GeoJson(simpl_geo).add_to(oMap)

for _, row in zones_geodataframe.iterrows():
    simpl_geo = row['geometry'].simplify(tolerance=0.001, preserve_topology=True)
    folium.GeoJson(simpl_geo).add_to(oMap)

    # Calculer le centroïde de la zone
    centroid = row['geometry'].centroid
    # Créer un marqueur avec DivIcon pour afficher le zone_id
    folium.Marker(
        [centroid.y, centroid.x],
        icon=folium.DivIcon(
            icon_size=(150,36),
            icon_anchor=(7,20),
            html=f'<div style="font-size: 12pt; color : black">{row["zone_id"]}</div>'
        )
    ).add_to(oMap)
    
# Draw GPS Dots
for _, row in specific_driver_df.iterrows():
    folium.CircleMarker(location=[row['latitude'], row['longitude']], radius=5, color='blue').add_to(oMap)

# Add lines
points = specific_driver_df[['latitude', 'longitude']].values
folium.PolyLine(points, color='blue').add_to(oMap)

# Draw result
oMap

Current Time : 2024-01-13 12:27:11



  zones_geodataframe['centroid'] = zones_geodataframe['geometry'].centroid


Mean Latitude : -11.994491625753374, Mean Longitude : -76.92079690854929


In [9]:
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds") 

Current Time : 2024-01-13 12:27:36

Cell execution time : 24.158 seconds


In [10]:
# COMPUTING DISTANCE AND SPEED BETWEEN EACH GPS VALUE => PANDA METHOD #
#######################################################################

# 2024-01-04 - Cell execution time : 230.406 seconds

# Notes : Impossible to use Dask here because we need a sequential work to have the N and N-1 record.
#         Parallelism seems useless here. To be confirmed...

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

def haversine(lon1, lat1, lon2, lat2):
    # Convertir les coordonnées en radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # Formule de Haversine
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # Rayon de la Terre en kilomètres
    return c * r

def calculate_distance_and_speed_old(df):
    df['prev_latitude'] = df['latitude'].shift(1)
    df['prev_longitude'] = df['longitude'].shift(1)
    df['distance'] = df.apply(lambda x: haversine(x['prev_longitude'], x['prev_latitude'], x['longitude'], x['latitude']) if not _np.isnan(x['prev_longitude']) else 0, axis=1)
    df['time_diff'] = df['timestamp'].diff().dt.total_seconds() / 3600
    df['speed'] = df['distance'] / df['time_diff']
    df['speed'] = df['speed'].fillna(0) 
    return df


def calculate_distance_and_speed(df):
    df['prev_latitude'] = df['latitude'].shift(1)
    df['prev_longitude'] = df['longitude'].shift(1)
    # Find the first line of each drivers
    df['is_first_row'] = df['driver'] != df['driver'].shift(1)
    # Haversine
    df['distance'] = df.apply(lambda x: 0 if x['is_first_row'] else haversine(x['prev_longitude'], x['prev_latitude'], x['longitude'], x['latitude']), axis=1)
    # time_diff
    df['time_diff'] = df['timestamp'].diff().dt.total_seconds() / 3600
    # Speed
    df['speed'] = df.apply(lambda x: 0 if x['is_first_row'] or x['time_diff'] <= 0 else (x['distance'] / x['time_diff']), axis=1)
    # Cleaning
    df.drop(['prev_latitude', 'prev_longitude', 'is_first_row'], axis=1, inplace=True)

    return df

# Convert Dask DataFrame To Pandas
drivers_gps_pandas_df = drivers_gps_dask_df.compute()
drivers_gps_pandas_df = drivers_gps_pandas_df.sort_values(['driver', 'timestamp'])

# Compute....
drivers_gps_pandas_df = calculate_distance_and_speed(drivers_gps_pandas_df)

# Convert back Pandas DataFrame to Dask
drivers_gps_dask_df = _DaskDataFrame.from_pandas(drivers_gps_pandas_df, npartitions=4)

# Sorting again the Dask Dataframe
drivers_gps_dask_df = drivers_gps_dask_df.sort_values(by=['driver', 'timestamp'])

#print(drivers_gps_dask_df.head())
#print(drivers_gps_pandas_df.head())

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds") 


Current Time : 2024-01-13 12:27:36
Current Time : 2024-01-13 12:31:11

Cell execution time : 215.389 seconds


In [11]:
# COMPUTING MEAN SPEED AND TOTAL DISTANCE FOR EACH DRIVERS => DASK    #
#######################################################################

# 2024-01-12 - Cell execution time : 71.882 seconds

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Dask Version
mean_speed = drivers_gps_dask_df.groupby('driver')['speed'].mean().compute()
total_distance = drivers_gps_dask_df.groupby('driver')['distance'].sum().compute()

# Create result dataset
driver_stats = _pd.DataFrame({
    'driver': mean_speed.index,
    'mean_speed': mean_speed.values,
    'total_distance': total_distance.values
})

# Update
drivers_dask_df = drivers_dask_df.merge(driver_stats, on='driver', how='left')

drivers_dask_df['mean_speed'] = drivers_dask_df['mean_speed'].astype(float)
drivers_dask_df['total_distance'] = drivers_dask_df['total_distance'].astype(float)

drivers_pandas_df = drivers_dask_df.compute()
drivers_pandas_df.to_parquet('drivers_pandas_df.parquet')

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds") 

Current Time : 2024-01-13 12:31:11
Current Time : 2024-01-13 12:32:18

Cell execution time : 67.238 seconds


In [12]:
# DETERMINE ZONE ID FOR EACH GPS VALUE. PARALLEL METHOD WITH DASK & GEOPANDAS #
###############################################################################

# 2024-01-12 - Cell execution time : 875.037 seconds => +- 15 min

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Compute each partition
def process_partition(partition, zones_gdf):
    # Create geographicak points.
    partition['geometry'] = partition.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)
    
    # Convert to Geodataframe
    drivers_gdf = _gpd.GeoDataFrame(partition, geometry='geometry')
    
    # Check CRS
    drivers_gdf.set_crs("EPSG:4326", inplace=True)

    if drivers_gdf.crs != zones_gdf.crs:
        drivers_gdf = drivers_gdf.to_crs(zones_gdf.crs)
        
    # Spatial joint...
    joined = _gpd.sjoin(drivers_gdf, zones_gdf[['geometry', 'zone_id']], how='left', predicate='within')
    
    # Check if the zone_id is valid (inside a zone) or outside.
    if 'zone_id_right' in joined:
        partition['zone_id'] = joined['zone_id_right']
    else:
        partition['zone_id'] = -1          

    return partition.drop(columns=['geometry'])


results = drivers_gps_dask_df.map_partitions(process_partition, zones_geodataframe, align_dataframes=False, meta=drivers_gps_dask_df)

# Update Original dataframe...
drivers_gps_dask_df = results
drivers_gps_dask_df.compute()
drivers_gps_dask_df['zone_id'] = drivers_gps_dask_df['zone_id'].fillna(-1).astype(int)
drivers_gps_dask_df['zone_id'] = drivers_gps_dask_df['zone_id'].astype(int)
drivers_gps_dask_df = drivers_gps_dask_df.sort_values(by=['driver', 'timestamp'])

# Save Result...
drivers_gps_dask_df.to_parquet('drivers_gps_dask_df.parquet')

drivers_gps_pandas_df = drivers_gps_dask_df.compute()
drivers_gps_pandas_df.to_parquet('drivers_gps_pandas_df.parquet')

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")

Current Time : 2024-01-13 12:32:18
Current Time : 2024-01-13 12:49:05

Cell execution time : 1006.271 seconds


In [13]:
# COMPUTING MEAN TIME ON ZONE AND FAVORITE ZONE => DASK    #
############################################################

# 2024-01-12 - Cell execution time : 472.539 seconds

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Compute time_diff sum
time_sum = drivers_gps_dask_df.groupby(['driver', 'zone_id'])['time_diff'].sum()

# Find zone_id with max total time_diff for each driver
favorite_zone_by_time = time_sum.groupby('driver').idxmax().compute()
favorite_zone_by_time = favorite_zone_by_time.apply(lambda x: x[1] if _pd.notna(x) else None)

# Count lignes for each drivers and zone_id
count_zones = drivers_gps_dask_df.groupby(['driver', 'zone_id']).size()

# Find zone_id with max number for each driver
favorite_zone_by_values = count_zones.groupby('driver').idxmax().compute()
favorite_zone_by_values = favorite_zone_by_values.apply(lambda x: x[1] if _pd.notna(x) else None)

# Result
driver_stats = _pd.DataFrame({
    'driver': favorite_zone_by_time.index,
    'favorite_zone_by_time': favorite_zone_by_time.values,
    'favorite_zone_by_values': favorite_zone_by_values.values
})

# Update Dataframe
drivers_dask_df = drivers_dask_df.merge(driver_stats, on='driver', how='left')

drivers_dask_df['favorite_zone_by_time'] = drivers_dask_df['favorite_zone_by_time'].astype(int)
drivers_dask_df['favorite_zone_by_values'] = drivers_dask_df['favorite_zone_by_values'].astype(int)

drivers_pandas_df = drivers_dask_df.compute()
drivers_pandas_df.to_parquet('drivers_pandas_df.parquet')

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds") 

Current Time : 2024-01-13 12:49:05
Current Time : 2024-01-13 12:56:57

Cell execution time : 472.539 seconds


In [14]:




# Total GPS Count: 3949761
# Nombre de fois où zone_id est -1: 129870


start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Calcul de gps_count
gps_count = drivers_gps_pandas_df['zone_id'].value_counts()

elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds") 

# Calcul de unique_drivers
unique_drivers = drivers_gps_pandas_df.groupby('zone_id')['driver'].nunique()

elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds") 

# Convertir gps_count et unique_drivers en DataFrame
gps_count_df = gps_count.reset_index().rename(columns={'index': 'zone_id'})
gps_count_df = gps_count.reset_index().rename(columns={'count': 'gps_count'})
unique_drivers_df = unique_drivers.reset_index().rename(columns={'driver': 'unique_drivers'})

elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds") 

# Fusionner ces DataFrames avec zones_dask_df
zones_dask_df = zones_dask_df.merge(gps_count_df, on='zone_id', how='left')
zones_dask_df = zones_dask_df.merge(unique_drivers_df, on='zone_id', how='left')

elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds") 

# Remplir les valeurs NaN par 0 si nécessaire
zones_dask_df['gps_count'] = zones_dask_df['gps_count'].fillna(0)
zones_dask_df['unique_drivers'] = zones_dask_df['unique_drivers'].fillna(0)
zones_dask_df['gps_count'] = zones_dask_df['gps_count'].astype(int)
zones_dask_df['unique_drivers'] = zones_dask_df['unique_drivers'].astype(int)

# Refresh zones_geodataframe
zones_geodataframe = _gpd.GeoDataFrame(zones_dask_df, geometry='geometry')
zones_geodataframe.set_crs(epsg=4326, inplace=True)

total_gps_count = zones_dask_df['gps_count'].sum()
print("Total GPS Count:", total_gps_count)


# Filtrer les lignes où zone_id est égal à -1 et compter
count_zone_id_minus_one = drivers_gps_dask_df[drivers_gps_dask_df['zone_id'] == -1]['zone_id'].count().compute()

print("Nombre de fois où zone_id est -1:", count_zone_id_minus_one)


zones_pandas_df = zones_dask_df

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")


Current Time : 2024-01-13 12:56:57

Cell execution time : 0.08 seconds

Cell execution time : 1.655 seconds

Cell execution time : 1.655 seconds

Cell execution time : 1.671 seconds
Total GPS Count: 3949761
Nombre de fois où zone_id est -1: 129870


AttributeError: 'DataFrame' object has no attribute 'compute'

In [37]:
# COUNTING NUMBER OF GPS VALUES FOR EACH DRIVERS    #
#####################################################

# 2024-01-13 - Cell execution time : 292.241 seconds

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Compter le nombre de données GPS pour chaque conducteur
gps_counts = drivers_gps_dask_df.groupby('driver').size().compute()
gps_counts = gps_counts.reset_index().rename(columns={0: 'gps_count'})

# Fusionner les comptages GPS avec le DataFrame Dask original
drivers_dask_df = drivers_dask_df.merge(gps_counts, on='driver', how='left')

drivers_dask_df.compute()

drivers_pandas_df = drivers_dask_df.compute()

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")

Current Time : 2024-01-13 13:49:44
Current Time : 2024-01-13 13:54:36

Cell execution time : 292.241 seconds


In [20]:
# SAVE CURRENT SITUATION    #
#############################

# 2024-01-13 - 

start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Dasks Dataframes
drivers_dask_df.to_parquet('drivers_dask_df.parquet')
drivers_gps_dask_df.to_parquet('drivers_gps_dask_df.parquet')

# Pandas Dataframes
drivers_pandas_df.to_parquet('drivers_pandas_df.parquet')
drivers_gps_pandas_df.to_parquet('drivers_gps_pandas_df.parquet')

drivers_pandas_df.to_csv('drivers_pandas_df.csv', index=False)
drivers_gps_pandas_df.to_csv('drivers_gps_pandas_df.csv', index=False)
zones_pandas_df.to_csv('zones_pandas_df.csv', index=False)

print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")

Current Time : 2024-01-13 13:34:00


ArrowInvalid: ('Could not convert <POLYGON ((-77.286 -11.784, -77.286 -11.874, -77.195 -11.869, -77.195 -11.78...> with type Polygon: did not recognize Python value type when inferring an Arrow data type', 'Conversion failed for column geometry with type object')

In [38]:
drivers_pandas_df

Unnamed: 0,driver,first_date,last_date,mean_speed,total_distance,favorite_zone_by_time,favorite_zone_by_values,gps_count
0,000aa4d2,2017-08-31 14:50:31+00:00,2017-08-31 16:59:53+00:00,15.602668,30.887439,22,22,286
1,001b6172,2017-08-31 15:32:06+00:00,2017-08-31 16:59:41+00:00,15.268061,20.722036,22,22,220
2,00219be2,2017-08-31 16:21:42+00:00,2017-08-31 16:59:54+00:00,19.230185,12.496524,14,14,53
3,0022cc1b,2017-08-31 16:23:46+00:00,2017-08-31 16:51:00+00:00,10.560539,4.380521,22,22,62
4,0022f5ae,2017-08-31 12:00:16+00:00,2017-08-31 16:59:39+00:00,45.675928,40.787504,22,22,395
...,...,...,...,...,...,...,...,...
14647,fff16f06,2017-08-31 12:41:19+00:00,2017-08-31 16:56:09+00:00,18.572603,76.852291,21,21,665
14648,fff1ff46,2017-08-31 15:47:19+00:00,2017-08-31 16:59:07+00:00,12.904825,11.232070,-1,-1,139
14649,fff76584,2017-08-31 13:00:21+00:00,2017-08-31 16:59:29+00:00,17.928649,55.618581,13,14,347
14650,fff7da8b,2017-08-31 13:17:42+00:00,2017-08-31 16:59:31+00:00,12.505129,29.201487,-1,-1,366


In [40]:
zones_dask_df

Unnamed: 0,polygon,is_valid,zone_id,geometry,gps_count,unique_drivers
0,"[(-77.28648789023627, -11.784234676000215), (-...",True,1,POLYGON ((-77.28648789023627 -11.7842346760002...,0,0
1,"[(-76.82596521434296, -12.479157220410556), (-...",True,2,POLYGON ((-76.82596521434296 -12.4791572204105...,22,3
2,"[(-76.64127121444726, -12.201069396462607), (-...",True,3,POLYGON ((-76.64127121444726 -12.2010693964626...,0,0
3,"[(-76.7336521407365, -12.205585634694005), (-7...",True,4,POLYGON ((-76.7336521407365 -12.20558563469400...,0,0
4,"[(-76.7336521407365, -12.116068939612953), (-7...",True,5,POLYGON ((-76.7336521407365 -12.11606893961295...,0,0
5,"[(-76.82596521434296, -12.120520083794023), (-...",True,6,POLYGON ((-76.82596521434296 -12.1205200837940...,0,0
6,"[(-76.82596521434296, -12.210067657646714), (-...",True,7,POLYGON ((-76.82596521434296 -12.2100676576467...,16,1
7,"[(-76.82596521434296, -12.299690140508726), (-...",True,8,POLYGON ((-76.82596521434296 -12.2996901405087...,749,12
8,"[(-76.7336521407365, -12.295177256777961), (-7...",True,9,POLYGON ((-76.7336521407365 -12.29517725677796...,0,0
9,"[(-76.7336521407365, -12.384843153818448), (-7...",True,10,POLYGON ((-76.7336521407365 -12.38484315381844...,0,0


In [None]:
start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

import folium
from folium.plugins import HeatMap

# Créer une carte centrée sur un point moyen
mean_lat, mean_lon = drivers_gps_pandas_df['latitude'].mean(), drivers_gps_pandas_df['longitude'].mean()
m = folium.Map(location=[mean_lat, mean_lon], zoom_start=12)

# Ajouter une carte de chaleur
HeatMap(data=drivers_gps_pandas_df[['latitude', 'longitude']], radius=10).add_to(m)

m


Current Time : 2024-01-13 14:03:04


In [None]:
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")