In [1]:
import dask.dataframe as dd
from dask.distributed import Client
import pandas as pd


In [2]:
# Initialiser Dask
client = Client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33727 instead


2024-08-29 19:18:51,112 - distributed.core - ERROR - Exception while handling op kill
Traceback (most recent call last):
  File "/usr/lib/python3.9/asyncio/tasks.py", line 452, in wait_for
    fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/distributed/core.py", line 829, in _handle_comm
    result = await result
  File "/usr/local/lib/python3.9/dist-packages/distributed/nanny.py", line 400, in kill
    await self.process.kill(reason=reason, timeout=timeout)
  File "/usr/local/lib/python3.9/dist-packages/distributed/nanny.py", line 875, in kill
    await process.join(max(0, deadline - time()))
  File "/usr/local/lib/python3.9/dist-packages/distributed/process.py", line 330, in join
    await wait_for(asyncio.shield(self._exit_future), timeout)
  File "/usr/local/lib/python3.9/dist-packages/distributed/utils.py", line 1957, in wait_for
 

In [24]:
# Charger les données avec Dask
output_path = "./data/filtered_df_output.parquet"
df = dd.read_parquet(output_path)


In [25]:
# Convertir les heures en périodes de la journée
def time_of_day(hour):
    if 5 <= hour <= 11:
        return 'matin'
    elif 12 <= hour <= 16:
        return 'après-midi'
    elif 17 <= hour <= 23:
        return 'soir'
    else:
        return 'nuit'

In [26]:
# Ajouter une colonne pour extraire l'heure de l'événement
df['event_hour'] = df['event_time'].dt.hour

In [27]:

# Appliquer la fonction sur toute la colonne
df['most_active_time'] = df['event_hour'].apply(time_of_day, meta=('event_hour', 'object'))

In [28]:
# Fonction pour calculer les statistiques pour une période donnée
def compute_user_stats(df, period_label, period_offset):
    period_df = df[df['event_time'] >= df['event_time'].max() - pd.DateOffset(months=period_offset)]

    # Calculer les vues, les paniers et les achats
    number_of_views = period_df[period_df['event_type'] == 'view'].groupby('user_id').size().rename(f'number_of_views_{period_label}')
    number_of_carts = period_df[period_df['event_type'] == 'cart'].groupby('user_id').size().rename(f'number_of_carts_{period_label}')
    count_products = period_df[period_df['event_type'] == 'purchase'].groupby('user_id').size().rename(f'count_products_{period_label}')

    # Calculer le prix moyen des achats
    avg_price = period_df[period_df['event_type'] == 'purchase'].groupby('user_id')['price'].mean().rename(f'avg_price_{period_label}')

    # Calculer le nombre de sessions
    number_of_sessions = period_df.groupby('user_id')['user_session'].nunique().rename(f'number_of_sessions_{period_label}')

    # Fusionner toutes les statistiques
    stats = dd.concat([number_of_views, number_of_carts, count_products, avg_price, number_of_sessions], axis=1)

    return stats

In [29]:
# Calculer les statistiques pour les périodes de 2, 5, et 7 mois
stats_2m = compute_user_stats(df, '2m', 2)
stats_5m = compute_user_stats(df, '5m', 5)
stats_7m = compute_user_stats(df, '7m', 7)

In [30]:
# Fusionner toutes les statistiques ensemble
user_stats_df = stats_2m.merge(stats_5m, on='user_id', how='outer').merge(stats_7m, on='user_id', how='outer')

In [31]:

# Calculer les statistiques supplémentaires
last_purchase = df[df['event_type'] == 'purchase'].groupby('user_id')['event_time'].max()
days_since_last_purchase = (df['event_time'].max() - last_purchase).dt.days
total_purchase_value = df[df['event_type'] == 'purchase'].groupby('user_id')['price'].sum()

In [32]:
# Ajouter les statistiques supplémentaires au DataFrame
user_stats_df['last_purchase_temp'] = last_purchase
user_stats_df['days_since_last_purchase'] = days_since_last_purchase
user_stats_df['total_purchase_value'] = total_purchase_value

In [33]:
# Calculer les abandons de panier
cart_count = df[df['event_type'] == 'cart'].groupby('user_id').size()
purchase_count = df[df['event_type'] == 'purchase'].groupby('user_id').size()
cart_abandonments = cart_count - purchase_count

In [34]:

# Utiliser .where pour remplacer les valeurs négatives par 0
cart_abandonments = cart_abandonments.where(cart_abandonments >= 0, 0)

In [35]:
user_stats_df['cart_count'] = cart_count
user_stats_df['purchase_count'] = purchase_count
user_stats_df['cart_abandonments'] = cart_abandonments

In [36]:
# Convertir la partie nécessaire en Pandas pour calculer les préférences de marque et de catégorie
df_pandas = df[df['event_type'] == 'purchase'].compute()

In [37]:
# Calculer les préférences de marque
preferred_brand = df_pandas.groupby(['user_id', 'brand']).size().reset_index()
preferred_brand.columns = ['user_id', 'brand', 'count']  # Renommer les colonnes correctement
preferred_brand = preferred_brand.loc[preferred_brand.groupby('user_id')['count'].idxmax()].set_index('user_id')['brand']


In [38]:
# Calculer les préférences de catégorie
preferred_category = df_pandas.groupby(['user_id', 'category_code']).size().reset_index()
preferred_category.columns = ['user_id', 'category_code', 'count']  # Renommer les colonnes correctement
preferred_category = preferred_category.loc[preferred_category.groupby('user_id')['count'].idxmax()].set_index('user_id')['category_code']


In [39]:

# Ajouter les préférences au DataFrame principal
user_stats_df['preferred_brand'] = preferred_brand
user_stats_df['preferred_category'] = preferred_category

In [40]:
# Calculer la période la plus active pour chaque utilisateur
most_active_time_df = df.groupby(['user_id', 'most_active_time']).size().reset_index()
most_active_time_df.columns = ['user_id', 'most_active_time', 'activity_count']  # Renommer les colonnes correctement


In [42]:
# Convertir cette partie en Pandas pour faire le rank
most_active_time_df = most_active_time_df.compute()
most_active_time_df['rank'] = most_active_time_df.groupby('user_id')['activity_count'].rank(method='first', ascending=False)


AttributeError: 'DataFrame' object has no attribute 'compute'

2024-08-29 20:37:45,159 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/distributed/comm/tcp.py", line 225, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/distributed/worker.py", line 1250, in heartbeat
    response = await retry_operation(
  File "/usr/local/lib/python3.9/dist-packages/distributed/utils_comm.py", line 459, in retry_operation
    return await retry(
  File "/usr/local/lib/python3.9/dist-packages/distributed/utils_comm.py", line 438, in retry
    return await coro()
  File "/usr/local/lib/python3.9/dist-packages/distributed/core.py", line 1254, in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
  

In [None]:
# Filtrer pour ne garder que la période la plus active
most_active_time_df = most_active_time_df[most_active_time_df['rank'] == 1].drop(columns=['rank', 0])

In [None]:
# Supprimer la colonne 'most_active_time' s'il existe déjà dans user_stats_df
if 'most_active_time' in user_stats_df.columns:
    user_stats_df = user_stats_df.drop(columns=['most_active_time'])

In [None]:
# Réaliser la jointure
user_stats_df = user_stats_df.merge(most_active_time_df[['user_id', 'most_active_time']], on='user_id', how='inner')


In [None]:




# Vérification des colonnes
print(user_stats_df.columns)

# Afficher les premières lignes
print(user_stats_df.head())

# Optionnel : Sauvegarder le DataFrame final
user_stats_df.to_parquet('./data/user_stats_df_final.parquet', engine='pyarrow')


In [43]:
import dask.dataframe as dd
from dask.distributed import Client
import pandas as pd

# Initialiser Dask
client = Client()

# Charger les données avec Dask
output_path = "./data/filtered_df_output.parquet"
df = dd.read_parquet(output_path)

# Convertir les heures en périodes de la journée
def time_of_day(hour):
    if 5 <= hour <= 11:
        return 'matin'
    elif 12 <= hour <= 16:
        return 'après-midi'
    elif 17 <= hour <= 23:
        return 'soir'
    else:
        return 'nuit'

# Ajouter une colonne pour extraire l'heure de l'événement
df['event_hour'] = df['event_time'].dt.hour

# Appliquer la fonction sur toute la colonne
df['most_active_time'] = df['event_hour'].apply(time_of_day, meta=('event_hour', 'object'))

# Fonction pour calculer les statistiques pour une période donnée
def compute_user_stats(df, period_label, period_offset):
    period_df = df[df['event_time'] >= df['event_time'].max() - pd.DateOffset(months=period_offset)]
    
    # Calculer les vues, les paniers et les achats
    number_of_views = period_df[period_df['event_type'] == 'view'].groupby('user_id').size().rename(f'number_of_views_{period_label}')
    number_of_carts = period_df[period_df['event_type'] == 'cart'].groupby('user_id').size().rename(f'number_of_carts_{period_label}')
    count_products = period_df[period_df['event_type'] == 'purchase'].groupby('user_id').size().rename(f'count_products_{period_label}')
    
    # Calculer le prix moyen des achats
    avg_price = period_df[period_df['event_type'] == 'purchase'].groupby('user_id')['price'].mean().rename(f'avg_price_{period_label}')
    
    # Calculer le nombre de sessions
    number_of_sessions = period_df.groupby('user_id')['user_session'].nunique().rename(f'number_of_sessions_{period_label}')
    
    # Fusionner toutes les statistiques
    stats = dd.concat([number_of_views, number_of_carts, count_products, avg_price, number_of_sessions], axis=1)
    return stats

# Calculer les statistiques pour les périodes de 2, 5, et 7 mois
stats_2m = compute_user_stats(df, '2m', 2)
stats_5m = compute_user_stats(df, '5m', 5)
stats_7m = compute_user_stats(df, '7m', 7)

# Fusionner toutes les statistiques ensemble
user_stats_df = stats_2m.merge(stats_5m, on='user_id', how='outer').merge(stats_7m, on='user_id', how='outer')

# Calculer les statistiques supplémentaires
last_purchase = df[df['event_type'] == 'purchase'].groupby('user_id')['event_time'].max()
days_since_last_purchase = (df['event_time'].max() - last_purchase).dt.days
total_purchase_value = df[df['event_type'] == 'purchase'].groupby('user_id')['price'].sum()

# Ajouter les statistiques supplémentaires au DataFrame
user_stats_df['last_purchase_temp'] = last_purchase
user_stats_df['days_since_last_purchase'] = days_since_last_purchase
user_stats_df['total_purchase_value'] = total_purchase_value

# Calculer les abandons de panier
cart_count = df[df['event_type'] == 'cart'].groupby('user_id').size()
purchase_count = df[df['event_type'] == 'purchase'].groupby('user_id').size()
cart_abandonments = cart_count - purchase_count

# Utiliser .where pour remplacer les valeurs négatives par 0
cart_abandonments = cart_abandonments.where(cart_abandonments >= 0, 0)

user_stats_df['cart_count'] = cart_count
user_stats_df['purchase_count'] = purchase_count
user_stats_df['cart_abandonments'] = cart_abandonments

# Convertir la partie nécessaire en Pandas pour calculer les préférences de marque et de catégorie
df_pandas = df[df['event_type'] == 'purchase'].compute()

# Calculer les préférences de marque
preferred_brand = df_pandas.groupby(['user_id', 'brand']).size().reset_index()
preferred_brand.columns = ['user_id', 'brand', 'count']  # Renommer les colonnes correctement
preferred_brand = preferred_brand.loc[preferred_brand.groupby('user_id')['count'].idxmax()].set_index('user_id')['brand']

# Calculer les préférences de catégorie
preferred_category = df_pandas.groupby(['user_id', 'category_code']).size().reset_index()
preferred_category.columns = ['user_id', 'category_code', 'count']  # Renommer les colonnes correctement
preferred_category = preferred_category.loc[preferred_category.groupby('user_id')['count'].idxmax()].set_index('user_id')['category_code']

# Ajouter les préférences au DataFrame principal
user_stats_df['preferred_brand'] = preferred_brand
user_stats_df['preferred_category'] = preferred_category

# Calculer la période la plus active pour chaque utilisateur
most_active_time_df = df.groupby(['user_id', 'most_active_time']).size().reset_index(name='activity_count')

# Convertir cette partie en Pandas pour faire le rank
most_active_time_df = most_active_time_df.compute()  # C'est déjà un DataFrame Pandas maintenant
most_active_time_df['rank'] = most_active_time_df.groupby('user_id')['activity_count'].rank(method='first', ascending=False)

# Filtrer pour ne garder que la période la plus active
most_active_time_df = most_active_time_df[most_active_time_df['rank'] == 1].drop(columns=['rank', 'activity_count'])

# Supprimer la colonne 'most_active_time' s'il existe déjà dans user_stats_df
if 'most_active_time' in user_stats_df.columns:
    user_stats_df = user_stats_df.drop(columns=['most_active_time'])

# Réaliser la jointure
user_stats_df = user_stats_df.merge(most_active_time_df[['user_id', 'most_active_time']], on='user_id', how='inner')

# Vérification des colonnes
print(user_stats_df.columns)

# Afficher les premières lignes
print(user_stats_df.head().compute())  # Utilisez compute() avant d'afficher les résultats

# Optionnel : Sauvegarder le DataFrame final
user_stats_df.to_parquet('./data/user_stats_df_final.parquet', engine='pyarrow')


Perhaps you already have a cluster running?
Hosting the HTTP server on port 41179 instead


TypeError: reset_index() got an unexpected keyword argument 'name'

Perhaps you already have a cluster running?
Hosting the HTTP server on port 35199 instead


Index(['user_id', 'number_of_views_2m', 'number_of_carts_2m',
       'count_products_2m', 'avg_price_2m', 'number_of_sessions_2m',
       'number_of_views_5m', 'number_of_carts_5m', 'count_products_5m',
       'avg_price_5m', 'number_of_sessions_5m', 'number_of_views_7m',
       'number_of_carts_7m', 'count_products_7m', 'avg_price_7m',
       'number_of_sessions_7m', 'last_purchase_temp',
       'days_since_last_purchase', 'total_purchase_value', 'cart_count',
       'purchase_count', 'cart_abandonments', 'preferred_brand',
       'preferred_category', 'most_active_time'],
      dtype='object')


AssertionError: 



In [23]:
# import dask.dataframe as dd
# from dask.distributed import Client
# import pandas as pd

# # Initialiser Dask
# client = Client()

# # Charger les données avec Dask
# output_path = "./data/filtered_df_output.parquet"
# df = dd.read_parquet(output_path)

# # Convertir les heures en périodes de la journée
# def time_of_day(hour):
#     if 5 <= hour <= 11:
#         return 'matin'
#     elif 12 <= hour <= 16:
#         return 'après-midi'
#     elif 17 <= hour <= 23:
#         return 'soir'
#     else:
#         return 'nuit'

# # Ajouter une colonne pour extraire l'heure de l'événement
# df['event_hour'] = df['event_time'].dt.hour

# # Appliquer la fonction sur toute la colonne
# df['most_active_time'] = df['event_hour'].apply(time_of_day, meta=('event_hour', 'object'))

# # Calculer les statistiques pour une période donnée
# def compute_user_stats(df, period_label, period_offset):
#     period_df = df[df['event_time'] >= df['event_time'].max() - pd.DateOffset(months=period_offset)]
    
#     # Calculer les vues, les paniers et les achats
#     number_of_views = period_df[period_df['event_type'] == 'view'].groupby('user_id').size().rename(f'number_of_views_{period_label}')
#     number_of_carts = period_df[period_df['event_type'] == 'cart'].groupby('user_id').size().rename(f'number_of_carts_{period_label}')
#     count_products = period_df[period_df['event_type'] == 'purchase'].groupby('user_id').size().rename(f'count_products_{period_label}')
    
#     # Calculer le prix moyen des achats
#     avg_price = period_df[period_df['event_type'] == 'purchase'].groupby('user_id')['price'].mean().rename(f'avg_price_{period_label}')
    
#     # Calculer le nombre de sessions
#     number_of_sessions = period_df.groupby('user_id')['user_session'].nunique().rename(f'number_of_sessions_{period_label}')
    
#     # Fusionner toutes les statistiques
#     stats = dd.concat([number_of_views, number_of_carts, count_products, avg_price, number_of_sessions], axis=1)
#     return stats

# # Calculer les statistiques pour les périodes de 2, 5, et 7 mois
# stats_2m = compute_user_stats(df, '2m', 2)
# stats_5m = compute_user_stats(df, '5m', 5)
# stats_7m = compute_user_stats(df, '7m', 7)

# # Fusionner toutes les statistiques ensemble
# user_stats_df = stats_2m.merge(stats_5m, on='user_id', how='outer').merge(stats_7m, on='user_id', how='outer')

# # Calculer les statistiques supplémentaires
# last_purchase = df[df['event_type'] == 'purchase'].groupby('user_id')['event_time'].max()
# days_since_last_purchase = (df['event_time'].max() - last_purchase).dt.days
# total_purchase_value = df[df['event_type'] == 'purchase'].groupby('user_id')['price'].sum()

# # Ajouter les statistiques supplémentaires au DataFrame
# user_stats_df['last_purchase_temp'] = last_purchase
# user_stats_df['days_since_last_purchase'] = days_since_last_purchase
# user_stats_df['total_purchase_value'] = total_purchase_value

# # Calculer les abandons de panier
# cart_count = df[df['event_type'] == 'cart'].groupby('user_id').size()
# purchase_count = df[df['event_type'] == 'purchase'].groupby('user_id').size()
# cart_abandonments = cart_count - purchase_count

# # Utiliser .where pour remplacer les valeurs négatives par 0
# cart_abandonments = cart_abandonments.where(cart_abandonments >= 0, 0)

# user_stats_df['cart_count'] = cart_count
# user_stats_df['purchase_count'] = purchase_count
# user_stats_df['cart_abandonments'] = cart_abandonments

# # Convertir la partie nécessaire en Pandas pour calculer les préférences de marque et de catégorie
# df_pandas = df[df['event_type'] == 'purchase'].compute()

# # Calculer les préférences de marque
# preferred_brand = df_pandas.groupby(['user_id', 'brand']).size().reset_index()
# preferred_brand.columns = ['user_id', 'brand', 'count']  # Renommer les colonnes correctement
# preferred_brand = preferred_brand.loc[preferred_brand.groupby('user_id')['count'].idxmax()].set_index('user_id')['brand']

# # Calculer les préférences de catégorie
# preferred_category = df_pandas.groupby(['user_id', 'category_code']).size().reset_index()
# preferred_category.columns = ['user_id', 'category_code', 'count']  # Renommer les colonnes correctement
# preferred_category = preferred_category.loc[preferred_category.groupby('user_id')['count'].idxmax()].set_index('user_id')['category_code']

# # Ajouter les préférences au DataFrame principal
# user_stats_df['preferred_brand'] = preferred_brand
# user_stats_df['preferred_category'] = preferred_category

# # Calculer la période la plus active pour chaque utilisateur
# most_active_time_df = df.groupby(['user_id', 'most_active_time']).size().reset_index()
# most_active_time_df.columns = ['user_id', 'most_active_time', 'activity_count']  # Renommer la colonne de taille pour éviter les erreurs

# # Calculer le rang de l'activité
# most_active_time_df['rank'] = most_active_time_df.groupby('user_id')['activity_count'].rank(method='first', ascending=False)

# # Filtrer pour ne garder que la période la plus active
# most_active_time_df = most_active_time_df[most_active_time_df['rank'] == 1].drop(columns=['rank', 'activity_count'])

# # Supprimer la colonne 'most_active_time' s'il existe déjà dans user_stats_df
# if 'most_active_time' in user_stats_df.columns:
#     user_stats_df = user_stats_df.drop(columns=['most_active_time'])

# # Réaliser la jointure
# user_stats_df = user_stats_df.merge(most_active_time_df[['user_id', 'most_active_time']], on='user_id', how='inner')

# # Vérification des colonnes
# print(user_stats_df.columns)

# # Afficher les premières lignes
# print(user_stats_df.head().compute())  # Utilisez compute() avant d'afficher les résultats

# # Optionnel : Sauvegarder le DataFrame final
# user_stats_df.to_parquet('./data/user_stats_df_final.parquet', engine='pyarrow')


Perhaps you already have a cluster running?
Hosting the HTTP server on port 41951 instead


AttributeError: 'Column not found: rank'