In [2]:
import psycopg2
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import os
from sqlalchemy import text
from dotenv import load_dotenv

## Mise en dataframe des données

In [2]:
# Mise en DataFrame des csv
# Liste pour stockers les dataframes
df_list = {}
# Dossier avec les csv
data_folder = "data"
# Boucle sur les fichiers du dossier 
for csv in os.listdir(data_folder):
    if csv.endswith(".csv"): # Filtre les fichiers au format csv
        file_path = os.path.join(data_folder, csv)
        df = pd.read_csv(file_path) # Lecture des csv
        # Stockage dans le dictionnaire df_list
        name = os.path.splitext(csv)[0]
        df_list[name] = df
        print(f'{csv} chargé : {df.shape[0]} lignes, {df.shape[1]} colonnes')

olist_order_payments_dataset.csv chargé : 103886 lignes, 5 colonnes
olist_order_reviews_dataset.csv chargé : 99224 lignes, 7 colonnes
olist_products_dataset.csv chargé : 32951 lignes, 9 colonnes
olist_customers_dataset.csv chargé : 99441 lignes, 5 colonnes
olist_sellers_dataset.csv chargé : 3095 lignes, 4 colonnes
olist_orders_dataset.csv chargé : 99441 lignes, 8 colonnes
olist_geolocation_dataset.csv chargé : 1000163 lignes, 5 colonnes
product_category_name_translation.csv chargé : 71 lignes, 2 colonnes
olist_order_items_dataset.csv chargé : 112650 lignes, 7 colonnes


## Connexion à supabase

In [3]:
# Configuration de connexion (à adapter selon votre provider)
load_dotenv()


host = os.getenv('DB_HOST')  #  host Supabase
password = os.getenv('DB_PASSWORD_BIS')
user = os.getenv('DB_USER_BIS')
database = os.getenv('DB_DATABASE')
port = os.getenv('DB_PORT')


# Création de l'engine SQLAlchemy
connection_string = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(connection_string)

# Test de connexion
def test_connection():
    """
    Testez votre connexion à la base

    Étapes :
    1. Utilisez pd.read_sql() pour exécuter "SELECT version()"
    2. Affichez la version PostgreSQL
    3. Gérez les erreurs de connexion
    """
    try:
        df = pd.read_sql('SELECT version();', engine)
        print('Connexion Ok | Version : ', df.iloc[0,0])
    except Exception as e:
        print(f"Erreur de connexion : {e}")
        return False
    return True
test_connection()

Connexion Ok | Version :  PostgreSQL 17.4 on aarch64-unknown-linux-gnu, compiled by gcc (GCC) 13.2.0, 64-bit


True

## Création tables


In [8]:
### 🗃️ Création des tables SQL

for table_name, df in df_list.items():
    df.to_sql(name=table_name, con=engine, if_exists='replace', index=False)


## Analyse RFM

In [None]:
# RECENCE

recency_query = text("""
    SELECT 
        customer_id,
        DATE_PART(
            'day', 
            CAST(:ref_date AS timestamp) - CAST(MAX(order_purchase_timestamp) AS timestamp)
        ) AS recency_days,
        NTILE(5) OVER (
            ORDER BY DATE_PART(
                'day', 
                CAST(:ref_date AS timestamp) - CAST(MAX(order_purchase_timestamp) AS timestamp)
            )
        ) AS r_score
    FROM olist_orders_dataset
    GROUP BY customer_id
""")

# S’assurer que ref_date est bien un datetime natif
ref_date = pd.to_datetime(ref_date).to_pydatetime()

df_rfm_recency = pd.read_sql_query(recency_query, con=engine, params={"ref_date": ref_date})

print(df_rfm_recency)

In [26]:
verif_order = text("""
SELECT 
    c.customer_unique_id,
    COUNT(o.order_id) AS nb_commandes
FROM olist_orders_dataset o
JOIN olist_customers_dataset c ON o.customer_id = c.customer_id
GROUP BY c.customer_unique_id
ORDER BY nb_commandes DESC;

""")

df_verif = pd.read_sql_query(verif_order, con=engine)
print(df_verif)

                     customer_unique_id  nb_commandes
0      8d50f5eadf50201ccdcedfb9e2ac8455            17
1      3e43e6105506432c953e165fb2acf44c             9
2      6469f99c1f9dfae7733b25662e7f1782             7
3      ca77025e7201e3b30c44b472ff346268             7
4      1b6c7548a2a1f9037c1fd3ddfed95f33             7
...                                 ...           ...
96091  f9d2915378a9e9c4fd47edf5717c4949             1
96092  d4141c6e605904528699a2ad1d5e1787             1
96093  c16948c2abc51af80a151392f718e895             1
96094  dc9f99eb8458b74aea47c32170ae1374             1
96095  f98ea135d87661aac5b16fe7fed30c30             1

[96096 rows x 2 columns]


In [23]:
# Fréquence

freq = text("""
SELECT
    c.customer_unique_id,
    COUNT(o.order_id) AS order_count,
    NTILE(5) OVER (ORDER BY COUNT(o.order_id)) AS f_score
FROM olist_orders_dataset o
JOIN olist_customers_dataset c ON o.customer_id = c.customer_id
GROUP BY c.customer_unique_id
ORDER BY order_count DESC;
""")

total_order = pd.read_sql(freq, con=engine)
print(total_order.value_counts())


customer_unique_id                order_count  f_score
0000366f3b9a7992bf8c76cfdf3221e2  1            4          1
0000b849f77a49e4a4ce2b2a4ca5be3f  1            2          1
0000f46a3911fa3c0805444483337064  1            5          1
0000f6ccb0745a6a4b88665a16c9f078  1            4          1
0004aac84e0df4da2b147fca70cf8255  1            5          1
                                                         ..
fffcf5a5ff07b0908bd4e2dbc735a684  1            1          1
fffea47cd6d3cc0a88bd621562a9d061  1            2          1
ffff371b4d645b6ecea244b27531430a  1            2          1
ffff5962728ec6157033ef9805bacc48  1            4          1
ffffd2657e2aad2907e67c3e9daecbeb  1            5          1
Name: count, Length: 96096, dtype: int64
