In [1]:
import psycopg2
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import os
from sqlalchemy import text
from dotenv import load_dotenv

## Mise en dataframe des données

In [2]:
# Mise en DataFrame des csv
# Liste pour stockers les dataframes
df_list = {}
# Dossier avec les csv
data_folder = "data"
# Boucle sur les fichiers du dossier 
for csv in os.listdir(data_folder):
    if csv.endswith(".csv"): # Filtre les fichiers au format csv
        file_path = os.path.join(data_folder, csv)
        df = pd.read_csv(file_path) # Lecture des csv
        # Stockage dans le dictionnaire df_list
        name = os.path.splitext(csv)[0]
        df_list[name] = df
        print(f'{csv} chargé : {df.shape[0]} lignes, {df.shape[1]} colonnes')

olist_order_payments_dataset.csv chargé : 103886 lignes, 5 colonnes
olist_order_reviews_dataset.csv chargé : 99224 lignes, 7 colonnes
olist_products_dataset.csv chargé : 32951 lignes, 9 colonnes
olist_customers_dataset.csv chargé : 99441 lignes, 5 colonnes
olist_sellers_dataset.csv chargé : 3095 lignes, 4 colonnes
olist_orders_dataset.csv chargé : 99441 lignes, 8 colonnes
olist_geolocation_dataset.csv chargé : 1000163 lignes, 5 colonnes
product_category_name_translation.csv chargé : 71 lignes, 2 colonnes
olist_order_items_dataset.csv chargé : 112650 lignes, 7 colonnes


## Connexion à supabase

In [3]:
# Configuration de connexion (à adapter selon votre provider)
load_dotenv()


host = os.getenv('DB_HOST')  #  host Supabase
password = os.getenv('DB_PASSWORD_BIS')
user = os.getenv('DB_USER_BIS')
database = os.getenv('DB_DATABASE')
port = os.getenv('DB_PORT')


# Création de l'engine SQLAlchemy
connection_string = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(connection_string)

# Test de connexion
def test_connection():
    """
    Testez votre connexion à la base

    Étapes :
    1. Utilisez pd.read_sql() pour exécuter "SELECT version()"
    2. Affichez la version PostgreSQL
    3. Gérez les erreurs de connexion
    """
    try:
        df = pd.read_sql('SELECT version();', engine)
        print('Connexion Ok | Version : ', df.iloc[0,0])
    except Exception as e:
        print(f"Erreur de connexion : {e}")
        return False
    return True
test_connection()

Connexion Ok | Version :  PostgreSQL 17.4 on aarch64-unknown-linux-gnu, compiled by gcc (GCC) 13.2.0, 64-bit


True

## Création tables


In [4]:
### 🗃️ Création des tables SQL

for table_name, df in df_list.items():
    df.to_sql(name=table_name, con=engine, if_exists='replace', index=False)


## Analyse RFM

In [None]:
# RECENCE
recence_query_max = """
SELECT MAX(order_purchase_timestamp) AS ref_date FROM olist_orders_dataset;
"""
df_recence = pd.read_sql_query(recence_query_max, con=engine)
print(df_recence)
ref_date = df_recence['ref_date'].iloc[0]
print(ref_date)

# S’assurer que ref_date est bien un datetime natif
ref_date = pd.to_datetime(ref_date).to_pydatetime()
print(type(ref_date))

recency_query = text("""
    SELECT 
        customer_id,
        DATE_PART(
            'day', 
            CAST(:ref_date AS timestamp) - CAST(MAX(order_purchase_timestamp) AS timestamp)
        ) AS recency_days,
        NTILE(5) OVER (
            ORDER BY DATE_PART(
                'day', 
                CAST(:ref_date AS timestamp) - CAST(MAX(order_purchase_timestamp) AS timestamp)
            )
        ) AS r_score
    FROM olist_orders_dataset
    GROUP BY customer_id
                     
""")



df_rfm_recency = pd.read_sql_query(recency_query, con=engine, params={"ref_date": ref_date})

print(df_rfm_recency)

              ref_date
0  2018-10-17 17:30:18
2018-10-17 17:30:18
<class 'datetime.datetime'>
                            customer_id  recency_days  r_score
0      856336203359aa6a61bf3826f7d84c49           0.0        1
1      a4b417188addbc05b26b72d5e44837a1           0.0        1
2      4c2ec60c29d10c34bd49cb88aa85cfc4          13.0        1
3      bf6181a85bbb4115736c0a8db1a53be3          16.0        1
4      2823ffda607a2316375088e0d00005ec          18.0        1
...                                 ...           ...      ...
99436  b106b360fe2ef8849fbbd056f777b4d5         744.0        5
99437  86dc2ffce2dfff336de2f386a786e574         762.0        5
99438  622e13439d6b5a0b486c435618b2679e         764.0        5
99439  08c5351a6aca1c1589a38f244edeee9d         772.0        5
99440  683c54fc24d40ee9f8a6fc179fd9856c         772.0        5

[99441 rows x 3 columns]


In [None]:
verif_order = """
CREATE VIEW frequency AS
    SELECT 
        c.customer_unique_id,
        COUNT(DISTINCT o.order_id) AS nb_commandes,
        NTILE(5) OVER (ORDER BY COUNT(o.order_id)) AS f_score
    FROM olist_orders_dataset o
    JOIN olist_customers_dataset c ON o.customer_id = c.customer_id
    WHERE o.order_status = 'delivered'
    GROUP BY c.customer_unique_id
"""


frequence = """
SELECT * FROM frequency
ORDER BY nb_commandes DESC;
"""

df_verif = pd.read_sql_query(frequence, con=engine)
print(df_verif)

                     customer_unique_id  nb_commandes  f_score
0      8d50f5eadf50201ccdcedfb9e2ac8455            15        5
1      3e43e6105506432c953e165fb2acf44c             9        5
2      ca77025e7201e3b30c44b472ff346268             7        5
3      6469f99c1f9dfae7733b25662e7f1782             7        5
4      1b6c7548a2a1f9037c1fd3ddfed95f33             7        5
...                                 ...           ...      ...
93353  7cce791a0d926ac8490ca7135968da8a             1        2
93354  7ccedf663c0cde4d0210c10493fac745             1        2
93355  7ccf289b6c5f2c9bb62bb044cc15fa5e             1        2
93356  7ccf66a9802ab7edf4361d8b6adead71             1        2
93357  7ccff7239174883e2ed56008ac04e184             1        2

[93358 rows x 3 columns]


In [30]:
# Fréquence

freq = text("""
SELECT
    c.customer_unique_id,
    COUNT(o.order_id) AS order_count,
    NTILE(5) OVER (ORDER BY COUNT(o.order_id)) AS f_score
FROM olist_orders_dataset o
JOIN olist_customers_dataset c ON o.customer_id = c.customer_id
GROUP BY c.customer_unique_id
ORDER BY order_count DESC;
""")

total_order = pd.read_sql(freq, con=engine)
print(total_order.value_counts())


customer_unique_id                order_count  f_score
0000366f3b9a7992bf8c76cfdf3221e2  1            4          1
0000b849f77a49e4a4ce2b2a4ca5be3f  1            2          1
0000f46a3911fa3c0805444483337064  1            5          1
0000f6ccb0745a6a4b88665a16c9f078  1            4          1
0004aac84e0df4da2b147fca70cf8255  1            5          1
                                                         ..
fffcf5a5ff07b0908bd4e2dbc735a684  1            1          1
fffea47cd6d3cc0a88bd621562a9d061  1            2          1
ffff371b4d645b6ecea244b27531430a  1            2          1
ffff5962728ec6157033ef9805bacc48  1            4          1
ffffd2657e2aad2907e67c3e9daecbeb  1            5          1
Name: count, Length: 96096, dtype: int64
