In [1]:
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine, text

# Connection settings for the target PostgreSQL database.
# Each value can be overridden with an environment variable (OLIST_DB_HOST,
# OLIST_DB_NAME, OLIST_DB_USER, OLIST_DB_PASSWORD) so that real credentials
# never have to be written into the notebook. The fallbacks are the original
# local-development values.
db_host = os.environ.get("OLIST_DB_HOST", "localhost")
db_name = os.environ.get("OLIST_DB_NAME", "olist")
db_user = os.environ.get("OLIST_DB_USER", "postgres")
db_password = os.environ.get("OLIST_DB_PASSWORD", "admin")

# Create the SQLAlchemy engine used by every cell below.
# The user name and password are percent-encoded so that characters such as
# "@", "/" or ":" cannot corrupt the URL; for the default values the resulting
# URL is identical to the unescaped one.
engine = create_engine(
    f"postgresql://{quote(db_user, safe='')}:{quote(db_password, safe='')}@{db_host}/{db_name}"
)


# Customer

In [2]:
# Read the customers CSV file with pandas and preview its first rows.
# NOTE(review): the preview below shows zip code prefixes parsed as integers
# (e.g. 9790, 1151), so any leading zeros present in the file are not kept —
# confirm this matches what the olist_zipcode table expects.
data = pd.read_csv("data/olist_customers_dataset.csv", encoding="utf-8")
data.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [3]:
# Split the flat customers file into one frame per target table. Each frame is
# de-duplicated and its columns are renamed to the column names of that table.
zip_codes = data[['customer_zip_code_prefix']].drop_duplicates()
zip_codes.columns = ['zip_code_prefix']

states = data[['customer_state']].drop_duplicates()
states.columns = ['state_code']

cities = data[['customer_city', 'customer_state']].drop_duplicates()
cities.columns = ['city_name', 'state_code']

city_zip_codes = data[['customer_city', 'customer_state', 'customer_zip_code_prefix']].drop_duplicates()
city_zip_codes.columns = ['city_name', 'state_code', 'zip_code_prefix']

customers = data[['customer_id', 'customer_unique_id', 'customer_zip_code_prefix']].drop_duplicates()
customers.columns = ['customer_id', 'customer_unique_id', 'zip_code_prefix']

# Run every write in a single transaction: if any insert fails nothing is
# committed, so the cell can be re-run once the problem is fixed instead of
# leaving a partially loaded database behind.
with engine.begin() as connection:
    # Insert order is kept from the original cell.
    # presumably lookup tables must exist before the rows referencing them — verify against the schema
    zip_codes.to_sql('olist_zipcode', connection, if_exists='append', index=False)
    states.to_sql('olist_state', connection, if_exists='append', index=False)
    cities.to_sql('olist_city', connection, if_exists='append', index=False)
    customers.to_sql('olist_customer', connection, if_exists='append', index=False)

    # Read the cities back to obtain their city_id. The same connection is used,
    # so the rows inserted just above are visible.
    cities_db = pd.read_sql("SELECT city_id, city_name, state_code FROM olist_city", connection)

    # Attach the city_id to every (city, state, zip code) combination
    city_zip_codes_joined = city_zip_codes.merge(cities_db, on=['city_name', 'state_code'])

    # Insert the city / zip code links
    city_zip_codes_joined[['city_id', 'zip_code_prefix']].to_sql('olist_cityzipcode', connection, if_exists='append', index=False)

# Orders

In [None]:
# Read the orders CSV file with pandas and preview its first rows
data = pd.read_csv("data/olist_orders_dataset.csv", encoding="utf-8")
data.head()

In [None]:
# Columns of the orders file that are converted to datetimes before loading
date_columns = [
    'order_approved_at',
    'order_purchase_timestamp',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
# Convert all of them in one column-wise pass
data[date_columns] = data[date_columns].apply(pd.to_datetime)

# Append the orders to the olist_order table; the DataFrame index is not written
data.to_sql(name="olist_order", con=engine, if_exists="append", index=False)

# Order Payments

In [None]:
# Read the order payments CSV file with pandas and preview its first rows
data = pd.read_csv("data/olist_order_payments_dataset.csv", encoding="utf-8")
data.head()

In [None]:
# Append the payments to the olist_orderpayment table; the DataFrame index is not written
data.to_sql(name="olist_orderpayment", con=engine, if_exists="append", index=False)

# Order Reviews

In [None]:
# Read the order reviews CSV file with pandas and preview its first rows
data = pd.read_csv("data/olist_order_reviews_dataset.csv", encoding="utf-8")
data.head()

In [None]:
# Append the reviews to the olist_orderreview table; the DataFrame index is not written
data.to_sql(name="olist_orderreview", con=engine, if_exists="append", index=False)

# Product Category Name Translations

In [None]:
# Read the product category name translation CSV file with pandas and preview its first rows
data = pd.read_csv("data/product_category_name_translation.csv", encoding="utf-8")
data.head()

In [None]:
# Append the translations to the olist_productcategorynametranslation table; the DataFrame index is not written
data.to_sql(name="olist_productcategorynametranslation", con=engine, if_exists="append", index=False)

# Products

In [None]:
# Read the products CSV file with pandas and preview its first rows
data = pd.read_csv("data/olist_products_dataset.csv", encoding="utf-8")
data.head()

In [None]:
# Rename the category column before loading.
# presumably this is the column name used by the olist_product table — verify against the schema
data = data.rename(columns={"product_category_name": "product_category_name_translation"})

# Products without a category get an explicit placeholder. The result is
# assigned back instead of calling fillna(inplace=True) on the selected column:
# the in-place form acts on an intermediate object and is not guaranteed to
# update `data` (it has no effect under pandas copy-on-write).
data["product_category_name_translation"] = data["product_category_name_translation"].fillna("unknown_category")

# Every category used by a product. This is computed after the fill so the
# placeholder is part of the check and gets a translation row like any other
# category missing from the translation table.
unique_categories = set(data["product_category_name_translation"].unique())

# Both writes share one transaction so a failure leaves the database unchanged
with engine.begin() as connection:
    existing_categories = pd.read_sql("SELECT product_category_name FROM olist_productcategorynametranslation", connection)
    missing_categories = unique_categories - set(existing_categories["product_category_name"].values)

    if missing_categories:
        # Sorted only to make the insert order deterministic
        missing_category_names = sorted(missing_categories)
        missing_categories_df = pd.DataFrame({"product_category_name": missing_category_names,
                                              "product_category_name_english": ["unknown"] * len(missing_category_names)})
        missing_categories_df.to_sql('olist_productcategorynametranslation', connection, if_exists='append', index=False)

    data.to_sql('olist_product', connection, if_exists='append', index=False)

# Sellers

In [None]:
# Read the sellers CSV file with pandas and preview its first rows
data = pd.read_csv("data/olist_sellers_dataset.csv", encoding="utf-8")
data.head()

In [None]:
# Load the sellers file. Zip codes, states, cities and city / zip code links
# that the database does not hold yet are added first, then the sellers.
# All reads and writes share one transaction, so a failure part-way through
# leaves the database unchanged and the cell can be re-run.
with engine.begin() as connection:
    # --- Add the missing zip codes ---
    unique_zip_codes = data['seller_zip_code_prefix'].drop_duplicates().to_frame()
    unique_zip_codes.columns = ['zip_code_prefix']
    # Both sides of the comparison are cast to str so the merge keys share a dtype
    unique_zip_codes['zip_code_prefix'] = unique_zip_codes['zip_code_prefix'].astype(str)

    existing_zip_codes = pd.read_sql("SELECT zip_code_prefix FROM olist_zipcode", connection)
    existing_zip_codes['zip_code_prefix'] = existing_zip_codes['zip_code_prefix'].astype(str)

    # Left merge with an indicator, keeping only the rows found in the file but not in the database
    missing_zip_codes = unique_zip_codes.merge(existing_zip_codes, on='zip_code_prefix', how='left', indicator=True)
    missing_zip_codes = missing_zip_codes[missing_zip_codes['_merge'] == 'left_only'][['zip_code_prefix']]

    missing_zip_codes.to_sql('olist_zipcode', connection, if_exists='append', index=False)

    # --- Add the missing states ---
    unique_states = data['seller_state'].drop_duplicates()
    states_db = pd.read_sql("SELECT state_code FROM olist_state", connection)
    missing_states = unique_states.loc[~unique_states.isin(states_db['state_code'])]
    # to_frame(name=...) keeps the values and renames the column. Building the
    # frame with pd.DataFrame(series, columns=['state_code']) would look up a
    # column called 'state_code' in a Series named 'seller_state' and lose the
    # values, so missing states would never be inserted.
    missing_states = missing_states.to_frame(name='state_code')
    missing_states.to_sql('olist_state', connection, if_exists='append', index=False)

    # --- Add the missing cities ---
    unique_cities = data[['seller_city', 'seller_state']].drop_duplicates()
    cities_db = pd.read_sql("SELECT city_name, state_code FROM olist_city", connection)
    missing_cities = unique_cities.merge(cities_db, left_on=['seller_city', 'seller_state'], right_on=['city_name', 'state_code'], how='left', indicator=True)
    missing_cities = missing_cities[missing_cities['_merge'] == 'left_only'][['seller_city', 'seller_state']]
    missing_cities.columns = ['city_name', 'state_code']  # rename to the olist_city column names
    missing_cities.to_sql('olist_city', connection, if_exists='append', index=False)

    # --- Add the missing city / zip code links ---
    # Re-read the cities so the rows inserted just above come back with their city_id
    cities_db = pd.read_sql("SELECT city_id, city_name, state_code FROM olist_city", connection)
    city_zip_codes = data[['seller_city', 'seller_state', 'seller_zip_code_prefix']].drop_duplicates()
    city_zip_codes_joined = city_zip_codes.merge(cities_db, left_on=['seller_city', 'seller_state'], right_on=['city_name', 'state_code'])
    city_zip_codes_joined = city_zip_codes_joined[['city_id', 'seller_zip_code_prefix']]
    city_zip_codes_joined.columns = ['city_id', 'zip_code_prefix']  # rename to the olist_cityzipcode column names

    existing_city_zip_codes = pd.read_sql("SELECT city_id, zip_code_prefix FROM olist_cityzipcode", connection)

    # Align the dtypes of the merge keys on both sides
    city_zip_codes_joined['city_id'] = city_zip_codes_joined['city_id'].astype(int)
    city_zip_codes_joined['zip_code_prefix'] = city_zip_codes_joined['zip_code_prefix'].astype(str)
    existing_city_zip_codes['city_id'] = existing_city_zip_codes['city_id'].astype(int)
    existing_city_zip_codes['zip_code_prefix'] = existing_city_zip_codes['zip_code_prefix'].astype(str)

    # Keep only the links that are not in the database yet
    city_zip_codes_to_insert = city_zip_codes_joined.merge(existing_city_zip_codes, on=['city_id', 'zip_code_prefix'], how='left', indicator=True)
    city_zip_codes_to_insert = city_zip_codes_to_insert[city_zip_codes_to_insert['_merge'] == 'left_only'][['city_id', 'zip_code_prefix']]

    city_zip_codes_to_insert.to_sql('olist_cityzipcode', connection, if_exists='append', index=False)

    # --- Insert the sellers themselves ---
    sellers_data = data[['seller_id', 'seller_zip_code_prefix']].copy()
    sellers_data.columns = ['seller_id', 'zip_code_prefix']  # rename to the olist_seller column names
    sellers_data.to_sql('olist_seller', connection, if_exists='append', index=False)