### Predictive modeling of Orders (Order_payment, review, items and Orders)


In [19]:
import pandas as pd
import snowflake.connector
import os
from dotenv import load_dotenv
from pathlib import Path

# Carrega as variáveis de ambiente
env_path = Path('.') / 'environment.env'
load_dotenv(dotenv_path=env_path)
SF_USER = os.getenv("SF_USER")
SF_PASSWORD = os.getenv("SF_PASSWORD")
SF_ACCOUNT = os.getenv("SF_ACCOUNT")
SF_WAREHOUSE = os.getenv("SF_WAREHOUSE")
SF_DATABASE = os.getenv("SF_DATABASE")
SF_SCHEMA = os.getenv("SF_SCHEMA")

# Conecta ao Snowflake
conn = snowflake.connector.connect(
    user=SF_USER,
    password=SF_PASSWORD,
    account=SF_ACCOUNT,
    warehouse=SF_WAREHOUSE,
    database=SF_DATABASE,
    schema=SF_SCHEMA
)

# Carrega as tabelas
print("Carregando tabelas do Snowflake...")
df_orders = pd.read_sql("SELECT * FROM orders_refined", conn)
df_orders_reviews = pd.read_sql("SELECT * FROM order_reviews_refined", conn)
df_order_payments = pd.read_sql("SELECT * FROM order_payments_refined", conn)
df_order_items = pd.read_sql("SELECT * FROM order_items_refined", conn)
df_products = pd.read_sql("SELECT * FROM products_refined", conn)
# 💡 NOVO: Carregando a tabela de clientes
df_customers = pd.read_sql("SELECT * FROM customers_refined", conn)

# Fecha a conexão
conn.close()

# Padroniza os nomes das colunas
for df in [df_orders, df_orders_reviews, df_order_payments, df_order_items, df_products, df_customers]:
    df.columns = df.columns.str.lower()

# Realiza as junções sequenciais
print("Unindo as tabelas...")
df_full_orders = df_orders.merge(df_orders_reviews, on='order_id', how='left')
df_full_orders = df_full_orders.merge(df_order_payments, on='order_id', how='left')
df_full_orders = df_full_orders.merge(df_order_items, on='order_id', how='left')
df_full_orders = df_full_orders.merge(df_products, on='product_id', how='left')
# 💡 NOVO: Juntando a tabela de clientes
df_full_orders = df_full_orders.merge(
    df_customers[['customer_id', 'customer_state', 'customer_zip_code_prefix']],
    on='customer_id',
    how='left'
)

print("Junção completa. O DataFrame final tem o formato:", df_full_orders.shape)
print(df_full_orders.head())

Carregando tabelas do Snowflake...


  df_orders = pd.read_sql("SELECT * FROM orders_refined", conn)
  df_orders_reviews = pd.read_sql("SELECT * FROM order_reviews_refined", conn)
  df_order_payments = pd.read_sql("SELECT * FROM order_payments_refined", conn)
  df_order_items = pd.read_sql("SELECT * FROM order_items_refined", conn)
  df_products = pd.read_sql("SELECT * FROM products_refined", conn)
  df_customers = pd.read_sql("SELECT * FROM customers_refined", conn)


Unindo as tabelas...
Junção completa. O DataFrame final tem o formato: (476572, 56)
                           order_id                       customer_id  \
0  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
1  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
2  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  85ce859fd6dc634de8d2f1e290444043  059f7fc5719c7da6cbafe370971a8d70   

  order_status order_purchase_timestamp    order_approved_at  \
0    delivered      1511033286000000000  1511034359000000000   
1    delivered      1511033286000000000  1511034359000000000   
2    delivered      1511033286000000000  1511034359000000000   
3    delivered      1511033286000000000  1511034359000000000   
4    delivered      1511222621000000000  1511223262000000000   

  order_delivered_carrier_date order_delivered_customer_date  \
0          15113579990000000

### Treinamento 

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import joblib
import os

# --- 1. Preparação dos Dados ---
print("Iniciando a preparação dos dados...")

# Lista completa de features numéricas e categóricas
all_numeric_features = [
    'price',
    'freight_value',
    'payment_installments',
    'total_delivery_time_hours',
    'shipping_time_hours',
    'product_weight_g',
    'product_volume_cm3'
]

categorical_features = [
    'customer_state'
]

# 💡 CORREÇÃO CRUCIAL: Converte todas as features numéricas para o tipo correto
for col in all_numeric_features:
    df_full_orders[col] = pd.to_numeric(df_full_orders[col], errors='coerce')

# Preenche os valores nulos com a mediana para as colunas numéricas
for col in all_numeric_features:
    median_val = df_full_orders[col].median()
    df_full_orders[col].fillna(median_val, inplace=True)

# 💡 CORREÇÃO: Garante que a coluna review_score seja numérica e sem nulos ANTES de usá-la
df_full_orders['review_score'] = pd.to_numeric(df_full_orders['review_score'], errors='coerce')
df_full_orders.dropna(subset=['review_score', 'delivery_delay_hours'], inplace=True)

# 💡 NOVO: Codifica as features categóricas com One-Hot Encoding
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_features = one_hot_encoder.fit_transform(df_full_orders[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=one_hot_encoder.get_feature_names_out(categorical_features))
encoded_df.index = df_full_orders.index

# Combina as features numéricas e categóricas
X_combined = pd.concat([df_full_orders[all_numeric_features], encoded_df], axis=1)


# --- 2. Modelagem para Prever 'is_satisfied' (Classificação) ---
print("\n--- Modelagem para prever se o cliente está satisfeito ---")

# 💡 CORREÇÃO: Garante que a coluna review_score seja numérica e sem nulos ANTES de usá-la
df_full_orders['is_satisfied'] = df_full_orders['review_score'].apply(lambda score: 1 if score >= 4 else 0)
X = X_combined
y_satisfied = df_full_orders['is_satisfied']

X_train, X_test, y_train, y_test = train_test_split(
    X, y_satisfied, test_size=0.20, random_state=42
)

models_classification = {
    "Logistic Regression": LogisticRegression(random_state=42, solver='liblinear'),
    "Random Forest Classifier": RandomForestClassifier(n_estimators=10, random_state=42),
    "XGBoost Classifier": XGBClassifier(n_estimators=10, random_state=42)
}

for name, model in models_classification.items():
    print(f"\nTreinando {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Métricas para {name}:")
    print(f"Acurácia: {accuracy:.4f}")
    print(f"Precisão: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

# --- 💡 NOVO: Salvando o modelo e o encoder ---
print("\n--- Salvando o modelo e o encoder ---")
# Define o diretório para salvar
model_dir = 'models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Salva o modelo de classificação que você quer (ex: Random Forest)
joblib.dump(models_classification['Random Forest Classifier'], os.path.join(model_dir, 'rf_classifier_satisfied.joblib'))
# Salva o encoder para garantir que a API use a mesma codificação
joblib.dump(one_hot_encoder, os.path.join(model_dir, 'one_hot_encoder.joblib'))
print("Modelo e encoder salvos com sucesso!")


# --- 3. Modelagem para Prever 'delivery_delay_hours' (Regressão) ---
print("\n--- Previsão de Delivery Delay Hours ---")

# Garantindo que a coluna de atraso está pronta para o modelo
df_full_orders['delivery_delay_hours'] = pd.to_numeric(df_full_orders['delivery_delay_hours'], errors='coerce')
df_full_orders.dropna(subset=['delivery_delay_hours'], inplace=True)

X_delay = X_combined
y_delay = df_full_orders['delivery_delay_hours']

X_train_delay, X_test_delay, y_train_delay, y_test_delay = train_test_split(
    X_delay, y_delay, test_size=0.20, random_state=42
)

models_regression = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=10, random_state=42),
    "XGBoost Regressor": XGBRegressor(n_estimators=10, random_state=42)
}

for name, model in models_regression.items():
    print(f"\nTreinando {name}...")
    model.fit(X_train_delay, y_train_delay)
    y_pred_delay = model.predict(X_test_delay)
    r2 = r2_score(y_test_delay, y_pred_delay)
    mae = mean_absolute_error(y_test_delay, y_pred_delay)
    rmse = np.sqrt(mean_squared_error(y_test_delay, y_pred_delay))
    print(f"Métricas para {name}:")
    print(f"R²: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")

Iniciando a preparação dos dados...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_full_orders[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_full_orders[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se


--- Modelagem para prever se o cliente está satisfeito ---

Treinando Logistic Regression...
Métricas para Logistic Regression:
Acurácia: 0.7902
Precisão: 0.7913
Recall: 0.9880
F1-Score: 0.8788

Treinando Random Forest Classifier...
Métricas para Random Forest Classifier:
Acurácia: 0.9959
Precisão: 0.9967
Recall: 0.9980
F1-Score: 0.9974

Treinando XGBoost Classifier...
Métricas para XGBoost Classifier:
Acurácia: 0.8010
Precisão: 0.8037
Recall: 0.9811
F1-Score: 0.8836

--- Salvando o modelo e o encoder ---
Modelo e encoder salvos com sucesso!

--- Previsão de Delivery Delay Hours ---

Treinando Linear Regression...
Métricas para Linear Regression:
R²: 0.4912
MAE: 128.0181
RMSE: 174.9696

Treinando Random Forest Regressor...
Métricas para Random Forest Regressor:
R²: 0.9848
MAE: 12.7521
RMSE: 30.2541

Treinando XGBoost Regressor...
Métricas para XGBoost Regressor:
R²: 0.5353
MAE: 118.3635
RMSE: 167.2154
