# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
import numpy as np

# 1. Load the CSV files (read in the order listed; one DataFrame per file).
orders, drivers, missing_items, customers, products = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/orders.csv",
        "data/drivers_data.csv",
        "data/missing_items_data.csv",
        "data/customers_data.csv",
        "data/products_data.csv",
    )
)

# 2. Cleaning and type conversion
# Parse the order date; unparseable values raise here (no coercion is requested).
orders["date"] = pd.to_datetime(orders["date"])

# Numeric columns: values that cannot be parsed become NaN instead of raising.
for numeric_col in ("order_amount", "delivery_hour"):
    orders[numeric_col] = pd.to_numeric(orders[numeric_col], errors="coerce")

# Fix the misspelled column name ("produc_id") in the products table.
products = products.rename(columns={"produc_id": "product_id"})

# 3. Join the DataFrames
# Left joins starting from orders, so orders without a matching driver,
# customer or missing-items record are kept (their joined columns are NaN).
df = (
    orders
    .merge(drivers, on="driver_id", how="left")
    .merge(customers, on="customer_id", how="left")
    .merge(missing_items, on="order_id", how="left")
)

# 4. Derived columns used by the fraud analysis
# items_received = items_delivered minus the number of non-null product_id_N
# columns on the row.
# NOTE(review): presumably each non-null product_id_N is one reported missing
# product -- confirm against the missing-items schema.
missing_product_cols = ["product_id_1", "product_id_2", "product_id_3"]
df["items_received"] = df["items_delivered"] - df[missing_product_cols].notna().sum(axis=1)

# Mask zero denominators: dividing by items_delivered == 0 would yield +/-inf,
# which turns the mean below into inf (so no finite rate is ever flagged) and
# poisons any later plot of missing_rate. Such rows get NaN instead.
delivered_nonzero = df["items_delivered"].replace(0, np.nan)
df["missing_rate"] = df["items_missing"] / delivered_nonzero

# Flag rows whose missing rate is above the overall mean (NaN rates are skipped
# by mean() and compare as False).
df["high_missing"] = df["missing_rate"] > df["missing_rate"].mean()

# 5. Exploratory analysis

# Drivers ranked by total number of missing items (a sum of counts, not a rate).
missing_per_driver = df.groupby("driver_id")["items_missing"].sum()
top_drivers = missing_per_driver.sort_values(ascending=False)
print("\n🚨 Motoristas com maior número de itens faltantes:")
print(top_drivers.head(10))

# Histogram (30 bins, with a KDE overlay) of the per-row missing rate.
plt.figure(figsize=(12, 6))
sns.histplot(df["missing_rate"], kde=True, bins=30)
plt.title("Distribuição da Taxa de Itens Faltantes")
plt.show()

# 6. Fraud detection with Isolation Forest
# Take an explicit copy: the imputation below must modify an independent frame,
# not a slice of df (assigning into the slice triggers SettingWithCopyWarning).
features = df[["items_delivered", "items_missing", "order_amount"]].copy()

# Impute missing values with each column's median. Columns without NaN are
# left unchanged; a column that is entirely NaN has a NaN median and stays NaN.
features = features.fillna(features.median())

# contamination=0.05 is the assumed share of anomalous rows; random_state makes
# the result reproducible.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
# fit_predict labels each row: -1 for anomalies, 1 for inliers.
df["anomaly"] = iso_forest.fit_predict(features)

# Show the rows flagged as anomalies.
# `display` is provided by IPython/Jupyter; it is not defined in a plain script.
print("\n🔴 Pedidos suspeitos de fraude:")
display(df[df["anomaly"] == -1])

# 7. Clustering with K-Means
# Fill any remaining NaN with 0 instead of dropping rows, so `features` keeps
# exactly one row per row of df and the labels can be assigned positionally.
features = features.fillna(0)

# NOTE(review): features are clustered unscaled, so the column with the largest
# numeric range will dominate the Euclidean distances -- consider standardizing.
kmeans = KMeans(n_clusters=3, random_state=42)
# fit_predict returns the same labels as kmeans.labels_, aligned with the rows
# of `features` (and therefore of df), so a single assignment is sufficient.
df["cluster"] = kmeans.fit_predict(features)

# Scatter plot of delivered vs. missing items, colored by cluster label.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=df["items_delivered"],
    y=df["items_missing"],
    hue=df["cluster"],
    palette="viridis",
)
plt.title("Clusterização de Pedidos")
plt.show()