In [None]:
# ==============================================================================
# PROJECT: THELOOK ECOMMERCE - ADVANCED DATA ANALYTICS & BI
# Author: Christian Panazzolo
# Objective: Data extraction, statistical processing, and interactive visualization
# ==============================================================================

# --- 1. Dependencies and Environment Setup ---
# Importing core libraries for data manipulation and high-level visualization
from google.colab import auth
from google.cloud import bigquery
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Google Cloud project handed to the BigQuery client below.
# Replace 'your-project-id' with your actual Google Cloud Project ID.
project_id = 'your-project-id'

# Sign the Colab session in to Google Cloud before the client is created.
auth.authenticate_user()
print('GCP Authentication Successful')

# BigQuery client shared by the queries in this notebook.
client = bigquery.Client(project=project_id)

# --- 2. Data Extraction Strategy ---
# One row per order item: orders joined to their items and to each item's
# product, with a per-item profit column (sale price minus product cost).
# Orders whose status is 'Cancelled' are left out.
query_base = """
SELECT
    o.order_id,
    o.user_id,
    o.status,
    o.created_at,
    oi.sale_price,
    p.category,
    p.name AS product_name,
    p.retail_price,
    p.cost,
    (oi.sale_price - p.cost) AS profit
FROM `bigquery-public-data.thelook_ecommerce.orders` AS o
JOIN `bigquery-public-data.thelook_ecommerce.order_items` AS oi
    ON o.order_id = oi.order_id
JOIN `bigquery-public-data.thelook_ecommerce.products` AS p
    ON oi.product_id = p.id
WHERE o.status != 'Cancelled'
"""

# Submit the query, then materialise its result set as a pandas DataFrame.
base_job = client.query(query_base)
df = base_job.to_dataframe()

# Sanity check: report the frame's dimensions and preview the first rows.
print(f"Dataset Loaded: {df.shape[0]} rows and {df.shape[1]} columns.")
df.head()

# --- 3. Advanced Visualization Engineering ---
# Note: The following cells generate interactive Plotly charts (exported as
# standalone HTML files) that are embedded into the Looker Studio dashboard
# via GitHub Pages.

In [7]:
# --- Bubble chart with Plotly ---

# `df` holds one row per order item, but the chart needs one row per category.
# Aggregate here so this cell runs on a fresh kernel instead of depending on
# columns (`revenue`, `total_profit`, `avg_margin_per_unit`) that the base
# query does not produce.
# NOTE(review): "margin per unit" is taken as the mean per-item profit
# (sale_price - cost) -- confirm this matches the intended business definition.
category_summary = (
    df.groupby("category", as_index=False)
    .agg(
        revenue=("sale_price", "sum"),
        total_profit=("profit", "sum"),
        avg_margin_per_unit=("profit", "mean"),
    )
)

# Interactive bubble chart: one bubble per category, sized by revenue and
# coloured by total profit.
fig = px.scatter(
    category_summary,
    x="category",
    y="avg_margin_per_unit",
    size="revenue",
    color="total_profit",
    hover_name="category",
    title="Product Mix Analysis: Volume vs. Profitability",
    labels={
        "category": "Product Category",
        "avg_margin_per_unit": "Avg Margin per Unit ($)",
        "revenue": "Total Revenue",
        "total_profit": "Total Profit ($)"
    },
    # Sequential blue colour scale for the profit dimension
    color_continuous_scale=px.colors.sequential.Blues
)

# Refining layout
fig.update_layout(
    template="plotly_white",
    font=dict(family="Arial, sans-serif", size=12),
    margin=dict(l=50, r=50, t=80, b=50),
    title_font_size=24,
    coloraxis_colorbar=dict(title="Profit ($)")
)

# Outline each marker so overlapping bubbles stay distinguishable
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))

fig.show()

# Export the figure as a standalone HTML file
file_name = "advanced_analysis_plotly_en.html"
fig.write_html(file_name)
print(f"Success! {file_name} has been generated.")

Success! advanced_analysis_plotly_en.html has been generated.


In [8]:
# --- HEATMAP: ORDERS BY DAY AND HOUR ---
# Count orders created on or after 2023-01-01 for every
# (day of week, hour of day) combination.
query_heatmap = """
SELECT
    EXTRACT(DAYOFWEEK FROM created_at) AS day_of_week,
    EXTRACT(HOUR FROM created_at) AS hour_of_day,
    COUNT(*) AS total_orders
FROM `bigquery-public-data.thelook_ecommerce.orders`
WHERE created_at >= '2023-01-01'
GROUP BY 1, 2
"""
df_heatmap = client.query(query_heatmap).to_dataframe()

# Axis definitions. BigQuery's DAYOFWEEK runs from 1 (Sunday) to 7 (Saturday),
# so position i in `day_labels` corresponds to day number i + 1.
day_labels = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
hours = list(range(24))

# Pivot to a day x hour matrix, then force the full 7 x 24 grid. The query only
# returns combinations that have at least one order, so without the reindex a
# missing day or hour would make the matrix shape disagree with the hard-coded
# axis labels; combinations with no orders are shown as 0.
heatmap_data = (
    df_heatmap
    .pivot(index='day_of_week', columns='hour_of_day', values='total_orders')
    .reindex(index=range(1, len(day_labels) + 1), columns=hours, fill_value=0)
    .fillna(0)
    .astype('int64')  # plain integers rather than nullable/float counts
)

fig_heatmap = px.imshow(
    heatmap_data,
    labels=dict(x="Hour of Day", y="Day of Week (1=Sun)", color="Orders"),
    x=hours,
    y=day_labels,
    title="Order Density: Sales Peak Analysis",
    color_continuous_scale='YlGnBu'
)

fig_heatmap.update_layout(template="plotly_white")
fig_heatmap.show()

# Export the figure as a standalone HTML file
fig_heatmap.write_html("order_density_heatmap.html")

In [16]:
# --- BOX PLOT: PRICE DISTRIBUTION & OUTLIERS (ORDERED) ---
# Sale price of every order item belonging to the four selected categories.
query_box = """
SELECT
    p.category,
    oi.sale_price
FROM `bigquery-public-data.thelook_ecommerce.order_items` AS oi
JOIN `bigquery-public-data.thelook_ecommerce.products` AS p ON oi.product_id = p.id
WHERE p.category IN ('Outerwear & Coats', 'Sweaters', 'Jeans', 'Suits & Sport Coats')
"""
df_box = client.query(query_box).to_dataframe()

# Exact left-to-right order of the category boxes on the x axis
target_order = ['Outerwear & Coats', 'Sweaters', 'Jeans', 'Suits & Sport Coats']

# One box per category, drawing only the outlier points individually, then
# apply the shared white template, hide the legend and set the axis titles.
fig_box = px.box(
    df_box,
    x="category",
    y="sale_price",
    color="category",
    category_orders={"category": target_order},  # enforce the order above
    title="Price Dispersion & Outlier Detection by Category",
    labels={"category": "Category", "sale_price": "Sale Price ($)"},
    points="outliers",
).update_layout(
    template="plotly_white",
    showlegend=False,
    xaxis_title="Product Category",
    yaxis_title="Sale Price ($)",
)

fig_box.show()

# Export the figure as a standalone HTML file
fig_box.write_html("price_distribution_boxplot.html")

In [11]:
# --- TIME SERIES: ORDERS VS REVENUE (DUAL AXIS) ---
from plotly.subplots import make_subplots

# Per year (2023-2025): number of distinct orders and summed sale price,
# leaving out order items whose status is 'Cancelled' or 'Returned'.
query_dual = """
SELECT
    EXTRACT(YEAR FROM created_at) AS order_year,
    COUNT(DISTINCT order_id) AS total_orders,
    SUM(sale_price) AS total_revenue
FROM `bigquery-public-data.thelook_ecommerce.order_items`
WHERE status NOT IN ('Cancelled', 'Returned')
  AND EXTRACT(YEAR FROM created_at) BETWEEN 2023 AND 2025
GROUP BY 1
ORDER BY 1
"""
df_dual = client.query(query_dual).to_dataframe()

# Order volume as a line (drawn against the primary y axis)
orders_line = go.Scatter(
    x=df_dual['order_year'],
    y=df_dual['total_orders'],
    name="Orders",
    mode='lines+markers',
    line={"color": '#003366', "width": 3},
)

# Revenue as translucent bars (drawn against the secondary y axis)
revenue_bars = go.Bar(
    x=df_dual['order_year'],
    y=df_dual['total_revenue'],
    name="Revenue ($)",
    marker_color='rgba(0, 150, 255, 0.3)',
)

# Single-panel figure with a secondary y axis; traces are added in the same
# order as before (line first, bars second).
fig_dual = make_subplots(specs=[[{"secondary_y": True}]])
fig_dual.add_trace(orders_line, secondary_y=False)
fig_dual.add_trace(revenue_bars, secondary_y=True)

# Layout settings (English labels)
fig_dual.update_layout(
    title_text="Business Growth: Order Volume vs. Revenue (2023-2025)",
    template="plotly_white",
    legend={"x": 0.01, "y": 0.99},
    xaxis={"tickmode": 'linear'},
)

# Title each y axis separately
fig_dual.update_yaxes(title_text="<b>Orders</b> (Quantity)", secondary_y=False)
fig_dual.update_yaxes(title_text="<b>Revenue</b> ($ USD)", secondary_y=True)

fig_dual.show()

# Export the figure as a standalone HTML file
fig_dual.write_html("growth_comparison_dual_axis.html")