In [19]:
# Importar bibliotecas e iniciar Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
import pandas as pd
import os
from datetime import datetime, timedelta

spark = SparkSession.builder \
    .appName("DailyTotalSalesPrediction") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

print("=" * 70)
print("PREVIS√ÉO DE VENDAS TOTAIS DI√ÅRIAS PARA 2019")
print("=" * 70)

PREVIS√ÉO DE VENDAS TOTAIS DI√ÅRIAS PARA 2019


In [20]:
print("\n[1/6] Carregando dados...")

schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("order_status", StringType(), True),
    StructField("order_purchase_timestamp", StringType(), True),
    StructField("customer_city", StringType(), True),
    StructField("customer_state", StringType(), True),
    StructField("latitude_media", DoubleType(), True),
    StructField("longitude_media", DoubleType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_category_name", StringType(), True),
    StructField("price", DoubleType(), True),
    StructField("freight_value", DoubleType(), True),
    StructField("payment_value", DoubleType(), True),
    StructField("order_date", StringType(), True),
    StructField("precip", DoubleType(), True),
    StructField("temp_max", DoubleType(), True),
    StructField("temp_min", DoubleType(), True),
    StructField("weather", StringType(), True)
])

base_path = "../input"
csv_path = os.path.join(base_path, "final-dataset")

df = spark.read.option("header", "true").schema(schema).csv(csv_path)

# Converter datas e extrair features temporais
df = df.withColumn("order_date", to_date(col("order_purchase_timestamp")))
df = df.withColumn("year", year("order_date"))
df = df.withColumn("month", month("order_date"))
df = df.withColumn("dayofweek", dayofweek("order_date"))

# Tratar weather
df = df.withColumn("weather", 
                   when(col("weather") == "unknown", "clear")
                   .when(col("weather").isNull(), "clear")
                   .otherwise(col("weather")))

df = df.fillna({'precip': 0.0, 'temp_max': 25.0, 'temp_min': 18.0})

# Features de temperatura
df = df.withColumn("temp_media", (col("temp_max") + col("temp_min")) / 2)
df = df.withColumn("amplitude_termica", col("temp_max") - col("temp_min"))

# Esta√ß√£o do ano
df = df.withColumn("estacao", 
                   when((col("month").between(3, 5)), "outono")
                   .when((col("month").between(6, 8)), "inverno")
                   .when((col("month").between(9, 11)), "primavera")
                   .otherwise("verao"))

# Filtrar dados completos
df_clean = df.filter(
    col("payment_value").isNotNull() & 
    col("price").isNotNull() &
    col("freight_value").isNotNull()
)

print(f"Total de pedidos individuais: {df_clean.count():,}")


[1/6] Carregando dados...
Total de pedidos individuais: 102,349


In [21]:
print("\n[2/6] Agregando vendas por dia...")

# AGREGAR: soma de vendas + contagem de pedidos por dia
df_daily = df_clean.groupBy("order_date", "year", "month", "dayofweek") \
    .agg(
        sum("payment_value").alias("total_vendas_dia"),
        count("order_id").alias("num_pedidos"),
        avg("price").alias("avg_price"),
        avg("freight_value").alias("avg_freight"),
        avg("precip").alias("avg_precip"),
        avg("temp_max").alias("avg_temp_max"),
        avg("temp_min").alias("avg_temp_min"),
        first("estacao").alias("estacao")
    ) \
    .withColumn("temp_media", (col("avg_temp_max") + col("avg_temp_min")) / 2) \
    .withColumn("amplitude_termica", col("avg_temp_max") - col("avg_temp_min"))

# Mostrar exemplo
print(f"\nTotal de dias com vendas: {df_daily.count()}")
print("\nExemplo de dados agregados:")
df_daily.select("order_date", "total_vendas_dia", "num_pedidos").show(5, False)


[2/6] Agregando vendas por dia...

Total de dias com vendas: 615

Exemplo de dados agregados:
+----------+------------------+-----------+
|order_date|total_vendas_dia  |num_pedidos|
+----------+------------------+-----------+
|2017-10-06|20840.75          |134        |
|2018-04-10|33005.0           |205        |
|2017-09-12|32420.91          |206        |
|2018-04-25|49824.469999999994|294        |
|2017-08-12|19894.08          |103        |
+----------+------------------+-----------+
only showing top 5 rows


In [22]:
print("\n[3/6] Preparando pipeline de ML...")

# Indexar esta√ß√£o
estacao_indexer = StringIndexer(
    inputCol="estacao",
    outputCol="estacao_index", 
    handleInvalid="keep"
)

# Features para prever TOTAL DE VENDAS DO DIA
feature_cols = [
    'avg_price', 'avg_freight', 'num_pedidos',
    'avg_precip', 'avg_temp_max', 'avg_temp_min', 
    'temp_media', 'amplitude_termica',
    'month', 'dayofweek', 'estacao_index'
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

pipeline = Pipeline(stages=[estacao_indexer, assembler])

# Split temporal
train_df = df_daily.filter(col("year") <= 2017)
test_df = df_daily.filter(col("year") == 2018)

print(f"Dias de treino (2016-2017): {train_df.count()}")
print(f"Dias de teste (2018): {test_df.count()}")

pipeline_model = pipeline.fit(train_df)
train_data = pipeline_model.transform(train_df)
test_data = pipeline_model.transform(test_df)


[3/6] Preparando pipeline de ML...
Dias de treino (2016-2017): 373
Dias de teste (2018): 242


In [23]:
print("\n[4/6] Treinando modelos com Cross Validation...")

# --- Linear Regression ---
lr = LinearRegression(
    featuresCol="features",
    labelCol="total_vendas_dia",
    predictionCol="prediction"
)

lr_paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.3]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

lr_evaluator = RegressionEvaluator(
    labelCol="total_vendas_dia",
    predictionCol="prediction",
    metricName="rmse"
)

lr_cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lr_paramGrid,
    evaluator=lr_evaluator,
    numFolds=3
)

print("  -> Treinando Linear Regression...")
lr_model = lr_cv.fit(train_data)
lr_predictions = lr_model.transform(test_data)
lr_rmse = lr_evaluator.evaluate(lr_predictions)

lr_r2_eval = RegressionEvaluator(
    labelCol="total_vendas_dia", 
    predictionCol="prediction",
    metricName="r2"
)
lr_r2 = lr_r2_eval.evaluate(lr_predictions)

print(f"     RMSE: R$ {lr_rmse:,.2f} | R¬≤: {lr_r2:.4f}")

# --- Random Forest ---
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="total_vendas_dia", 
    predictionCol="prediction",
    seed=42,
    maxBins=50
)

rf_paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [50, 100]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

rf_evaluator = RegressionEvaluator(
    labelCol="total_vendas_dia",
    predictionCol="prediction", 
    metricName="rmse"
)

rf_cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=rf_paramGrid,
    evaluator=rf_evaluator,
    numFolds=3
)

print("  -> Treinando Random Forest...")
rf_model = rf_cv.fit(train_data)
rf_predictions = rf_model.transform(test_data)
rf_rmse = rf_evaluator.evaluate(rf_predictions)

rf_r2_eval = RegressionEvaluator(
    labelCol="total_vendas_dia",
    predictionCol="prediction",
    metricName="r2"
)
rf_r2 = rf_r2_eval.evaluate(rf_predictions)

print(f"     RMSE: R$ {rf_rmse:,.2f} | R¬≤: {rf_r2:.4f}")

# Selecionar melhor modelo
if lr_rmse < rf_rmse:
    best_model_name = "Linear Regression"
    best_model = lr_model
    best_rmse = lr_rmse
    best_r2 = lr_r2
else:
    best_model_name = "Random Forest"
    best_model = rf_model
    best_rmse = rf_rmse
    best_r2 = rf_r2

print(f"\n{'='*70}")
print(f"MELHOR MODELO: {best_model_name}")
print(f"RMSE: R$ {best_rmse:,.2f} | R¬≤: {best_r2:.4f}")
print(f"{'='*70}")


[4/6] Treinando modelos com Cross Validation...
  -> Treinando Linear Regression...
     RMSE: R$ 3,490.22 | R¬≤: 0.9071
  -> Treinando Random Forest...
     RMSE: R$ 9,838.81 | R¬≤: 0.2616

MELHOR MODELO: Linear Regression
RMSE: R$ 3,490.22 | R¬≤: 0.9071


In [24]:
print("\n[5/6] Gerando previs√µes di√°rias para 2019...")

# Calcular m√©dias hist√≥ricas por m√™s e dia da semana
historical_patterns = df_daily.groupBy("month", "dayofweek") \
    .agg(
        avg("num_pedidos").alias("avg_num_pedidos"),
        avg("avg_price").alias("hist_avg_price"),
        avg("avg_freight").alias("hist_avg_freight"),
        avg("avg_precip").alias("hist_avg_precip"),
        avg("avg_temp_max").alias("hist_avg_temp_max"),
        avg("avg_temp_min").alias("hist_avg_temp_min")
    ).collect()

# Criar dicion√°rio de lookup
patterns_dict = {}
for row in historical_patterns:
    key = (row['month'], row['dayofweek'])
    patterns_dict[key] = row

# Gerar todas as datas de 2019
start_date = datetime(2019, 1, 1)
end_date = datetime(2019, 12, 31)
predictions_2019 = []

current_date = start_date
while current_date <= end_date:
    month = current_date.month
    dayofweek = current_date.isoweekday() + 1  # PySpark: domingo=1
    
    # Buscar padr√£o hist√≥rico
    pattern = patterns_dict.get((month, dayofweek))
    
    if pattern:
        # Esta√ß√£o do ano
        if month in [3, 4, 5]:
            estacao = "outono"
        elif month in [6, 7, 8]:
            estacao = "inverno"
        elif month in [9, 10, 11]:
            estacao = "primavera"
        else:
            estacao = "verao"
        
        predictions_2019.append({
            'order_date': current_date.strftime('%Y-%m-%d'),
            'year': 2019,
            'month': month,
            'dayofweek': dayofweek,
            'num_pedidos': pattern['avg_num_pedidos'],
            'avg_price': pattern['hist_avg_price'],
            'avg_freight': pattern['hist_avg_freight'],
            'avg_precip': pattern['hist_avg_precip'],
            'avg_temp_max': pattern['hist_avg_temp_max'],
            'avg_temp_min': pattern['hist_avg_temp_min'],
            'estacao': estacao
        })
    
    current_date += timedelta(days=1)

print(f"  -> Criados {len(predictions_2019)} registros di√°rios")

# Converter para DataFrame do Spark
df_2019 = spark.createDataFrame(predictions_2019)
df_2019 = df_2019.withColumn("temp_media", (col("avg_temp_max") + col("avg_temp_min")) / 2)
df_2019 = df_2019.withColumn("amplitude_termica", col("avg_temp_max") - col("avg_temp_min"))

# Aplicar transforma√ß√µes
df_2019_transformed = pipeline_model.transform(df_2019)

# Fazer previs√µes
print(f"  -> Fazendo previs√µes com {best_model_name}...")
df_2019_predictions = best_model.transform(df_2019_transformed)


[5/6] Gerando previs√µes di√°rias para 2019...
  -> Criados 313 registros di√°rios
  -> Fazendo previs√µes com Linear Regression...


In [26]:
print("\n[6/6] Exportando resultados...")

results = df_2019_predictions.select(
    "order_date", 
    "prediction",
    "num_pedidos"
).orderBy("order_date").collect()

# Criar DataFrame pandas
predictions_df = pd.DataFrame({
    'data': [r['order_date'] for r in results],
    'vendas_previstas': [r['prediction'] if r['prediction'] > 0 else 0 for r in results],
    'pedidos_estimados': [int(r['num_pedidos']) for r in results]
})

# Calcular estat√≠sticas
total_previsto = predictions_df['vendas_previstas'].sum()
media_diaria = predictions_df['vendas_previstas'].mean()
total_pedidos = predictions_df['pedidos_estimados'].sum()

# Totais hist√≥ricos
total_2016 = df_daily.filter(col("year") == 2016).agg(sum("total_vendas_dia")).collect()[0][0] or 0
total_2017 = df_daily.filter(col("year") == 2017).agg(sum("total_vendas_dia")).collect()[0][0] or 0
total_2018 = df_daily.filter(col("year") == 2018).agg(sum("total_vendas_dia")).collect()[0][0] or 0

# Salvar CSV
output_path = "../output/previsoes_diarias_2019.csv"
os.makedirs("../output", exist_ok=True)
predictions_df.to_csv(output_path, index=False)

# ============================================================================
# RELAT√ìRIO FINAL
# ============================================================================
print("\n" + "=" * 70)
print("RELAT√ìRIO FINAL - PREVIS√ïES DE VENDAS TOTAIS 2019")
print("=" * 70)

print(f"\nüìä MODELO SELECIONADO: {best_model_name}")
print(f"   RMSE: R$ {best_rmse:,.2f}")
print(f"   R¬≤: {best_r2:.4f}")

print(f"\nüìà TOTAIS HIST√ìRICOS:")
print(f"   2016: R$ {total_2016:,.2f}")
print(f"   2017: R$ {total_2017:,.2f}")
print(f"   2018: R$ {total_2018:,.2f}")

print(f"\nüîÆ PREVIS√ÉO 2019:")
print(f"   Total anual: R$ {total_previsto:,.2f}")
print(f"   M√©dia di√°ria: R$ {media_diaria:,.2f}")
print(f"   Total de pedidos: {total_pedidos:,}")
print(f"   Dias previstos: {len(predictions_df)}")

if total_2018 > 0:
    variacao = ((total_previsto - total_2018) / total_2018) * 100
    print(f"   Varia√ß√£o vs 2018: {variacao:+.2f}%")

print(f"\nüíæ ARQUIVO GERADO:")
print(f"   {output_path}")

print("\nüìã PREVIEW (primeiros 10 dias):")
print(predictions_df.head(10).to_string(index=False))

print("\nüìã ESTAT√çSTICAS MENSAIS:")
predictions_df['mes'] = pd.to_datetime(predictions_df['data']).dt.month
monthly_stats = predictions_df.groupby('mes')['vendas_previstas'].sum()
for mes, total in monthly_stats.items():
    print(f"   M√™s {mes:02d}: R$ {total:,.2f}")

print("\n" + "=" * 70)
print("AN√ÅLISE COMPLETA!")
print("=" * 70)

spark.stop()
print("\n‚úì Spark encerrado.")


[6/6] Exportando resultados...

RELAT√ìRIO FINAL - PREVIS√ïES DE VENDAS TOTAIS 2019

üìä MODELO SELECIONADO: Linear Regression
   RMSE: R$ 3,490.22
   R¬≤: 0.9071

üìà TOTAIS HIST√ìRICOS:
   2016: R$ 61,080.15
   2017: R$ 7,436,454.36
   2018: R$ 9,042,746.54

üîÆ PREVIS√ÉO 2019:
   Total anual: R$ 8,714,976.43
   M√©dia di√°ria: R$ 27,843.38
   Total de pedidos: 54,618
   Dias previstos: 313
   Varia√ß√£o vs 2018: -3.62%

üíæ ARQUIVO GERADO:
   ../output/previsoes_diarias_2019.csv

üìã PREVIEW (primeiros 10 dias):
      data  vendas_previstas  pedidos_estimados
2019-01-01      25540.544988                165
2019-01-02      28233.943282                178
2019-01-03      21760.172180                149
2019-01-04      21995.937913                131
2019-01-05      17096.728113                111
2019-01-07      23617.707314                155
2019-01-08      25540.544988                165
2019-01-09      28233.943282                178
2019-01-10      21760.172180             