In [3]:
# ==========================================================
#  NOTEBOOK 01 — PRÉ-PROCESSAMENTO DOS BOARDGAMES
# ==========================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import json

# ----------------------------------------------------------
# Create (or reuse) the Spark session
# ----------------------------------------------------------
spark = (
    SparkSession.builder
    .appName("BoardGames01")
    .getOrCreate()
)

# ----------------------------------------------------------
# 1) Load the JSON dataset
# ----------------------------------------------------------
# Decode explicitly as UTF-8. Without an `encoding` argument open() uses the
# platform's locale encoding, so non-ASCII text in the file could fail to
# decode or be garbled depending on where the notebook runs.
with open("/app/dados/boardgames_100.json", encoding="utf-8") as f:
    games_data = json.load(f)

# The schema is inferred from the parsed Python objects; nested dicts come out
# as map columns (see the schema printed below).
df = spark.createDataFrame(games_data)

print("Schema original:")
df.printSchema()
df.show(5, truncate=False)

# ----------------------------------------------------------
# 2) Column cleanup and standardization
# ----------------------------------------------------------
# `rating` and `recommendations` are map columns: the entries of interest are
# looked up by key, cast to a concrete type and given flat column names.
# NOTE(review): F.size() on a missing/null list may yield -1 instead of null
# depending on the Spark configuration — confirm if this column gets used.
df_clean = df.select(
    F.col("id").cast("long"),
    "title",
    "types",
    F.col("rating")["rating"].cast("double").alias("rating"),
    F.col("rating")["num_of_reviews"].cast("int").alias("num_reviews"),
    F.size(F.col("recommendations")["fans_liked"]).alias("recommendations"),
)

print("\nSchema após limpeza:")
df_clean.printSchema()
df_clean.show(10, truncate=False)

# ----------------------------------------------------------
# 3) Build the NODES (games)
# ----------------------------------------------------------
# The game id is exposed as `nodeId`; the remaining attributes keep their names.
nodes = df_clean.select(
    F.col("id").alias("nodeId"),
    F.col("title"),
    F.col("rating"),
    F.col("num_reviews"),
)

print("\nPreview dos nodes:")
nodes.show(10, truncate=False)

# ----------------------------------------------------------
# 4) Build EDGES from rating similarity
# ----------------------------------------------------------
# Two games are linked when their ratings differ by less than this amount.
RATING_SIMILARITY_THRESHOLD = 0.35
# Upper bound on the number of edges kept.
MAX_EDGES = 500

# NOTE: `rank` is not consulted when building the edges. `windowSpec` and
# `ranked` are still defined in case other cells reference them.
windowSpec = Window.orderBy("rating")
ranked = df_clean.withColumn("rank", F.row_number().over(windowSpec))

# Only id and rating take part in the self-join.
rated = df_clean.select("id", "rating")

# Every similar pair appears in both directions (a -> b and b -> a), because
# the only exclusion is a game being paired with itself.
edges = (
    rated.alias("a")
    .join(
        rated.alias("b"),
        (F.abs(F.col("a.rating") - F.col("b.rating")) < RATING_SIMILARITY_THRESHOLD)
        & (F.col("a.id") != F.col("b.id")),
    )
    .select(
        F.col("a.id").alias("src"),
        F.col("b.id").alias("dst"),
    )
    # limit() without an ordering may keep a different subset each time the
    # plan is executed, so the preview below and the CSV written later could
    # disagree. Sorting first makes the kept edges reproducible.
    .orderBy("src", "dst")
    .limit(MAX_EDGES)
)

print("\nPreview dos edges:")
edges.show(20)

# ----------------------------------------------------------
# 5) Persist nodes and edges as CSV
# ----------------------------------------------------------
nodes_path = "/app/dados/nodes.csv"
edges_path = "/app/dados/edges.csv"

# coalesce(1) reduces each DataFrame to a single partition before writing.
# NOTE(review): Spark's CSV writer normally creates a directory at the given
# path holding part files — confirm downstream readers expect that layout.
(
    nodes.coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv(nodes_path)
)
(
    edges.coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv(edges_path)
)

print(f"\nnodes.csv salvo em: {nodes_path}")
print(f"edges.csv salvo em: {edges_path}")

print("\nNotebook 01 finalizado com sucesso!")


Schema original:
root
 |-- credit: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: map (containsNull = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: long (valueContainsNull = true)
 |-- id: long (nullable = true)
 |-- maxplayers: long (nullable = true)
 |-- maxplaytime: long (nullable = true)
 |-- minage: long (nullable = true)
 |-- minplayers: long (nullable = true)
 |-- minplaytime: long (nullable = true)
 |-- rank: long (nullable = true)
 |-- rating: map (nullable = true)
 |    |-- key: string
 |    |-- value: double (valueContainsNull = true)
 |-- recommendations: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: long (containsNull = true)
 |-- title: string (nullable = true)
 |-- types: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: map (containsNull = true




nodes.csv salvo em: /app/dados/nodes.csv
edges.csv salvo em: /app/dados/edges.csv

Notebook 01 finalizado com sucesso!


                                                                                