# Pre-processing of the data used by the visualizations

In [2]:
import sys
sys.path.append("../src")
import pandas as pd
import json
from custom_processing import *

In [3]:
# Load the three raw CSV exports (games, consoles, publishers) in one go.
games_df, console_df, publisher_df = map(
    pd.read_csv,
    (
        "../data/raw/games_info_viz_final_project_v2.csv",
        "../data/raw/console_info_viz_final_project.csv",
        "../data/raw/publisher_info_viz_final_project.csv",
    ),
)

  games_df = pd.read_csv("../data/raw/games_info_viz_final_project_v2.csv")


## Treemap data processing

In [12]:
# Columns needed by the treemap, without the rows whose genre is the "Not Defined" placeholder.
# The explicit .copy() makes treemap_data an independent frame, so the column
# assignment below is not a chained assignment on a slice of games_df
# (which pandas reports as SettingWithCopyWarning).
treemap_data = games_df.loc[
    games_df["Genre"] != "Not Defined",
    ["Title", "Genre", "ReleaseYear"],
].copy()

# Normalise the release year with the project helper safe_int
# (NOTE(review): presumably returns a missing value for unparseable years — confirm in custom_processing).
treemap_data["ReleaseYear"] = treemap_data["ReleaseYear"].apply(safe_int)

# Keep rows that have a title and a known release year no later than 2020.
treemap_data = treemap_data[
    treemap_data["ReleaseYear"].notna()
    & (treemap_data["ReleaseYear"] <= 2020)
    & treemap_data["Title"].notna()
]

In [13]:
# Show the filtered treemap frame as the cell output.
treemap_data

Unnamed: 0,Title,Genre,ReleaseYear
0,Minecraft,Action,2009
1,Grand Theft Auto V,Action,2013
2,Tetris (EA),Puzzle,2006
3,Wii Sports,Sports,2006
4,PUBG: Battlegrounds,Shooter,2017
...,...,...,...
480096,Scrapjunk Courier,Strategy,2017
480097,Primal Fears (itch),Action,2017
480099,Airstrike HD Demo,Action,2016
480100,Urania's Mirror,Adventure,2016


In [14]:
## Build the JSON for the treemap
# Hierarchy: one entry per release year, each listing its genres together with
# the number of titles in that genre and the titles themselves.
hierarchy = [
    {
        "ReleaseYear": int(release_year),
        "Genres": [
            {
                "Genre": genre_name,
                "Count": len(genre_rows),
                "Titles": genre_rows["Title"].tolist(),
            }
            for genre_name, genre_rows in year_rows.groupby("Genre")
        ],
    }
    for release_year, year_rows in treemap_data.groupby("ReleaseYear")
]

# Write the hierarchy to disk as JSON (non-ASCII characters are kept as-is)
with open("../data/processed/treemap_hierarchy.json", "w", encoding="utf-8") as out_file:
    json.dump(hierarchy, out_file, ensure_ascii=False, indent=2)

## Node-link diagram data

In [29]:
# Number of rows kept for the node-link sample (the top rows by MetacriticScore);
# it is also embedded in the name of the generated JSON file.
n_games = 2000

In [30]:
# Columns needed to build the node-link graph.
net_data = games_df[["Title", "Genre", "ReleaseYear", "Developer", "Publisher", "MetacriticScore"]]

# Drop rows whose genre, developer or publisher is the "Not Defined" placeholder.
# All three checks live in a single mask (the publisher check used to be applied twice
# and the developer check only after the first dropna).
defined_mask = (
    (net_data["Genre"] != "Not Defined")
    & (net_data["Developer"] != "Not Defined")
    & (net_data["Publisher"] != "Not Defined")
)

# .copy() gives an independent frame, so assigning the ReleaseYear column below
# is not a chained assignment on a slice (SettingWithCopyWarning).
net_data = net_data[defined_mask].dropna().copy()

# Normalise the release year with the project helper safe_int, then keep games
# released up to 2020; the final dropna removes rows whose year became missing.
net_data["ReleaseYear"] = net_data["ReleaseYear"].apply(safe_int)
net_data = net_data[net_data["ReleaseYear"] <= 2020].dropna()

In [31]:
# Take the n_games rows with the highest MetacriticScore, then keep only the
# columns used to build the graph (the score itself is not exported).
sample_data = (
    net_data
    .sort_values(by="MetacriticScore", ascending=False)
    .head(n_games)
    .loc[:, ["Title", "Genre", "ReleaseYear", "Developer", "Publisher"]]
)

In [32]:
## Build the JSON for the node-link graph

# Cleaned publisher/developer names present in the sample; names that clean_name
# turns into a falsy value are left out.
publishers_set = {name for name in map(clean_name, sample_data["Publisher"].unique()) if name}
developers_set = {name for name in map(clean_name, sample_data["Developer"].unique()) if name}
reserved_names = publishers_set | developers_set

nodes = []
node_ids = set()
links = []
link_keys = set()  # (source, target) pairs already emitted, so each edge is written once

# Single pass over the sample: nodes and links are built together instead of
# walking the frame twice and recomputing the same cleaned names.
for row in sample_data.itertuples(index=False):
    title = row.Title  # titles are intentionally not cleaned
    dev = clean_name(row.Developer)
    pub = clean_name(row.Publisher)

    # A title equal to a publisher/developer name would collide with that node's id,
    # so such games get a " (game)" suffix.
    title_final = f"{title} (game)" if title in reserved_names else title

    # A developer that also appears as a publisher is represented only by its
    # Publisher node, so it gets neither a Developer node nor a Developer→Game link.
    dev_is_separate = bool(dev) and dev not in publishers_set

    if pub and pub not in node_ids:
        nodes.append({"id": pub, "group": "Publisher"})
        node_ids.add(pub)

    if dev_is_separate and dev not in node_ids:
        nodes.append({"id": dev, "group": "Developer"})
        node_ids.add(dev)

    if not title_final:
        continue  # no usable game id: neither a node nor links can be created

    # Game node, carrying the cleaned publisher/developer and the corrected genre.
    if title_final not in node_ids:
        nodes.append({
            "id": title_final,
            "group": "Game",
            # int() keeps the year an integer in the JSON even if the column is float,
            # matching how the treemap hierarchy serialises years.
            "year": int(row.ReleaseYear),
            "genre": fix_genre(row.Genre),
            "publisher": pub,
            "developer": dev
        })
        node_ids.add(title_final)

    # Developer→Game (only for separate developers) and Publisher→Game links.
    link_sources = []
    if dev_is_separate:
        link_sources.append(dev)
    if pub:
        link_sources.append(pub)

    for source in link_sources:
        key = (source, title_final)
        # Repeated rows for the same game would otherwise emit the same edge again,
        # even though the node itself is only added once.
        if key not in link_keys:
            link_keys.add(key)
            links.append({"source": source, "target": title_final})

output = {"nodes": nodes, "links": links}
with open(f"../data/processed/games_node_link_{n_games}.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

## Table data

In [16]:
# Games ranked by sales (highest first): rows with any missing value are dropped,
# as are rows whose genre is the "Not Defined" placeholder.
table_data = (
    games_df[["Title", "Genre", "Sales"]]
    .sort_values(by="Sales", ascending=False)
    .dropna()
    .loc[lambda frame: frame["Genre"] != "Not Defined"]
)

In [17]:
# Show the sales-ranked table as the cell output.
table_data

Unnamed: 0,Title,Genre,Sales
2,Tetris (EA),Puzzle,100000000.0
3,Wii Sports,Sports,82740000.0
4,PUBG: Battlegrounds,Shooter,75000000.0
7,Red Dead Redemption 2,Action,50000000.0
11,Tetris (1989),Puzzle,43000000.0
...,...,...,...
11291,Rave Master: Special Attack Force!,Fighting,10000.0
11289,Shorts,Platform,10000.0
11287,Tantei Jinguuji Saburo: Hai to Diamond,Adventure,10000.0
11286,Toro to Morimori,Misc,10000.0


In [18]:
# Build the JSON for the table visualization: one record (dict) per row

table_json = table_data.to_dict(orient="records")

# Serialise first, then write the text to disk
table_json_text = json.dumps(table_json, ensure_ascii=False, indent=2)
with open("../data/processed/table_data.json", "w", encoding="utf-8") as out_file:
    out_file.write(table_json_text)