# Parte 2: Análise por Continente - Questão 2.1

## Autores
**Carlos Lavor Neto** - Engenharia de Computação - UEA

**Alexandro Pantoja** - Engenharia de Computação - UEA

## Objetivo
Responder à pergunta 2.1:
- Distribuição total de medalhas por continente
- Total acumulado e por edição
- Gráficos: pizza e linha
- Número médio de atletas por continente

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
from datetime import datetime
import warnings
# Mute every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot defaults, applied in the original order: style sheet first, then the
# seaborn palette, then the rcParams overrides.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("Set2")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 10,
})

print("Bibliotecas carregadas com sucesso!")

## 1. Carregar Dados

In [None]:
# Project folders, relative to the notebook's working directory.
BASE_PATH = Path('..')
BRONZE_PATH = BASE_PATH / 'bronze'
GOLD_PATH = BASE_PATH / 'gold'
OUTPUT_PATH = BASE_PATH / 'outputs'

# The cells below write into gold/, outputs/tables/ and outputs/figures/.
# Saving into a folder that does not exist fails, so create the output
# folders up front (a no-op when they are already there).
for _out_dir in (GOLD_PATH, OUTPUT_PATH / 'tables', OUTPUT_PATH / 'figures'):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Load the bronze-layer tables.
print("Carregando dados...")
df_medal_tally = pd.read_parquet(BRONZE_PATH / 'medal_tally.parquet')
df_athlete_event = pd.read_parquet(BRONZE_PATH / 'athlete_event_result.parquet')
df_athlete_bio = pd.read_parquet(BRONZE_PATH / 'athlete_bio.parquet')
df_game = pd.read_parquet(BRONZE_PATH / 'game.parquet')

# Print each table's (rows, columns) shape as a quick sanity check.
print(f"✓ medal_tally: {df_medal_tally.shape}")
print(f"✓ athlete_event: {df_athlete_event.shape}")
print(f"✓ athlete_bio: {df_athlete_bio.shape}")
print(f"✓ game: {df_game.shape}")

## 2. Criar Mapeamento País → Continente

In [None]:
# Load the NOC -> continent lookup if it has already been produced.
mapping_file = BRONZE_PATH / 'noc_continent_mapping.parquet'

if mapping_file.exists():
    print("Carregando mapeamento existente...")
    df_noc_continent = pd.read_parquet(mapping_file)
    print(f"✓ Mapeamento carregado: {len(df_noc_continent)} NOCs")
else:
    print("Criando novo mapeamento...")
    # Nothing is built here: the mapping is created in notebook 03a, or must
    # already exist on disk.
    print("⚠ Execute o notebook de criação do mapeamento primeiro!")
    # Bind the name explicitly so that a mapping left over from an earlier run
    # of this cell is not silently reused, and so later code can test for None
    # instead of tripping over an undefined name.
    df_noc_continent = None

## 3. Integrar Medalhas com Continentes

In [None]:
# Medal tally joined with the continent of each NOC, cached as parquet.
medals_continent_file = BRONZE_PATH / 'medals_by_continent.parquet'

if medals_continent_file.exists():
    # Reuse the cached join from a previous run.
    df_medals_continent = pd.read_parquet(medals_continent_file)
    print(f"✓ Dados integrados carregados: {df_medals_continent.shape}")
else:
    print("Integrando dados...")

    # The mapping cell only prints a warning when its parquet is missing, so
    # df_noc_continent may be undefined (or None) here. Fail with an
    # actionable message instead of a bare NameError/TypeError.
    noc_to_continent = globals().get('df_noc_continent')
    if noc_to_continent is None:
        raise FileNotFoundError(
            "Mapeamento NOC → continente não carregado: "
            "execute o notebook de criação do mapeamento primeiro."
        )

    # validate='many_to_one' makes pandas raise if a NOC appears more than
    # once in the mapping; such duplicates would otherwise multiply medal rows
    # and inflate every total computed from this table.
    df_medals_continent = df_medal_tally.merge(
        noc_to_continent,
        on='country_noc',
        how='left',
        validate='many_to_one',
    )
    # NOCs without a mapping entry are labelled so they can be filtered later.
    df_medals_continent['continent'] = df_medals_continent['continent'].fillna('Unknown')
    df_medals_continent.to_parquet(medals_continent_file, index=False)
    print(f"✓ Dados integrados e salvos: {df_medals_continent.shape}")

# Last expression of the cell: rendered by the notebook as a preview.
df_medals_continent.head()

## 4. Pergunta 2.1: Distribuição Total de Medalhas por Continente

### 4.1 Total Acumulado

In [None]:
# Keep only the rows whose continent is not the 'Unknown' placeholder.
has_continent = df_medals_continent['continent'] != 'Unknown'
df_known = df_medals_continent.loc[has_continent].copy()

# Sum every medal column per continent, largest overall total first.
medal_columns = ['gold', 'silver', 'bronze', 'total']
continent_sums = df_known.groupby('continent').agg(dict.fromkeys(medal_columns, 'sum'))
medals_by_continent = continent_sums.reset_index().sort_values('total', ascending=False)

print("\nüìä Total Acumulado de Medalhas por Continente (1896-2022):")
display(medals_by_continent)

# Persist the table: parquet in the gold layer, CSV next to the other tables.
medals_by_continent.to_parquet(
    GOLD_PATH / 'medals_by_continent_total.parquet',
    index=False,
)
medals_by_continent.to_csv(
    OUTPUT_PATH / 'tables' / 'medals_by_continent_total.csv',
    index=False,
)
print("\n‚úì Dados salvos")

### 4.2 Gráfico de Pizza

In [None]:
# Pie chart of each continent's share of the accumulated medal total.
fig, ax = plt.subplots(figsize=(12, 8))

n_slices = len(medals_by_continent)

# The hand-picked colours cover up to five wedges. ax.pie cycles through a
# shorter list, which would paint two wedges with the same colour, so switch
# to a generated palette with one colour per wedge when there are more.
base_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
if n_slices <= len(base_colors):
    colors = base_colors
else:
    colors = sns.color_palette('husl', n_slices)

# Offset only the first wedge (the first row of medals_by_continent).
explode = [0.05 if i == 0 else 0 for i in range(n_slices)]

wedges, texts, autotexts = ax.pie(
    medals_by_continent['total'],
    labels=medals_by_continent['continent'],
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
    explode=explode,
    textprops={'fontsize': 12, 'weight': 'bold'}
)

# Legend entries carry the absolute medal count of each continent.
legend_labels = [
    f"{continent}: {int(total)} medalhas"
    for continent, total in zip(
        medals_by_continent['continent'], medals_by_continent['total']
    )
]
# Pass the wedges explicitly so each label is tied to its own handle rather
# than relying on the implicit artist order.
ax.legend(wedges, legend_labels, loc='center left',
          bbox_to_anchor=(1, 0, 0.5, 1), fontsize=10)

ax.set_title('Distribuição Total de Medalhas por Continente (1896-2022)',
             fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()

# Save before show(): the figure may be cleared once it has been displayed.
plt.savefig(OUTPUT_PATH / 'figures' / 'medals_by_continent_pie.png', dpi=300, bbox_inches='tight')
print("✓ Gráfico salvo")
plt.show()

### 4.3 Medalhas por Edição

In [None]:
# Total medals per (year, continent) pair, in long format.
# NOTE(review): the grouping key is 'year', not an edition identifier; if two
# Games fall in the same year their medals are summed together - confirm that
# this is the intended meaning of "per edition".
yearly_totals = df_known.groupby(['year', 'continent'])['total'].sum()
medals_by_year = yearly_totals.reset_index()

# Wide format: one row per year, one column per continent. Combinations with
# no rows in the data become 0.
medals_pivot = medals_by_year.pivot(index='year', columns='continent', values='total')
medals_pivot = medals_pivot.fillna(0)

print("\nüìà Medalhas por Edi√ß√£o - Primeiras linhas:")
display(medals_pivot.head())

# Persist: parquet gets 'year' back as a regular column, the CSV keeps it as
# the index column.
pivot_with_year_column = medals_pivot.reset_index()
pivot_with_year_column.to_parquet(GOLD_PATH / 'medals_by_year_continent.parquet', index=False)
medals_pivot.to_csv(OUTPUT_PATH / 'tables' / 'medals_by_year_continent.csv')
print("‚úì Dados salvos")

### 4.4 Gráfico de Linha - Evolução

In [None]:
# Line chart: one series per continent across the years in medals_pivot.
fig, ax = plt.subplots(figsize=(16, 8))

# Shared look for every series and for both axis labels.
line_style = dict(marker='o', linewidth=2, markersize=4, alpha=0.8)
axis_label_font = dict(fontsize=12, fontweight='bold')

for continent, yearly_totals in medals_pivot.items():
    ax.plot(medals_pivot.index, yearly_totals, label=continent, **line_style)

ax.set_xlabel('Ano', **axis_label_font)
ax.set_ylabel('Total de Medalhas', **axis_label_font)
ax.set_title(
    'Evolu√ß√£o de Medalhas por Continente (1896-2022)',
    fontsize=14,
    fontweight='bold',
    pad=20,
)
ax.legend(loc='upper left', fontsize=11)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()

# Write the PNG first, then display the figure.
figure_path = OUTPUT_PATH / 'figures' / 'medals_by_year_continent_line.png'
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
print("‚úì Gr√°fico salvo")
plt.show()

### 4.5 Número Médio de Atletas

In [None]:
# Athlete rows already tagged with a continent (produced elsewhere).
athletes_continent_file = BRONZE_PATH / 'athletes_by_continent.parquet'

if not athletes_continent_file.exists():
    # Without the input file this section is skipped with a warning.
    print("‚ö† Arquivo de atletas por continente n√£o encontrado")
else:
    df_athletes_continent = pd.read_parquet(athletes_continent_file)

    # Distinct athletes per (edition, continent), ignoring 'Unknown' rows.
    known_rows = df_athletes_continent['continent'] != 'Unknown'
    distinct_athletes = (
        df_athletes_continent[known_rows]
        .groupby(['edition', 'continent'])['athlete_id']
        .nunique()
    )
    athletes_by_edition = distinct_athletes.reset_index(name='num_athletes')

    # Per-continent statistics of those per-edition counts: mean, standard
    # deviation, minimum and maximum, rounded to 2 decimals and ordered by
    # the mean, highest first.
    stat_columns = [
        ('m√©dia', 'mean'),
        ('desvio_padr√£o', 'std'),
        ('m√≠nimo', 'min'),
        ('m√°ximo', 'max'),
    ]
    counts_by_continent = athletes_by_edition.groupby('continent')['num_athletes']
    avg_athletes = (
        counts_by_continent.agg(stat_columns)
        .round(2)
        .reset_index()
        .sort_values('m√©dia', ascending=False)
    )

    print("\nüë• N√∫mero M√©dio de Atletas por Continente:")
    display(avg_athletes)

    # Persist: parquet in the gold layer, CSV next to the other tables.
    avg_athletes.to_parquet(GOLD_PATH / 'avg_athletes_by_continent.parquet', index=False)
    avg_athletes.to_csv(OUTPUT_PATH / 'tables' / 'avg_athletes_by_continent.csv', index=False)
    print("‚úì Dados salvos")

## 5. Resumo da Questão 2.1

In [None]:
# Static end-of-notebook recap: the analyses performed and the artefacts the
# cells above are meant to produce. Nothing is checked on disk here.
banner = "=" * 80

summary_lines = [
    "\n" + banner,
    "RESUMO - QUEST√ÉO 2.1",
    banner,
    "\n‚úÖ An√°lises Realizadas:",
    "   ‚Ä¢ Distribui√ß√£o total de medalhas por continente",
    "   ‚Ä¢ Gr√°fico de pizza (distribui√ß√£o percentual)",
    "   ‚Ä¢ Evolu√ß√£o temporal por edi√ß√£o",
    "   ‚Ä¢ Gr√°fico de linha (evolu√ß√£o hist√≥rica)",
    "   ‚Ä¢ Estat√≠sticas de n√∫mero m√©dio de atletas",
    "\n‚úÖ Arquivos Gerados:",
    "   ‚Ä¢ medals_by_continent_total.parquet (Gold)",
    "   ‚Ä¢ medals_by_year_continent.parquet (Gold)",
    "   ‚Ä¢ avg_athletes_by_continent.parquet (Gold)",
    "   ‚Ä¢ medals_by_continent_pie.png (Figura)",
    "   ‚Ä¢ medals_by_year_continent_line.png (Figura)",
    "\n" + banner,
    "‚úì QUEST√ÉO 2.1 CONCLU√çDA!",
    banner,
]

# One print per entry, so the output matches a sequence of print() calls.
for line in summary_lines:
    print(line)