# Parte 2: Análise por Continente (Continuação)

## 👥 Autores
**Carlos Lavor Neto** - Engenharia de Computação - UEA

**Alexandro Pantoja** - Engenharia de Computação - UEA

## Perguntas Analíticas:
- 2.2: Crescimento da representação ao longo do tempo
- 2.3: Participação feminina por continente
- 2.4: Modalidades mais fortes por continente
- 2.5: Crescimento nas medalhas entre 1986 e 2024

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
from datetime import datetime
import warnings
# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

# Plot configuration: theme, colour palette and default figure/font sizes
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("Set2")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 10,
})

In [None]:
# Paths: all data layers are resolved relative to the parent of the current
# working directory.
BASE_PATH = Path('..')
BRONZE_PATH = BASE_PATH / 'bronze'
GOLD_PATH = BASE_PATH / 'gold'
OUTPUT_PATH = BASE_PATH / 'outputs'

# Make sure every directory this notebook writes to exists, so that the later
# to_parquet / to_csv / savefig calls do not fail on a fresh checkout.
for out_dir in (GOLD_PATH, OUTPUT_PATH / 'tables', OUTPUT_PATH / 'figures'):
    out_dir.mkdir(parents=True, exist_ok=True)

# Load the two bronze-layer tables used by the analysis
print("Carregando dados...")
df_athletes_continent = pd.read_parquet(BRONZE_PATH / 'athletes_by_continent.parquet')
df_medals_continent = pd.read_parquet(BRONZE_PATH / 'medals_by_continent.parquet')

print(f"‚úì athletes_continent: {df_athletes_continent.shape}")
print(f"‚úì medals_continent: {df_medals_continent.shape}")

## 6. Pergunta 2.2: Crescimento da Representação ao Longo do Tempo

In [None]:
banner = "="*80
print("\n" + banner)
print("PERGUNTA 2.2: CRESCIMENTO DA REPRESENTA√á√ÉO AO LONGO DO TEMPO")
print(banner)

# Keep only the rows whose continent is not the 'Unknown' placeholder
known_mask = df_athletes_continent['continent'] != 'Unknown'
df_known = df_athletes_continent[known_mask].copy()

# Number of distinct athletes per (year, continent)
athletes_by_year = (
    df_known
    .groupby(['year', 'continent'])['athlete_id']
    .nunique()
    .reset_index(name='num_athletes')
)

# Descriptive statistics of the yearly athlete count, per continent.
# Each pair is (output column name, aggregation function).
summary_spec = [
    ('m√©dia', 'mean'),
    ('desvio_padr√£o', 'std'),
    ('m√≠nimo', 'min'),
    ('m√°ximo', 'max'),
    ('mediana', 'median'),
]
stats_by_continent = (
    athletes_by_year
    .groupby('continent')['num_athletes']
    .agg(summary_spec)
    .round(2)
    .reset_index()
    .sort_values('m√©dia', ascending=False)  # largest mean first
)

print("\nEstat√≠sticas de Representa√ß√£o por Continente:")
display(stats_by_continent)

# Persist the table to the gold layer (parquet) and to the CSV outputs
output_file = GOLD_PATH / 'representation_stats_by_continent.parquet'
csv_file = OUTPUT_PATH / 'tables' / 'representation_stats_by_continent.csv'
stats_by_continent.to_parquet(output_file, index=False)
stats_by_continent.to_csv(csv_file, index=False)
print(f"\n‚úì Salvo: {output_file}")

### 6.1 Gráfico de Evolução da Representação

In [None]:
# Reshape to one row per year and one column per continent;
# (year, continent) pairs with no data become 0
athletes_pivot = (
    athletes_by_year
    .pivot(index='year', columns='continent', values='num_athletes')
    .fillna(0)
)

# Line chart: one series per continent
fig, ax = plt.subplots(figsize=(16, 8))

line_style = dict(marker='o', linewidth=2.5, markersize=5, alpha=0.8)
for continent, counts in athletes_pivot.items():
    ax.plot(athletes_pivot.index, counts, label=continent, **line_style)

axis_label_style = dict(fontsize=12, fontweight='bold')
ax.set_xlabel('Ano', **axis_label_style)
ax.set_ylabel('N√∫mero de Atletas', **axis_label_style)
ax.set_title(
    'Crescimento da Representa√ß√£o de Atletas por Continente (1896-2022)',
    fontsize=14, fontweight='bold', pad=20,
)
ax.legend(loc='upper left', fontsize=11)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()

# Save the figure before showing it
output_file = OUTPUT_PATH / 'figures' / 'athletes_growth_by_continent.png'
plt.savefig(output_file, dpi=300, bbox_inches='tight')
print(f"‚úì Gr√°fico salvo: {output_file}")
plt.show()

### 6.2 Taxa de Crescimento por Período

In [None]:
# Growth rate per decade.
# The decade of each year is derived from the index and used directly as the
# grouping key, so no helper column is added to athletes_pivot.
decade = pd.Index((athletes_pivot.index // 10) * 10, name='decade')

# Mean number of athletes per decade, one column per continent
by_decade = athletes_pivot.groupby(decade).mean()

# Decade-over-decade change, in percent
growth_rate = by_decade.pct_change() * 100

# A continent going from 0 to N athletes yields inf and 0 -> 0 yields NaN;
# mark both as undefined instead of leaving inf in the table.
growth_rate = growth_rate.replace([np.inf, -np.inf], np.nan)

# Drop only the decades where no continent has a defined rate (the first
# decade always is one). A plain dropna() would discard every decade in which
# a single continent had an undefined rate.
growth_rate = growth_rate.dropna(how='all')

print("\nTaxa de Crescimento M√©dio por D√©cada (%):")
display(growth_rate.round(2))

## 7. Pergunta 2.3: Participação Feminina por Continente

In [None]:
print("\n" + "="*80)
print("PERGUNTA 2.3: PARTICIPA√á√ÉO FEMININA POR CONTINENTE")
print("="*80)

# Keep only the rows where sex is recorded
df_with_sex = df_known[df_known['sex'].notna()].copy()

# Distinct athletes per (year, continent, sex)
gender_by_year = df_with_sex.groupby(['year', 'continent', 'sex'])['athlete_id'].nunique().reset_index()
gender_by_year.columns = ['year', 'continent', 'sex', 'num_athletes']

# Pivot so that each sex value becomes its own column
gender_pivot = gender_by_year.pivot_table(
    index=['year', 'continent'],
    columns='sex',
    values='num_athletes',
    fill_value=0
).reset_index()

# Guarantee that both sex columns exist so the saved table always has the
# same schema (year, continent, Female, Male, total, female_pct), even when
# one of the two labels never occurs in the data.
for sex_label in ('Female', 'Male'):
    if sex_label not in gender_pivot.columns:
        gender_pivot[sex_label] = 0

# Female share in percent. When total is 0 (neither label matched) the
# division yields NaN, which makes the problem visible instead of reporting a
# misleading 0%.
# NOTE(review): assumes the 'sex' column uses the labels 'Female'/'Male' -
# confirm against the bronze table.
gender_pivot['total'] = gender_pivot['Female'] + gender_pivot['Male']
gender_pivot['female_pct'] = (gender_pivot['Female'] / gender_pivot['total'] * 100).round(2)

print("\nParticipa√ß√£o Feminina - Primeiros registros:")
display(gender_pivot.head(20))

# Persist the table to the gold layer (parquet) and to the CSV outputs
output_file = GOLD_PATH / 'female_participation_by_continent.parquet'
gender_pivot.to_parquet(output_file, index=False)
gender_pivot.to_csv(OUTPUT_PATH / 'tables' / 'female_participation_by_continent.csv', index=False)
print(f"\n‚úì Salvo: {output_file}")

### 7.1 Gráfico - Evolução da Participação Feminina

In [None]:
# Line chart of the female share (%), one series per continent
fig, ax = plt.subplots(figsize=(16, 8))

line_style = dict(marker='o', linewidth=2.5, markersize=5, alpha=0.8)
# sort=False keeps the continents in order of first appearance
for continent, data in gender_pivot.groupby('continent', sort=False):
    ax.plot(data['year'], data['female_pct'], label=continent, **line_style)

axis_label_style = dict(fontsize=12, fontweight='bold')
ax.set_xlabel('Ano', **axis_label_style)
ax.set_ylabel('Participa√ß√£o Feminina (%)', **axis_label_style)
ax.set_title(
    'Evolu√ß√£o da Participa√ß√£o Feminina por Continente (1896-2022)',
    fontsize=14, fontweight='bold', pad=20,
)
ax.legend(loc='upper left', fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_ylim(0, 100)  # percentage scale
plt.xticks(rotation=45)
plt.tight_layout()

# Save the figure before showing it
output_file = OUTPUT_PATH / 'figures' / 'female_participation_evolution.png'
plt.savefig(output_file, dpi=300, bbox_inches='tight')
print(f"‚úì Gr√°fico salvo: {output_file}")
plt.show()

### 7.2 Gráfico - Barras Agrupadas (Últimas 5 Edições)

In [None]:
# Select the five most recent years present in the table
recent_years = sorted(gender_pivot['year'].unique())[-5:]
recent_data = gender_pivot[gender_pivot['year'].isin(recent_years)]

# Grouped bar chart: one group per year, one bar per continent
fig, ax = plt.subplots(figsize=(14, 8))

x = np.arange(len(recent_years))
continents = sorted(recent_data['continent'].unique())
# Keep the original bar width, but shrink it when there are so many continents
# that a group of bars would overlap the neighbouring year.
width = min(0.15, 0.9 / max(len(continents), 1))

for i, continent in enumerate(continents):
    # Align the continent's values to recent_years: a continent with no row
    # for one of the years would otherwise give ax.bar arrays of different
    # lengths. Missing years get a zero-height (invisible) bar.
    pct_by_year = (
        recent_data[recent_data['continent'] == continent]
        .set_index('year')['female_pct']
        .reindex(recent_years)
        .fillna(0)
    )
    # Centre the group of bars on the tick of its year
    offset = width * (i - len(continents)/2 + 0.5)
    ax.bar(x + offset, pct_by_year.to_numpy(), width, label=continent, alpha=0.8)

ax.set_xlabel('Ano', fontsize=12, fontweight='bold')
ax.set_ylabel('Participa√ß√£o Feminina (%)', fontsize=12, fontweight='bold')
ax.set_title('Participa√ß√£o Feminina por Continente - √öltimas 5 Edi√ß√µes', 
             fontsize=14, fontweight='bold', pad=20)
ax.set_xticks(x)
ax.set_xticklabels(recent_years)
ax.legend(loc='upper left', fontsize=10)
ax.grid(axis='y', alpha=0.3)
ax.set_ylim(0, 100)  # percentage scale
plt.tight_layout()

# Save the figure before showing it
output_file = OUTPUT_PATH / 'figures' / 'female_participation_recent.png'
plt.savefig(output_file, dpi=300, bbox_inches='tight')
print(f"‚úì Gr√°fico salvo: {output_file}")
plt.show()

## 8. Pergunta 2.4: Modalidades Mais Fortes por Continente

In [None]:
banner = "="*80
print("\n" + banner)
print("PERGUNTA 2.4: MODALIDADES MAIS FORTES POR CONTINENTE")
print(banner)

# Rows that have a medal and whose continent is not 'Unknown'
has_continent = df_athletes_continent['continent'] != 'Unknown'
has_medal = df_athletes_continent['medal'].notna()
df_with_medals = df_athletes_continent[has_continent & has_medal].copy()

# Number of medal rows per (continent, sport)
medals_by_sport = (
    df_with_medals
    .groupby(['continent', 'sport'])
    .size()
    .reset_index(name='medal_count')
)


def _top_five(continent_rows):
    """Return up to five rows with the largest medal_count, ranked from 1."""
    best = continent_rows.nlargest(5, 'medal_count')
    return best.assign(rank=range(1, len(best) + 1))


# Top 5 sports for every continent, stacked into a single table
top_sports_by_continent = [
    _top_five(medals_by_sport[medals_by_sport['continent'] == continent])
    for continent in medals_by_sport['continent'].unique()
]
df_top_sports = pd.concat(top_sports_by_continent, ignore_index=True)

print("\nTop 5 Esportes por Continente:")
# sort=False keeps the continents in order of first appearance
for continent, ranking in df_top_sports.groupby('continent', sort=False):
    print(f"\n{continent}:")
    display(ranking[['rank', 'sport', 'medal_count']])

# Persist the table to the gold layer (parquet) and to the CSV outputs
output_file = GOLD_PATH / 'top_sports_by_continent.parquet'
csv_file = OUTPUT_PATH / 'tables' / 'top_sports_by_continent.csv'
df_top_sports.to_parquet(output_file, index=False)
df_top_sports.to_csv(csv_file, index=False)
print(f"\n‚úì Salvo: {output_file}")

### 8.1 Gráfico - Barras Horizontais por Continente

In [None]:
# One horizontal-bar panel per continent, laid out on a 3x2 grid
fig, axes = plt.subplots(3, 2, figsize=(18, 14))
axes = axes.flatten()

continents = sorted(df_top_sports['continent'].unique())

# zip stops at the shorter sequence, so continents beyond the available
# panels are not drawn
for i, (ax, continent) in enumerate(zip(axes, continents)):
    data = (
        df_top_sports[df_top_sports['continent'] == continent]
        .sort_values('medal_count', ascending=True)
    )

    ax.barh(data['sport'], data['medal_count'], color=f'C{i}', alpha=0.7)
    ax.set_xlabel('N√∫mero de Medalhas', fontsize=10, fontweight='bold')
    ax.set_title(f'{continent}', fontsize=12, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    # Write each bar's value 10 units to the right of the bar end
    for j, count in enumerate(data['medal_count']):
        ax.text(count + 10, j, str(int(count)),
                va='center', fontsize=9, fontweight='bold')

# Remove the panels that were not used
for spare_ax in axes[len(continents):]:
    fig.delaxes(spare_ax)

plt.suptitle('Top 5 Esportes com Mais Medalhas por Continente',
             fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()

# Save the figure before showing it
output_file = OUTPUT_PATH / 'figures' / 'top_sports_by_continent.png'
plt.savefig(output_file, dpi=300, bbox_inches='tight')
print(f"‚úì Gr√°fico salvo: {output_file}")
plt.show()

## 9. Pergunta 2.5: Crescimento entre 1986 e 2024

In [None]:
print("\n" + "="*80)
print("PERGUNTA 2.5: CRESCIMENTO NAS MEDALHAS ENTRE 1986 E 2024")
print("="*80)

# Rows from 1986 onwards whose continent is not 'Unknown'
df_since_1986 = df_medals_continent[
    (df_medals_continent['year'] >= 1986) & 
    (df_medals_continent['continent'] != 'Unknown')
].copy()

# Sum of 'total' per continent for each of the two comparison periods
# (both bounds inclusive)
medals_1986_1995 = (
    df_since_1986[df_since_1986['year'].between(1986, 1995)]
    .groupby('continent')['total'].sum()
    .reset_index(name='medals_1986_1995')
)

medals_2014_2022 = (
    df_since_1986[df_since_1986['year'].between(2014, 2022)]
    .groupby('continent')['total'].sum()
    .reset_index(name='medals_2014_2022')
)

# Outer merge: a continent with rows in only one of the two periods is kept
# with 0 for the other period, instead of silently vanishing from the
# comparison as it would with the default inner merge.
growth_analysis = medals_1986_1995.merge(medals_2014_2022, on='continent', how='outer')
period_cols = ['medals_1986_1995', 'medals_2014_2022']
growth_analysis[period_cols] = growth_analysis[period_cols].fillna(0)

growth_analysis['absolute_growth'] = growth_analysis['medals_2014_2022'] - growth_analysis['medals_1986_1995']

# Percent growth is undefined for a zero baseline; use NaN there rather than
# letting the division produce inf.
baseline = growth_analysis['medals_1986_1995'].replace(0, np.nan)
growth_analysis['percent_growth'] = (
    growth_analysis['absolute_growth'] / baseline * 100
).round(2)

# Largest percent growth first (rows with undefined growth are placed last)
growth_analysis = growth_analysis.sort_values('percent_growth', ascending=False)

print("\nCrescimento de Medalhas por Continente (1986-1995 vs 2014-2022):")
display(growth_analysis)

# Persist the table to the gold layer (parquet) and to the CSV outputs
output_file = GOLD_PATH / 'medals_growth_1986_2024.parquet'
growth_analysis.to_parquet(output_file, index=False)
growth_analysis.to_csv(OUTPUT_PATH / 'tables' / 'medals_growth_1986_2024.csv', index=False)
print(f"\n‚úì Salvo: {output_file}")

### 9.1 Gráfico - Crescimento Percentual

In [None]:
# Two side-by-side horizontal bar charts: absolute and percent growth
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))

label_style = dict(fontsize=11, fontweight='bold')
zero_line_style = dict(x=0, color='black', linestyle='-', linewidth=0.8)

# Chart 1: absolute growth (green when positive, red otherwise)
absolute = growth_analysis['absolute_growth']
colors = ['green' if x > 0 else 'red' for x in absolute]
ax1.barh(growth_analysis['continent'], absolute, color=colors, alpha=0.7)
ax1.set_xlabel('Crescimento Absoluto (medalhas)', **label_style)
ax1.set_ylabel('Continente', **label_style)
ax1.set_title('Crescimento Absoluto de Medalhas\n(1986-1995 vs 2014-2022)',
              fontsize=12, fontweight='bold')
ax1.grid(axis='x', alpha=0.3)
ax1.axvline(**zero_line_style)

# Value labels, pushed 50 units away from the bar end on the side it points to
for i, value in enumerate(absolute):
    if value > 0:
        shift, align = 50, 'left'
    else:
        shift, align = -50, 'right'
    ax1.text(value + shift, i, f"{int(value)}",
             va='center', ha=align, fontweight='bold')

# Chart 2: percent growth (green when positive, red otherwise)
percent = growth_analysis['percent_growth']
colors = ['green' if x > 0 else 'red' for x in percent]
ax2.barh(growth_analysis['continent'], percent, color=colors, alpha=0.7)
ax2.set_xlabel('Crescimento Percentual (%)', **label_style)
ax2.set_title('Crescimento Percentual de Medalhas\n(1986-1995 vs 2014-2022)',
              fontsize=12, fontweight='bold')
ax2.grid(axis='x', alpha=0.3)
ax2.axvline(**zero_line_style)

# Value labels, pushed 5 units away from the bar end on the side it points to
for i, value in enumerate(percent):
    if value > 0:
        shift, align = 5, 'left'
    else:
        shift, align = -5, 'right'
    ax2.text(value + shift, i, f"{value:.1f}%",
             va='center', ha=align, fontweight='bold')

plt.tight_layout()

# Save the figure before showing it
output_file = OUTPUT_PATH / 'figures' / 'medals_growth_comparison.png'
plt.savefig(output_file, dpi=300, bbox_inches='tight')
print(f"‚úì Gr√°fico salvo: {output_file}")
plt.show()

### 9.2 Evolução Temporal Completa (1986-2022)

In [None]:
# Sum of 'total' per (year, continent) for the rows from 1986 onwards
medals_by_year_since_1986 = (
    df_since_1986
    .groupby(['year', 'continent'])['total']
    .sum()
    .reset_index()
)

# Line chart: one series per continent, in alphabetical order
fig, ax = plt.subplots(figsize=(16, 8))

line_style = dict(marker='o', linewidth=2.5, markersize=6, alpha=0.8)
for continent, data in medals_by_year_since_1986.groupby('continent'):
    ax.plot(data['year'], data['total'], label=continent, **line_style)

axis_label_style = dict(fontsize=12, fontweight='bold')
ax.set_xlabel('Ano', **axis_label_style)
ax.set_ylabel('Total de Medalhas', **axis_label_style)
ax.set_title(
    'Evolu√ß√£o de Medalhas por Continente (1986-2022)',
    fontsize=14, fontweight='bold', pad=20,
)
ax.legend(loc='upper left', fontsize=11)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()

# Save the figure before showing it
output_file = OUTPUT_PATH / 'figures' / 'medals_evolution_1986_2022.png'
plt.savefig(output_file, dpi=300, bbox_inches='tight')
print(f"‚úì Gr√°fico salvo: {output_file}")
plt.show()

## 10. Resumo Final - Parte 2

In [None]:
banner = "="*80
print("\n" + banner)
print("RESUMO FINAL - PARTE 2: AN√ÅLISE POR CONTINENTE")
print(banner)

# Checklist of what was produced for each question: (heading, bullet lines)
summary_sections = [
    ("\n2.1 - Distribui√ß√£o de Medalhas:", [
        "   ‚úì Gr√°fico de pizza com distribui√ß√£o total",
        "   ‚úì Gr√°fico de linha com evolu√ß√£o por edi√ß√£o",
        "   ‚úì Estat√≠sticas de n√∫mero m√©dio de atletas",
    ]),
    ("\n2.2 - Crescimento da Representa√ß√£o:", [
        "   ‚úì Estat√≠sticas descritivas (m√©dia, desvio padr√£o)",
        "   ‚úì Gr√°fico de evolu√ß√£o ao longo do tempo",
        "   ‚úì Taxa de crescimento por d√©cada",
    ]),
    ("\n2.3 - Participa√ß√£o Feminina:", [
        "   ‚úì Percentual de mulheres por continente e edi√ß√£o",
        "   ‚úì Gr√°fico de linha com evolu√ß√£o temporal",
        "   ‚úì Gr√°fico de barras agrupadas (√∫ltimas 5 edi√ß√µes)",
    ]),
    ("\n2.4 - Modalidades Mais Fortes:", [
        "   ‚úì Top 5 esportes por continente",
        "   ‚úì Gr√°ficos de barras horizontais por continente",
    ]),
    ("\n2.5 - Crescimento 1986-2024:", [
        "   ‚úì An√°lise comparativa de dois per√≠odos",
        "   ‚úì Gr√°ficos de crescimento absoluto e percentual",
        "   ‚úì Evolu√ß√£o temporal completa desde 1986",
    ]),
]
for heading, bullets in summary_sections:
    print(heading)
    for bullet in bullets:
        print(bullet)

print("\n" + banner)
print("‚úì AN√ÅLISE COMPLETA CONCLU√çDA!")
print(banner)

# List the files currently present in each output location, sorted by path
gold_files = list(GOLD_PATH.glob('*.parquet'))
figure_files = list((OUTPUT_PATH / 'figures').glob('*.png'))
table_files = list((OUTPUT_PATH / 'tables').glob('*.csv'))

listings = [
    ("\nArquivos gerados na camada Gold:", gold_files),
    ("\nGr√°ficos gerados:", figure_files),
    ("\nTabelas CSV geradas:", table_files),
]
for heading, files in listings:
    print(heading)
    for f in sorted(files):
        print(f"   - {f.name}")