# 02 - Geração de Estudantes Sintéticos

Este notebook implementa a **Etapa 2** do pipeline SINKT: geração de 100+ estudantes sintéticos baseados nos perfis cognitivos criados.

## Objetivo
Gerar estudantes individualizados com variação dentro de cada perfil, mantendo reprodutibilidade.

## Saída
- `data/output/students.json`: Arquivo JSON contendo todos os estudantes sintéticos

## Importação de Bibliotecas

In [None]:
import json
import os
from datetime import datetime
from typing import Dict, List, Any
import numpy as np
import pandas as pd

# Confirmation message: reaching this line means every import above succeeded.
print("‚úÖ Bibliotecas importadas com sucesso")

## Carregamento dos Perfis Cognitivos

In [None]:
# Load the profiles generated by the previous notebook.
# The path is relative to the kernel's working directory.
with open('data/output/profiles.json', 'r', encoding='utf-8') as f:
    profiles_data = json.load(f)

# Only the top-level 'profiles' entry is used; it is iterated below as a
# mapping of profile id -> profile dict, and each profile must have a 'nome' key.
profiles = profiles_data['profiles']
print(f"‚úÖ {len(profiles)} perfis carregados")
print(f"\nPerfis dispon√≠veis:")
for pid, profile in profiles.items():
    print(f"  - {profile['nome']} ({pid})")

## Configuração de Parâmetros

In [None]:
# Settings
NUM_STUDENTS = 100
SEED = 42
INDIVIDUAL_VARIATION = 0.15  # +/-15% individual variation around each profile value
# Share of the cohort assigned to each profile id (fractions of NUM_STUDENTS).
PROFILE_DISTRIBUTION = {
    'balanced': 0.25,
    'quick_learner': 0.15,
    'careful': 0.15,
    'struggling': 0.15,
    'logical': 0.15,
    'intuitive': 0.10,
    'perfectionist': 0.05
}

# Fail fast on an inconsistent configuration: the generator hands whatever is
# left over to the last profile, so shares that do not add up to 1 would
# silently distort that profile's size instead of raising an error.
assert NUM_STUDENTS > 0, "NUM_STUDENTS deve ser positivo"
assert all(share >= 0 for share in PROFILE_DISTRIBUTION.values()), \
    "PROFILE_DISTRIBUTION deve conter apenas valores >= 0"
assert abs(sum(PROFILE_DISTRIBUTION.values()) - 1.0) < 1e-9, \
    "PROFILE_DISTRIBUTION deve somar 1.0"

# Seed NumPy's global random generator for reproducibility
np.random.seed(SEED)

print(f"üéØ Configura√ß√µes:")
print(f"  - N√∫mero de estudantes: {NUM_STUDENTS}")
print(f"  - Seed: {SEED}")
print(f"  - Varia√ß√£o individual: ¬±{INDIVIDUAL_VARIATION*100:.0f}%")
print(f"  - Distribui√ß√£o de perfis: {PROFILE_DISTRIBUTION}")

## Geração de Estudantes com Variação Individual

In [None]:
def generate_students(profiles: Dict[str, Dict], num_students: int,
                     distribution: Dict[str, float], variation: float,
                     seed: int) -> List[Dict[str, Any]]:
    """Generate synthetic students by randomly varying each profile's parameters.

    Students are allotted to profiles in the order the ids appear in
    ``distribution``: every profile except the last receives
    ``num_students * share`` truncated to an integer, and the last profile
    receives whatever is left so the counts add up to ``num_students``.

    Args:
        profiles: Mapping of profile id to a profile dict holding the base
            parameter values.
        num_students: Total number of students requested.
        distribution: Mapping of profile id to its share of the cohort.
        variation: Maximum relative variation; each parameter is multiplied by
            ``1 + u`` with ``u`` drawn uniformly from ``[-variation, variation)``.
        seed: Seed applied to NumPy's global random generator.

    Returns:
        A list of student dicts with the keys ``student_id``, ``profile_id``,
        ``parameters``, ``seed`` and ``generated_at``.

    Warns:
        RuntimeWarning: if ``distribution`` allots students to a profile id
            that is missing from ``profiles``. Those students are skipped, so
            fewer than ``num_students`` are returned.

    Notes:
        - Calling this function reseeds NumPy's *global* random generator.
        - The per-student ``seed`` field is ``seed`` plus the student's index;
          it is only recorded here and is not used to draw the parameters.
        - ``generated_at`` is the wall-clock time, so it differs between runs
          even though the parameters are reproducible.
    """
    import warnings  # local import: only needed for the missing-profile notice

    param_names = ('mastery_init_level', 'learn_rate', 'slip', 'guess',
                   'logic_skill', 'reading_skill', 'memory_capacity',
                   'tech_familiarity', 'learning_consistency')

    np.random.seed(seed)

    # Decide how many students each profile gets.
    profile_ids = list(distribution.keys())
    profile_counts: Dict[str, int] = {}
    remaining = num_students
    for pid in profile_ids[:-1]:
        # Round away binary floating-point noise before truncating, otherwise
        # e.g. 100 * 0.29 == 28.999999999999996 would be cut down to 28.
        count = int(round(num_students * distribution[pid], 9))
        profile_counts[pid] = count
        remaining -= count
    if profile_ids:
        # The last profile absorbs the remainder left by the truncations above.
        profile_counts[profile_ids[-1]] = remaining

    # Report profiles that cannot be generated instead of dropping them silently.
    missing = [pid for pid, count in profile_counts.items()
               if count > 0 and pid not in profiles]
    if missing:
        skipped = sum(profile_counts[pid] for pid in missing)
        warnings.warn(
            f"Perfis ausentes em 'profiles' foram ignorados: {missing} "
            f"({skipped} de {num_students} estudantes deixaram de ser gerados)",
            RuntimeWarning,
            stacklevel=2,
        )

    students = []
    student_id = 0

    for profile_id, count in profile_counts.items():
        if profile_id not in profiles:
            continue

        profile = profiles[profile_id]

        for _ in range(count):
            parameters = {}

            # Vary every known parameter that the profile actually defines.
            for param in param_names:
                if param in profile:
                    variation_factor = np.random.uniform(-variation, variation)
                    new_value = profile[param] * (1 + variation_factor)
                    # Clamp to [0, 1]; float bounds keep the stored type a float
                    # even when the value is clamped.
                    new_value = max(0.0, min(1.0, new_value))
                    parameters[param] = round(new_value, 4)

            students.append({
                'student_id': f'student_{student_id:04d}',
                'profile_id': profile_id,
                'parameters': parameters,
                'seed': seed + student_id,
                'generated_at': datetime.now().isoformat(),
            })
            student_id += 1

    return students

# Build the synthetic cohort from the loaded profiles and the notebook settings
print("üîÑ Gerando estudantes sint√©ticos...")
students = generate_students(
    profiles=profiles,
    num_students=NUM_STUDENTS,
    distribution=PROFILE_DISTRIBUTION,
    variation=INDIVIDUAL_VARIATION,
    seed=SEED,
)
print(f"‚úÖ {len(students)} estudantes gerados com sucesso")

## Validação de Qualidade dos Estudantes

In [None]:
def validate_students(students: List[Dict], profiles: Dict[str, Dict]) -> Dict[str, Any]:
    """Check the generated students for structural problems.

    The checks are: student ids are unique, every ``profile_id`` exists in
    ``profiles``, and every parameter value is a real number inside [0, 1].
    NaN and booleans are rejected as well.

    Args:
        students: Student dicts, each with the keys ``student_id``,
            ``profile_id`` and ``parameters``.
        profiles: Mapping of the known profile ids.

    Returns:
        A dict with ``total_students`` (int), ``profile_distribution``
        (profile id -> number of students), ``issues`` (list of messages) and
        ``is_valid`` (True when no issue was found).
    """
    issues = []

    # Student ids must be unique
    student_ids = [s['student_id'] for s in students]
    if len(student_ids) != len(set(student_ids)):
        issues.append("IDs de estudantes n√£o s√£o √∫nicos")

    profile_dist = {}
    for student in students:
        # Tally the profile distribution while walking the students once
        pid = student['profile_id']
        profile_dist[pid] = profile_dist.get(pid, 0) + 1

        # The profile reference must point at a known profile
        if pid not in profiles:
            issues.append(f"Estudante {student['student_id']}: perfil inv√°lido")

        # Every parameter must be a number within [0, 1]
        for param, value in student['parameters'].items():
            # bool is a subclass of int, so it has to be excluded explicitly
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                issues.append(f"Estudante {student['student_id']}: {param} n√£o √© num√©rico")
            # Written as a negated chained comparison so that NaN, for which
            # every comparison is False, is reported as out of range too
            elif not (0 <= value <= 1):
                issues.append(f"Estudante {student['student_id']}: {param} fora do range")

    return {
        'total_students': len(students),
        'profile_distribution': profile_dist,
        'issues': issues,
        'is_valid': len(issues) == 0
    }

# Validate the generated students against the loaded profiles
validation_result = validate_students(students, profiles)

# Headline figures: total count and overall validity
print("\n‚úÖ Valida√ß√£o de Estudantes:")
print(f"  - Total: {validation_result['total_students']}")
print(f"  - V√°lido: {'Sim' if validation_result['is_valid'] else 'N√£o'}")
print(f"\nüìä Distribui√ß√£o por Perfil:")
# Profiles are listed in alphabetical order of their id
for pid, count in sorted(validation_result['profile_distribution'].items()):
    # Percentage of all generated students that belong to this profile
    pct = (count / len(students)) * 100
    print(f"  - {pid}: {count} estudantes ({pct:.1f}%)")

# Issues are only reported, nothing is raised, so later cells still run.
if validation_result['issues']:
    print(f"\n‚ùå Problemas encontrados:")
    # Only the first five issues are printed to keep the output short
    for issue in validation_result['issues'][:5]:
        print(f"  - {issue}")

## Análise de Qualidade da Amostra

In [None]:
def analyze_student_quality(students: List[Dict]) -> pd.DataFrame:
    """Summarise every student with two aggregate scores, one row per student.

    ``cognitive_score`` is the mean of ``logic_skill``, ``reading_skill`` and
    ``memory_capacity``. ``learning_potential`` is the weighted sum
    0.3 * ``learn_rate`` + 0.3 * ``mastery_init_level`` +
    0.4 * ``learning_consistency``. A parameter that a student does not have
    counts as 0.

    Args:
        students: Student dicts with ``student_id``, ``profile_id`` and
            ``parameters``.

    Returns:
        A DataFrame with the columns ``student_id``, ``profile_id``,
        ``cognitive_score``, ``learning_potential``, ``mastery_init`` and
        ``learn_rate``.
    """

    def build_row(student: Dict) -> Dict[str, Any]:
        """Compute the summary record of a single student."""
        lookup = student['parameters'].get
        mastery_init = lookup('mastery_init_level', 0)
        learn_rate = lookup('learn_rate', 0)

        # Plain average of the three cognitive abilities
        abilities = [
            lookup(name, 0)
            for name in ('logic_skill', 'reading_skill', 'memory_capacity')
        ]
        cognitive_score = np.mean(abilities)

        # Weighted blend; consistency carries the largest weight
        learning_potential = (
            learn_rate * 0.3 +
            mastery_init * 0.3 +
            lookup('learning_consistency', 0) * 0.4
        )

        return {
            'student_id': student['student_id'],
            'profile_id': student['profile_id'],
            'cognitive_score': cognitive_score,
            'learning_potential': learning_potential,
            'mastery_init': mastery_init,
            'learn_rate': learn_rate,
        }

    return pd.DataFrame([build_row(student) for student in students])

# Per-student quality summary (one row per student)
quality_df = analyze_student_quality(students)

# Report mean / standard deviation (and range, for the two aggregate scores)
# of each summary column. The deviation is whatever pandas' Series.std()
# returns with its default arguments.
print("\nüìä An√°lise de Qualidade da Amostra:")
print(f"\n  Cognitive Score:")
print(f"    M√©dia: {quality_df['cognitive_score'].mean():.3f}")
print(f"    Desvio: {quality_df['cognitive_score'].std():.3f}")
print(f"    Range: [{quality_df['cognitive_score'].min():.3f}, {quality_df['cognitive_score'].max():.3f}]")

print(f"\n  Learning Potential:")
print(f"    M√©dia: {quality_df['learning_potential'].mean():.3f}")
print(f"    Desvio: {quality_df['learning_potential'].std():.3f}")
print(f"    Range: [{quality_df['learning_potential'].min():.3f}, {quality_df['learning_potential'].max():.3f}]")

print(f"\n  Mastery Initial Level:")
print(f"    M√©dia: {quality_df['mastery_init'].mean():.3f}")
print(f"    Desvio: {quality_df['mastery_init'].std():.3f}")

# Learn rate is printed with four decimals instead of three
print(f"\n  Learn Rate:")
print(f"    M√©dia: {quality_df['learn_rate'].mean():.4f}")
print(f"    Desvio: {quality_df['learn_rate'].std():.4f}")

## Salvamento dos Estudantes

In [None]:
# Build the complete output structure, including metadata
output_data = {
    "metadata": {
        "description": "Conjunto de estudantes sint√©ticos gerados para valida√ß√£o do SINKT",
        "version": "1.0.0",
        # Wall-clock timestamp, so this field differs on every run
        "created_at": datetime.now().isoformat(),
        "num_students": len(students),
        "generation_seed": SEED,
        # Records the configured shares, not the counts actually generated
        "profile_distribution": PROFILE_DISTRIBUTION,
        "individual_variation": INDIVIDUAL_VARIATION,
        # Aggregates from quality_df, converted to built-in floats
        "quality_metrics": {
            "cognitive_score_mean": float(quality_df['cognitive_score'].mean()),
            "cognitive_score_std": float(quality_df['cognitive_score'].std()),
            "learning_potential_mean": float(quality_df['learning_potential'].mean()),
            "learning_potential_std": float(quality_df['learning_potential'].std())
        }
    },
    # Students are stored as a mapping keyed by their student_id
    "students": {s['student_id']: s for s in students}
}

# Write the JSON file; the path is relative to the kernel's working directory
# and an existing file is overwritten ('w' mode).
output_file = 'data/output/students.json'
with open(output_file, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Estudantes salvos em: {output_file}")
print(f"üì¶ Total de estudantes: {len(students)}")

## Resumo da Execução

In [None]:
# Final banner summarising the run. It reads output_file, students, SEED and
# INDIVIDUAL_VARIATION from the earlier cells, so those cells must have run first.
print("\n" + "="*70)
print("üéâ GERA√á√ÉO DE ESTUDANTES SINT√âTICOS CONCLU√çDA COM SUCESSO!")
print("="*70)
print(f"\nüìÅ Arquivo gerado:")
print(f"  - {output_file}")
print(f"\nüìä Resumo:")
print(f"  - Total de estudantes: {len(students)}")
print(f"  - Seed: {SEED}")
print(f"  - Varia√ß√£o individual: ¬±{INDIVIDUAL_VARIATION*100:.0f}%")
# Points the reader to the next notebook in the pipeline
print(f"\n‚úÖ Pr√≥ximo passo: Execute o notebook '03_geracao_interacoes.ipynb'")
print("="*70)