# 03 - Geração de Dados de Interação com LLM

Este notebook implementa a **Etapa 3** do pipeline SINKT: geração de 3000-6000 interações simuladas usando LLM para respostas descritivas.

## Objetivo
Gerar sequências de interações realistas para cada estudante, incluindo respostas a questões e classificação de erros.

## Saída
- `data/output/interactions.json`: Arquivo JSON contendo todas as interações simuladas

## Importação de Bibliotecas

In [None]:
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Any, Tuple
import numpy as np
import pandas as pd
from collections import defaultdict
import random

# Confirmation that the imports cell ran; the message text was mis-encoded
# (UTF-8 bytes rendered as Mac Roman) and is restored here.
print("✅ Bibliotecas importadas com sucesso")

## Carregamento de Dados

In [None]:
def _load_json(path: str) -> Any:
    """Read and parse one UTF-8 encoded JSON file."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Profiles and students: the top-level key is mandatory, so a missing key
# raises KeyError.
profiles_data = _load_json('data/output/profiles.json')
profiles = profiles_data['profiles']

students_data = _load_json('data/output/students.json')
students = students_data['students']

# Question and concept graphs: fall back to an empty list when the key is absent.
questions_data = _load_json('data/json/questions_graph.json')
questions = questions_data.get('questions', [])

concepts_data = _load_json('data/json/concepts_graph.json')
concepts = concepts_data.get('concepts', [])

# Status report (message text restored from mis-encoded UTF-8).
print("✅ Dados carregados:")
print(f"  - Perfis: {len(profiles)}")
print(f"  - Estudantes: {len(students)}")
print(f"  - Questões: {len(questions)}")
print(f"  - Conceitos: {len(concepts)}")

## Configuração de Parâmetros

In [None]:
# Bounds on the number of simulated interactions drawn per student, and the
# seed shared by both random number generators.
MIN_INTERACTIONS_PER_STUDENT = 30
MAX_INTERACTIONS_PER_STUDENT = 60
SEED = 42

# Labels that can be attached to an incorrect answer.
ERROR_TYPES = [
    'misconception',      # wrong concept
    'careless',          # careless mistake
    'slip',              # lapse of attention
    'incomplete',        # incomplete answer
    'misunderstanding'   # wrong understanding
]

# Seed NumPy's global generator and the stdlib generator.
np.random.seed(SEED)
random.seed(SEED)

# Configuration summary (message text restored from mis-encoded UTF-8).
print("🎯 Configurações:")
print(f"  - Interações por estudante: {MIN_INTERACTIONS_PER_STUDENT}-{MAX_INTERACTIONS_PER_STUDENT}")
print(f"  - Tipos de erro: {len(ERROR_TYPES)}")
print(f"  - Seed: {SEED}")

## Funções de Geração de Interações

In [None]:
def calculate_response_probability(student_params: Dict, question_difficulty: float) -> float:
    """Return the probability that the student answers correctly.

    Uses a BKT-style formula, ``P = m + (1 - m) * guess - m * slip``, where
    ``m`` is the student's mastery after it has been scaled down according to
    the question difficulty.

    Args:
        student_params: Student parameters. Reads ``'mastery_init_level'``
            (default 0.5), ``'guess'`` (default 0.15) and ``'slip'``
            (default 0.1).
        question_difficulty: Difficulty factor; mastery is multiplied by
            ``1 - 0.3 * question_difficulty``.

    Returns:
        The probability clamped to the range 0..1.
    """
    p_known = student_params.get('mastery_init_level', 0.5)
    p_guess = student_params.get('guess', 0.15)
    p_slip = student_params.get('slip', 0.1)

    # Difficulty penalty applied to mastery before the formula is evaluated.
    effective = p_known * (1 - question_difficulty * 0.3)

    raw = effective + (1 - effective) * p_guess - effective * p_slip

    # Clamp from above first, then from below.
    capped = min(1, raw)
    return max(0, capped)

def generate_error_explanation(error_type: str, concept_name: str, student_profile: str) -> str:
    """Return a canned Portuguese explanation for an incorrect answer.

    The explanation texts were stored mis-encoded (UTF-8 bytes rendered as Mac
    Roman), which leaked garbled characters into the generated data set; the
    accented characters are restored here.

    Args:
        error_type: Error label; the five labels handled are
            ``'misconception'``, ``'careless'``, ``'slip'``, ``'incomplete'``
            and ``'misunderstanding'``.
        concept_name: Concept name interpolated into the message.
        student_profile: Not used by the current templates; kept so existing
            callers keep working.

    Returns:
        The explanation text, or ``"Erro desconhecido"`` for an unknown label.
    """
    explanations = {
        'misconception': f"Estudante confundiu o conceito de '{concept_name}' com outro similar. Necessário reforço conceitual.",
        'careless': f"Erro por descuido na execução. Estudante conhece '{concept_name}' mas não prestou atenção.",
        'slip': f"Erro por distração. Estudante sabe '{concept_name}' mas cometeu erro de digitação/lógica.",
        'incomplete': f"Resposta incompleta sobre '{concept_name}'. Faltaram detalhes importantes.",
        'misunderstanding': f"Entendimento errado do enunciado relacionado a '{concept_name}'."
    }
    return explanations.get(error_type, "Erro desconhecido")

def generate_student_response(is_correct: bool, question_type: str) -> str:
    """Produce a simulated answer string for one question attempt.

    Args:
        is_correct: Whether the attempt counts as correct. Only consulted for
            non-multiple-choice questions.
        question_type: ``'multiple_choice'`` yields a random option letter;
            any other value yields a placeholder text.

    Returns:
        One of ``'A'``-``'D'`` for multiple choice (drawn at random, regardless
        of ``is_correct``), otherwise a fixed placeholder that depends on
        ``is_correct``.
    """
    if question_type == 'multiple_choice':
        return random.choice(('A', 'B', 'C', 'D'))

    # Descriptive questions: return a placeholder standing in for generated text.
    outcome = "correta" if is_correct else "incorreta"
    return f"[Resposta {outcome} gerada via LLM]"

# Confirmation that the helper functions were defined (message text restored
# from mis-encoded UTF-8).
print("✅ Funções de geração definidas")

## Geração de Interações para Todos os Estudantes

In [None]:
def generate_interactions(students: Dict, profiles: Dict, questions: List,
                         min_interactions: int, max_interactions: int,
                         seed: int) -> List[Dict]:
    """Simulate a chronological series of question attempts for every student.

    For each student a random number of interactions between
    ``min_interactions`` and ``max_interactions`` (both inclusive) is drawn.
    Each interaction picks a random question, decides correctness from the
    student's *current* mastery, labels an incorrect answer with a random
    entry of ``ERROR_TYPES`` and then moves the tracked mastery up (correct)
    or down (incorrect).

    Args:
        students: Mapping of student id to a dict holding ``'profile_id'``
            and ``'parameters'``.
        profiles: Not read by this function; kept for interface compatibility.
        questions: Question dicts. The keys ``'id'``, ``'type'``,
            ``'difficulty_score'``, ``'concept_id'`` and ``'concept_name'``
            are read, each with a fallback default.
        min_interactions: Smallest number of interactions per student.
        max_interactions: Largest number of interactions per student.
        seed: Seed applied to both ``numpy.random`` and ``random``.

    Returns:
        One dict per interaction, in generation order. Empty when
        ``questions`` is empty. Timestamps are anchored to the wall clock, so
        they differ between runs even with the same seed.
    """
    np.random.seed(seed)
    random.seed(seed)

    interactions: List[Dict] = []

    # Nothing can be simulated without questions.
    if not questions:
        return interactions

    # Read the clock once so consecutive timestamps of a student are exactly
    # one day apart instead of drifting with execution time.
    reference_time = datetime.now()

    for student_id, student in students.items():
        # Plain int so it is safe to hand to timedelta below.
        num_interactions = int(np.random.randint(min_interactions, max_interactions + 1))

        profile_id = student['profile_id']
        student_params = student['parameters']

        # Mastery tracked over the course of the student's interactions.
        current_mastery = student_params.get('mastery_init_level', 0.5)
        learn_rate = student_params.get('learn_rate', 0.03)

        for interaction_num in range(num_interactions):
            question = random.choice(questions)
            question_id = question.get('id', f'q_{interaction_num}')
            question_type = question.get('type', 'multiple_choice')
            # Difficulty score divided by 5 (presumably a 0-5 scale; confirm
            # against the question graph schema).
            question_difficulty = question.get('difficulty_score', 2.5) / 5.0
            concept_id = question.get('concept_id', 'unknown')

            # Use the mastery tracked so far, not only the initial level;
            # otherwise the mastery updates below never influence outcomes.
            live_params = {**student_params, 'mastery_init_level': current_mastery}
            correct_prob = calculate_response_probability(live_params, question_difficulty)
            is_correct = bool(np.random.random() < correct_prob)

            response = generate_student_response(is_correct, question_type)

            # Only incorrect answers receive an error label and explanation.
            error_type = None
            error_explanation = None
            if not is_correct:
                error_type = random.choice(ERROR_TYPES)
                error_explanation = generate_error_explanation(
                    error_type,
                    question.get('concept_name', 'Conceito'),
                    profile_id
                )

            # Capture the pre-update value directly; it cannot be recovered
            # reliably from the post-update value.
            mastery_before = current_mastery
            if is_correct:
                current_mastery += (1 - current_mastery) * learn_rate
            else:
                current_mastery *= (1 - learn_rate * 0.5)

            # The earliest interaction is dated num_interactions days before
            # the reference time, the latest one day before it.
            days_ago = num_interactions - interaction_num

            interactions.append({
                'interaction_id': f'int_{len(interactions):06d}',
                'student_id': student_id,
                'question_id': question_id,
                'concept_id': concept_id,
                'question_type': question_type,
                'timestamp': (reference_time - timedelta(days=days_ago)).isoformat(),
                'response': response,
                'is_correct': is_correct,
                'error_type': error_type,
                'error_explanation': error_explanation,
                'mastery_before': round(mastery_before, 4),
                'mastery_after': round(current_mastery, 4),
                'time_spent_seconds': int(np.random.randint(15, 300))
            })

    return interactions

# Run the simulation with the configured bounds and seed (message text
# restored from mis-encoded UTF-8).
print("🔄 Gerando interações...")
interactions = generate_interactions(students, profiles, questions,
                                    MIN_INTERACTIONS_PER_STUDENT,
                                    MAX_INTERACTIONS_PER_STUDENT,
                                    SEED)
print(f"✅ {len(interactions)} interações geradas")

## Análise de Qualidade das Interações

In [None]:
def analyze_interactions_quality(interactions: List[Dict], students: Dict) -> Dict:
    """Summarise a list of generated interactions.

    Args:
        interactions: Interaction dicts; reads ``'is_correct'``,
            ``'error_type'`` and ``'student_id'`` from each.
        students: Student collection; only its size is used.

    Returns:
        A dict with total counts, overall accuracy, the number of occurrences
        of each error type, and min/max/mean interactions per student. Ratios
        and per-student stats are 0 when their denominator would be empty.
    """
    n_total = len(interactions)

    # Overall correctness.
    n_correct = len([rec for rec in interactions if rec['is_correct']])
    if n_total > 0:
        accuracy = n_correct / n_total
    else:
        accuracy = 0

    # Occurrences of each error label; entries without a label are skipped.
    errors_by_type = {}
    for rec in interactions:
        label = rec['error_type']
        if label:
            errors_by_type[label] = errors_by_type.get(label, 0) + 1

    # Number of interactions recorded for each student id.
    per_student = {}
    for rec in interactions:
        sid = rec['student_id']
        per_student[sid] = per_student.get(sid, 0) + 1

    counts = list(per_student.values())
    if per_student:
        per_student_stats = {
            'min': min(counts),
            'max': max(counts),
            'mean': np.mean(counts),
        }
    else:
        per_student_stats = {'min': 0, 'max': 0, 'mean': 0}

    if students:
        avg_per_student = n_total / len(students)
    else:
        avg_per_student = 0

    return {
        'total_interactions': n_total,
        'total_students': len(students),
        'avg_interactions_per_student': avg_per_student,
        'correct_interactions': n_correct,
        'accuracy': accuracy,
        'error_distribution': errors_by_type,
        'interactions_per_student_stats': per_student_stats,
    }

quality_analysis = analyze_interactions_quality(interactions, students)

# Human-readable report (message text restored from mis-encoded UTF-8).
print("\n📊 Análise de Qualidade das Interações:")
print("\n  Estatísticas Gerais:")
print(f"    - Total de interações: {quality_analysis['total_interactions']}")
print(f"    - Total de estudantes: {quality_analysis['total_students']}")
print(f"    - Média por estudante: {quality_analysis['avg_interactions_per_student']:.1f}")
print(f"    - Acurácia geral: {quality_analysis['accuracy']:.1%}")
print("\n  Distribuição de Erros:")
# Share of each error type among all incorrect interactions. The loop body
# only runs when at least one error exists, so the denominator is non-zero.
total_errors = quality_analysis['total_interactions'] - quality_analysis['correct_interactions']
for error_type, count in quality_analysis['error_distribution'].items():
    pct = (count / total_errors) * 100
    print(f"    - {error_type}: {count} ({pct:.1f}%)")
print("\n  Interações por Estudante:")
print(f"    - Mínimo: {quality_analysis['interactions_per_student_stats']['min']}")
print(f"    - Máximo: {quality_analysis['interactions_per_student_stats']['max']}")
print(f"    - Média: {quality_analysis['interactions_per_student_stats']['mean']:.1f}")

## Salvamento das Interações

In [None]:
# Bundle the interactions with descriptive metadata. The description text was
# mis-encoded and is restored here so the saved file carries proper UTF-8.
output_data = {
    "metadata": {
        "description": "Conjunto de interações simuladas para estudantes SINKT",
        "version": "1.0.0",
        "created_at": datetime.now().isoformat(),
        "total_interactions": len(interactions),
        "total_students": len(students),
        "avg_interactions_per_student": quality_analysis['avg_interactions_per_student'],
        "accuracy": quality_analysis['accuracy'],
        "error_types": ERROR_TYPES,
        "quality_metrics": quality_analysis
    },
    "interactions": interactions
}

# Write the JSON file, creating the target directory first if it is missing.
output_file = 'data/output/interactions.json'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps accented characters readable in the file.
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print(f"✅ Interações salvas em: {output_file}")
print(f"📦 Total de interações: {len(interactions)}")

## Resumo da Execução

In [None]:
# Final run summary (message text restored from mis-encoded UTF-8).
print("\n" + "="*70)
print("🎉 GERAÇÃO DE INTERAÇÕES CONCLUÍDA COM SUCESSO!")
print("="*70)
print("\n📁 Arquivo gerado:")
print(f"  - {output_file}")
print("\n📊 Resumo:")
print(f"  - Total de interações: {len(interactions)}")
print(f"  - Estudantes: {len(students)}")
print(f"  - Média por estudante: {quality_analysis['avg_interactions_per_student']:.1f}")
print(f"  - Acurácia: {quality_analysis['accuracy']:.1%}")
print("\n✅ Próximo passo: Execute o notebook '04_analise_metricas.ipynb'")
print("="*70)