In [1]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
from torch.nn.functional import softmax
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import spacy

# Pretrained Portuguese BERT checkpoint, loaded with a 5-way classification
# head (one class per grade, 0 to 4).
# NOTE: the checkpoint carries no classifier weights, so the head is newly
# initialized on load; its predictions are not meaningful until the model is
# fine-tuned on a grading task.
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=5,
)

# Universal Sentence Encoder (USE), fetched from TensorFlow Hub.
use_model = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder/4"
)

# Portuguese T5 checkpoint and its tokenizer, used for text-to-text grading.
t5_model_name = "unicamp-dl/ptt5-base-portuguese-vocab"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

# Small Portuguese spaCy pipeline for general NLP processing.
nlp = spacy.load("pt_core_news_sm")

# Grade a student answer with the BERT sequence classifier.
def avaliar_resposta_bert(resposta_aluno, respostas_referencia):
    """Return the most probable class index for a student answer.

    The student answer and the reference text are encoded together as a
    sentence pair (truncated to BERT's 512-token limit), passed through the
    module-level ``model`` with gradient tracking disabled, and the index of
    the highest softmax probability is returned. On ties the lowest index
    wins.
    """
    limite_tokens = 512  # maximum sequence length supported by BERT
    codificado = tokenizer(
        resposta_aluno,
        respostas_referencia,
        padding=True,
        truncation=True,
        max_length=limite_tokens,
        return_tensors="pt",
    )
    with torch.no_grad():
        saida = model(**codificado)
    probabilidades = softmax(saida.logits, dim=1).squeeze().tolist()
    # max() keeps the first maximal element, so ties resolve to the lowest index.
    melhor_classe, _ = max(enumerate(probabilidades), key=lambda par: par[1])
    return melhor_classe

# Grade a student answer by Universal Sentence Encoder similarity.
def avaliar_resposta_use(resposta_aluno, respostas_referencia):
    """Grade a student answer from its USE similarity to the reference.

    Both texts are embedded with the module-level ``use_model`` and compared
    with an inner product. The similarity is mapped linearly from [-1, 1]
    onto the 0-4 grade scale and rounded to the nearest integer grade.

    Args:
        resposta_aluno: Student answer text.
        respostas_referencia: Reference answer text, as a single string.

    Returns:
        Integer grade between 0 and 4 inclusive.

    Raises:
        ValueError: If the similarity is NaN and cannot become an integer.
    """
    embeddings = use_model([resposta_aluno, respostas_referencia])
    # Inner product as the similarity metric.
    # NOTE(review): assumes the embeddings are approximately unit-length so
    # the value behaves like a cosine similarity -- confirm for this model.
    sim = float(np.inner(embeddings[0], embeddings[1]))
    # Clamp floating-point drift so the grade can never leave the 0-4 range;
    # np.clip passes NaN through, so a NaN still fails loudly below.
    sim = float(np.clip(sim, -1.0, 1.0))
    # Round instead of truncating: truncation only yields grade 4 when the
    # similarity is exactly 1.0 and biases every grade downward.
    return int(round((sim + 1.0) / 2.0 * 4.0))

# Grade a student answer by prompting T5 and parsing the generated text.
def avaliar_resposta_t5(resposta_aluno, respostas_referencia):
    """Grade a student answer by asking T5 to generate a grade.

    A Portuguese prompt containing the student answer and the reference is
    tokenized (truncated to 512 tokens), a short sequence is generated with
    the module-level ``t5_model``, and the decoded text is parsed as an
    integer.

    Args:
        resposta_aluno: Student answer text.
        respostas_referencia: Reference answer text, as a single string.

    Returns:
        Integer grade between 0 and 4 inclusive. Text that is not an integer
        yields 0, so an unparseable generation is indistinguishable from a
        genuine grade of 0.
    """
    input_text = f"Avalie a resposta: {resposta_aluno} com base na referência: {respostas_referencia}"
    inputs = t5_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        # Only a few tokens are needed: the expected output is a single number.
        outputs = t5_model.generate(**inputs, max_length=5)
    nota_t5 = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    try:
        nota = int(nota_t5)
    except ValueError:
        return 0  # best-effort fallback when the model did not emit a number
    # Keep the result on the same 0-4 scale as the BERT and USE graders; the
    # model is free to emit any integer (e.g. "7" or "-1").
    return min(max(nota, 0), 4)

# Read the dataset: questions with their reference and student answers.
with open("ptbrData.json", "r", encoding="utf-8") as f:
    data = json.load(f)

output_data = []

# Grade every student answer of every question with the three models.
for item in data:
    numero_pergunta = item["number_question"]
    # All reference answers for the question are merged into one string.
    respostas_referencia = " ".join(
        resp["reference_response"] for resp in item["reference_responses"]
    )

    for resposta_aluno in item["responses_students"]:
        resposta_texto = resposta_aluno["answer_question"]
        # Dict values are evaluated top to bottom, so the original grade is
        # read before any model is invoked.
        registro = {
            "number_question": numero_pergunta,
            "answer_question": resposta_texto,
            "original_grade": resposta_aluno["grade"],
            "bert_grade": avaliar_resposta_bert(resposta_texto, respostas_referencia),
            "use_grade": avaliar_resposta_use(resposta_texto, respostas_referencia),
            "t5_grade": avaliar_resposta_t5(resposta_texto, respostas_referencia),
        }
        output_data.append(registro)

# Persist all graded records as pretty-printed UTF-8 JSON.
with open("correcao_llms.json", "w", encoding="utf-8") as f:
    json.dump(output_data, f, ensure_ascii=False, indent=4)

print("Correção concluída. Resultados salvos em 'correcao_llms.json'")





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.














spiece.model:   0%|          | 0.00/756k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Correção concluída. Resultados salvos em 'correcao_llms.json'
