In [None]:
# Cài đặt các thư viện cần thiết
%pip install transformers datasets evaluate torch accelerate
%pip install rouge_score bert_score sacrebleu
%pip install sentencepiece protobuf


In [None]:
# Import libraries
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    TrainingArguments, Trainer,
    DataCollatorForSeq2Seq
)

# Newer `datasets` releases no longer ship `load_metric` (metrics moved to the
# `evaluate` package imported below). Keep the name defined for older installs,
# but do not let its absence abort the whole imports cell.
try:
    from datasets import load_metric
except ImportError:
    load_metric = None
import evaluate

import warnings
# Silences every warning for the rest of the session (including deprecations)
warnings.filterwarnings('ignore')

# Select the compute device: CUDA GPU when available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


In [None]:
# Load the exercises from the JSON file
with open('all_exercises.json', 'r', encoding='utf-8') as f:
    exercises_data = json.load(f)

print(f"Tổng số exercises: {len(exercises_data)}")
# Preview the first record only when there is one: indexing an empty list would
# raise IndexError and stop the notebook right here.
if exercises_data:
    print(f"\nVí dụ một exercise:")
    print(json.dumps(exercises_data[0], indent=2, ensure_ascii=False))


In [None]:
# Explore the data: how the exercises are distributed by type, skill and level
df = pd.DataFrame(exercises_data)

# (heading, column) pairs, reported in this order
for heading, column in (
    ("Thống kê về types của exercises:", 'type'),
    ("\nThống kê về skills:", 'skill'),
    ("\nThống kê về levels:", 'level'),
):
    print(heading)
    print(df[column].value_counts())


In [None]:
# Build the dataset for the text-generation task:
# question + options + correct answer -> explanation

def prepare_training_data(exercises_data):
    """Turn raw exercise records into prompt/target pairs for seq2seq training.

    A record is used only when it has an 'options' key, non-None 'question' and
    'system_answer' values, and an 'explanation' that is a non-blank string.
    Anything else is skipped, because it cannot produce a usable prompt or
    target (a None/blank target would only fail later, during tokenization).

    Args:
        exercises_data: iterable of dict records. Each entry of a record's
            'options' list must provide 'key' and 'option'.

    Returns:
        list[dict]: one dict per usable record with keys 'input_text',
        'target_text', 'level' and 'skill' ('unknown' when the record does not
        carry the latter two).
    """
    required_fields = ('question', 'system_answer', 'explanation')
    training_examples = []

    for exercise in exercises_data:
        if 'options' not in exercise:
            continue
        if any(exercise.get(field) is None for field in required_fields):
            continue

        explanation = exercise['explanation']
        if not isinstance(explanation, str) or not explanation.strip():
            continue

        # Assemble the prompt line by line; joined with newlines below
        prompt_lines = [f"Question: {exercise['question']}"]

        options = exercise['options']
        if options:
            prompt_lines.append("Options:")
            prompt_lines.extend(
                f"{option['key']}: {option['option']}" for option in options
            )

        prompt_lines.append(f"Correct Answer: {exercise['system_answer']}")
        prompt_lines.append("Explain why this answer is correct:")

        training_examples.append({
            'input_text': "\n".join(prompt_lines),
            # The target is the explanation, kept exactly as stored
            'target_text': explanation,
            'level': exercise.get('level', 'unknown'),
            'skill': exercise.get('skill', 'unknown')
        })

    return training_examples

training_data = prepare_training_data(exercises_data)
print(f"Tổng số training examples: {len(training_data)}")
# Preview the first example only when one exists; an empty result would make
# training_data[0] raise IndexError. Later cells already guard on
# len(training_data) > 0, so an empty dataset is an expected state.
if training_data:
    print(f"\nVí dụ training example:")
    print(f"Input: {training_data[0]['input_text'][:200]}...")
    print(f"Target: {training_data[0]['target_text'][:100]}...")


In [None]:
# Split the data into train / validation / test sets
from sklearn.model_selection import train_test_split

# 80% train; the held-out 20% is then halved into validation and test (10% each).
# A fixed random_state keeps both splits reproducible.
train_data, temp_data = train_test_split(
    training_data,
    test_size=0.2,
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

print(
    f"Train set: {len(train_data)} examples",
    f"Validation set: {len(val_data)} examples",
    f"Test set: {len(test_data)} examples",
    sep="\n",
)


In [None]:
# Load the model and tokenizer
model_name = "google/flan-t5-small"  # Use flan-t5-small for the demo
# You can switch to "venify/googleT5" if you have access to it

# Tokenizer and model are both resolved from the same model name.
# NOTE(review): the model is not moved to `device` here; it stays wherever
# from_pretrained places it.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(f"Model loaded: {model_name}")
print(f"Model parameters: {model.num_parameters():,}")


In [None]:
# Load the evaluation metrics
def load_evaluation_metrics():
    """Load the text-generation metrics used in this notebook via `evaluate.load`.

    Loading is best effort and per metric: a metric that cannot be loaded is
    reported together with the reason and left out of the result, so one
    failure does not prevent the remaining metrics from loading.

    Returns:
        dict: maps 'bleu', 'rouge', 'bertscore' and 'meteor' to the loaded
        metric object; only metrics that loaded successfully are present.
    """
    # (key passed to evaluate.load, label used in the status messages)
    metric_labels = (
        ('bleu', 'BLEU'),
        ('rouge', 'ROUGE'),
        ('bertscore', 'BERTScore'),
        ('meteor', 'METEOR'),
    )
    metrics = {}

    for key, label in metric_labels:
        try:
            metrics[key] = evaluate.load(key)
        except Exception as exc:
            # `Exception` rather than a bare except, so a kernel interrupt
            # (KeyboardInterrupt) still stops the cell instead of being swallowed.
            print(f"✗ Failed to load {label} metric: {exc}")
        else:
            print(f"✓ {label} metric loaded")

    return metrics

# Load the metrics once; the evaluation cell passes this dict to evaluate_model
evaluation_metrics = load_evaluation_metrics()


In [None]:
# Text generation helper
def generate_text(model, tokenizer, input_text, max_length=200):
    """Generate text for a single prompt using beam search.

    Switches the model to eval mode and runs generation without gradient
    tracking. The tokenized inputs are moved to the model's device first, so
    the call works whether the model lives on CPU or on an accelerator.

    Args:
        model: model exposing `eval()` and `generate()`; its `device`
            attribute, when present, decides where the inputs are placed.
        tokenizer: callable returning 'input_ids' and 'attention_mask'
            tensors, and providing `decode()`.
        input_text: prompt string; tokenization truncates it to 512 tokens.
        max_length: forwarded to `generate` as `max_length`.

    Returns:
        str: the first generated sequence, decoded without special tokens.
    """
    model.eval()
    with torch.no_grad():
        # Tokenize input
        inputs = tokenizer(
            input_text,
            return_tensors='pt',
            truncation=True,
            max_length=512
        )
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        # Inputs must sit on the same device as the model, otherwise generate()
        # fails with a device mismatch once the model has been moved
        # (for example by Trainer during fine-tuning).
        model_device = getattr(model, 'device', None)
        if model_device is not None:
            input_ids = input_ids.to(model_device)
            attention_mask = attention_mask.to(model_device)

        # Generate (deterministic beam search, 4 beams)
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
            do_sample=False
        )

        # Decode the first returned sequence
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke-test generation on one hand-written example
test_input = "Question: What is the capital of France?\nOptions:\nA: London\nB: Paris\nC: Berlin\nD: Madrid\nCorrect Answer: B\nExplain why this answer is correct:"

generated = generate_text(model, tokenizer, test_input)

# Prompt first, then the model output, each under its own heading
print("Input:", test_input, "\nGenerated:", generated, sep="\n")


In [None]:
# Evaluate on the test set (small sample for the demo)
def evaluate_model(model, tokenizer, test_data, metrics, sample_size=20):
    """Generate predictions for a sample of examples and score them.

    Args:
        model: passed through to `generate_text`.
        tokenizer: passed through to `generate_text`.
        test_data: sequence of dicts with 'input_text' and 'target_text'.
        metrics: dict of metric objects exposing `compute()`; only the 'bleu'
            and 'rouge' entries are used here.
        sample_size: number of leading examples of `test_data` to evaluate.

    Returns:
        tuple: `(results, predictions, references)`. `results` holds 'bleu'
        (a score) and/or 'rouge' (dict with 'rouge1', 'rouge2', 'rougeL') for
        the metrics that were supplied; a metric whose computation raised is
        stored as None after the error is printed.
    """
    # Evaluate only the first `sample_size` examples to keep the run short
    sample_data = test_data[:sample_size]

    predictions = []
    references = []

    print(f"Generating predictions for {len(sample_data)} examples...")

    for i, example in enumerate(sample_data):
        if i % 5 == 0:
            print(f"Progress: {i}/{len(sample_data)}")

        predictions.append(generate_text(model, tokenizer, example['input_text']))
        references.append(example['target_text'])

    print("Calculating metrics...")
    results = {}

    # BLEU Score
    if 'bleu' in metrics:
        try:
            # The `evaluate` BLEU metric tokenizes internally and expects raw
            # strings: predictions as a list of str, references as a list of
            # lists of str (one inner list per prediction). Pre-split token
            # lists are rejected, which previously left BLEU at None.
            bleu_result = metrics['bleu'].compute(
                predictions=predictions,
                references=[[ref] for ref in references]
            )
            results['bleu'] = bleu_result['bleu']
        except Exception as e:
            print(f"Error calculating BLEU: {e}")
            results['bleu'] = None

    # ROUGE Score
    if 'rouge' in metrics:
        try:
            rouge_result = metrics['rouge'].compute(
                predictions=predictions,
                references=references
            )
            results['rouge'] = {
                'rouge1': rouge_result['rouge1'],
                'rouge2': rouge_result['rouge2'],
                'rougeL': rouge_result['rougeL']
            }
        except Exception as e:
            print(f"Error calculating ROUGE: {e}")
            results['rouge'] = None

    return results, predictions, references

# Run the evaluation on a tiny sample for the demo
if not training_data:
    print("No training data available for evaluation")
else:
    # Two hand-written examples are used for this test run
    sample_test_data = [
        {
            'input_text': "Question: What is the capital of France?\nOptions:\nA: London\nB: Paris\nC: Berlin\nD: Madrid\nCorrect Answer: B\nExplain why this answer is correct:",
            'target_text': "Paris is the capital and largest city of France. It has been the capital since the late 10th century and is the political, economic, and cultural center of the country."
        },
        {
            'input_text': "Question: Which tense is used in 'I have been studying'?\nOptions:\nA: Present Simple\nB: Present Perfect\nC: Present Perfect Continuous\nD: Past Perfect\nCorrect Answer: C\nExplain why this answer is correct:",
            'target_text': "The phrase 'I have been studying' uses the Present Perfect Continuous tense. This tense is formed with 'have/has + been + verb-ing' and indicates an action that started in the past and continues to the present moment or has recently finished with relevance to the present."
        }
    ]

    eval_results, predictions, references = evaluate_model(
        model,
        tokenizer,
        sample_test_data,
        evaluation_metrics,
        sample_size=2,
    )


In [None]:
# Display the evaluation results
def display_evaluation_results(results, predictions=None, references=None):
    """Print the metric scores and up to two reference/prediction pairs.

    Args:
        results: dict that may hold 'bleu' (a number) and 'rouge' (dict with
            'rouge1', 'rouge2', 'rougeL'); missing or None entries are skipped.
        predictions: optional list of generated texts. When omitted, the
            module-level `predictions` variable is used if it exists, which
            keeps existing single-argument calls working.
        references: optional list of reference texts, with the same fallback
            to the module-level `references` variable.
    """
    print("=" * 50)
    print("EVALUATION RESULTS")
    print("=" * 50)

    if results.get('bleu') is not None:
        print(f"📊 BLEU Score: {results['bleu']:.4f}")
        print("   - Measures precision of n-gram overlaps")
        print("   - Range: 0-1 (higher is better)")
        print()

    if results.get('rouge') is not None:
        print(f"📊 ROUGE Scores:")
        print(f"   - ROUGE-1: {results['rouge']['rouge1']:.4f}")
        print(f"   - ROUGE-2: {results['rouge']['rouge2']:.4f}")
        print(f"   - ROUGE-L: {results['rouge']['rougeL']:.4f}")
        print("   - Measures recall of important content")
        print()

    # Explicit arguments win; otherwise fall back to the notebook-level
    # variables that the evaluation cell assigns.
    if predictions is None:
        predictions = globals().get('predictions')
    if references is None:
        references = globals().get('references')

    # Show predictions next to their references
    print("📝 SAMPLE COMPARISONS")
    print("=" * 30)
    if predictions is not None and references is not None:
        # zip stops at the shorter list, so mismatched lengths cannot raise
        pairs = list(zip(references, predictions))[:2]
        for i, (reference, predicted) in enumerate(pairs):
            print(f"\nExample {i+1}:")
            print(f"Reference: {reference[:150]}...")
            print(f"Predicted: {predicted[:150]}...")
            print("-" * 50)

if 'eval_results' in globals():
    display_evaluation_results(eval_results)
else:
    print("Run evaluation first to see results")


In [None]:
# Visualise the evaluation results
def plot_evaluation_results(results):
    """Draw a bar chart of whichever BLEU / ROUGE scores are present.

    Prints a short message and returns without plotting when `results` is
    empty or carries no usable score.

    Args:
        results: dict that may hold 'bleu' (a number) and 'rouge' (dict with
            'rouge1', 'rouge2', 'rougeL'); None entries are ignored.
    """
    if not results:
        print("No results to plot")
        return

    # Gather (label, score) pairs for the metrics that were computed
    labelled_scores = []

    bleu_score = results.get('bleu')
    if bleu_score is not None:
        labelled_scores.append(('BLEU', bleu_score))

    rouge_scores = results.get('rouge')
    if rouge_scores is not None:
        labelled_scores += [
            ('ROUGE-1', rouge_scores['rouge1']),
            ('ROUGE-2', rouge_scores['rouge2']),
            ('ROUGE-L', rouge_scores['rougeL']),
        ]

    if not labelled_scores:
        print("No metrics data available for plotting")
        return

    metrics_names, scores = map(list, zip(*labelled_scores))
    # One colour per bar, taken in order
    palette = ['skyblue', 'lightcoral', 'lightgreen', 'orange', 'purple']

    # Bar chart
    plt.figure(figsize=(10, 6))
    bars = plt.bar(metrics_names, scores, color=palette[:len(metrics_names)])

    # Write each score just above its bar
    for bar, score in zip(bars, scores):
        plt.text(
            bar.get_x() + bar.get_width()/2,
            bar.get_height() + 0.01,
            f'{score:.3f}',
            ha='center', va='bottom', fontweight='bold',
        )

    plt.title('Model Evaluation Results', fontsize=16, fontweight='bold')
    plt.ylabel('Score', fontsize=12)
    plt.xlabel('Metrics', fontsize=12)
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

# Build the metrics comparison table
def create_comparison_table():
    """Print and return a reference table comparing the evaluation metrics.

    Returns:
        pandas.DataFrame: one row per metric with the columns 'Metric',
        'Purpose', 'Range', 'Best For' and 'Speed'.
    """
    columns = ['Metric', 'Purpose', 'Range', 'Best For', 'Speed']
    # One tuple per metric, in column order
    rows = [
        ('BLEU', 'Precision-based n-gram overlap', '0-1', 'Machine Translation', 'Fast'),
        ('ROUGE-1', 'Recall-based unigram overlap', '0-1', 'Text Summarization', 'Fast'),
        ('ROUGE-2', 'Recall-based bigram overlap', '0-1', 'Text Summarization', 'Fast'),
        ('ROUGE-L', 'Longest common subsequence', '0-1', 'Text Summarization', 'Fast'),
        ('BERTScore', 'Semantic similarity via BERT', '0-1', 'Any text generation', 'Slow'),
        ('METEOR', 'Harmonic mean + synonyms', '0-1', 'Machine Translation', 'Medium'),
    ]

    df_comparison = pd.DataFrame(rows, columns=columns)
    print("📊 METRICS COMPARISON TABLE")
    print("=" * 80)
    print(df_comparison.to_string(index=False))
    return df_comparison

# Call the functions
# The chart is drawn only when an earlier cell has defined eval_results
if 'eval_results' in globals():
    plot_evaluation_results(eval_results)

# The comparison table does not depend on evaluation results
create_comparison_table()


In [None]:
# Custom dataset class for training
class T5Dataset(Dataset):
    """Dataset that tokenizes input/target text pairs for seq2seq training.

    Each item is a dict of 1-D tensors: 'input_ids' and 'attention_mask' for
    the input text, and 'labels' for the target text. Both texts are truncated
    and padded to `max_length`. Padding positions in 'labels' are set to -100
    so they are ignored by the training loss.

    Args:
        data: sequence of dicts with 'input_text' and 'target_text'.
        tokenizer: callable tokenizer returning 'input_ids' and
            'attention_mask' tensors; its `pad_token_id`, when set, identifies
            the padding positions of the labels.
        max_length: fixed sequence length used for inputs and targets.
    """

    # Label value that the loss skips (the default ignore index)
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize one text, truncated and padded to `max_length`."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        item = self.data[idx]

        input_encoding = self._encode(item['input_text'])
        target_encoding = self._encode(item['target_text'])

        # Clone before editing so the tokenizer's own output is left untouched
        labels = target_encoding['input_ids'].flatten().clone()
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            # Without this the model is also trained to predict padding tokens
            labels[labels == pad_token_id] = self.IGNORE_INDEX

        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': labels
        }

# Set up the training arguments
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old keyword. Use whichever name the installed
# TrainingArguments accepts so this cell runs on both old and new versions.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Kept small to avoid out-of-memory errors
    per_device_eval_batch_size=2,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    eval_steps=200,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False,
    **{_eval_strategy_key: "steps"}
)

print("Training setup ready!")
print("Uncomment the training code below to start training")


In [None]:
# Training code (UNCOMMENT to train)
# The whole training script below sits inside a triple-quoted string, so it is
# evaluated as an unused string expression and none of it runs. Remove the
# surrounding triple quotes to enable it.
# NOTE(review): the disabled code passes `tokenizer=` to Trainer; confirm that
# the installed transformers version still accepts that argument before enabling.
"""
# Tạo datasets cho training
if len(training_data) > 0:
    train_dataset = T5Dataset(train_data, tokenizer)
    val_dataset = T5Dataset(val_data, tokenizer)
    
    # Data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    
    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer
    )
    
    print("Starting training...")
    trainer.train()
    print("Training completed!")
    
    # Save the model
    trainer.save_model("./fine_tuned_t5")
    print("Model saved to ./fine_tuned_t5")
else:
    print("No training data available")
"""

# Status messages shown while the training code above stays disabled
print("Training code is commented out")
print("Uncomment the code above to start training")
print("⚠️ Warning: Training may take several hours and requires GPU")


In [None]:
# Build the evaluation report
def create_evaluation_report(results):
    """Assemble the Markdown evaluation report as a single string.

    The report starts with a header naming the module-level `model_name` and
    the current timestamp, lists the BLEU / ROUGE scores that are present in
    `results`, and ends with a fixed interpretation guide and recommendations.

    Args:
        results: dict that may hold 'bleu' (a number) and 'rouge' (dict with
            'rouge1', 'rouge2', 'rougeL'); missing or None entries are omitted.

    Returns:
        str: the complete report text.
    """
    header = f"""
# T5 Model Evaluation Report

## Model Information
- Model: {model_name}
- Evaluation Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
- Purpose: English Learning Exercise Explanation Generation

## Metrics Results
"""
    # Pieces are collected in order and concatenated once at the end
    sections = [header]

    if results.get('bleu') is not None:
        sections.append(f"\n- BLEU Score: {results['bleu']:.4f}")

    if results.get('rouge') is not None:
        sections.append(f"""
- ROUGE-1: {results['rouge']['rouge1']:.4f}
- ROUGE-2: {results['rouge']['rouge2']:.4f}
- ROUGE-L: {results['rouge']['rougeL']:.4f}""")

    sections.append("""

## Interpretation Guide
- **BLEU**: 0.3+ is good, 0.5+ is excellent for translation
- **ROUGE-1**: 0.4+ is good for summarization
- **ROUGE-2**: 0.2+ is good for summarization
- **ROUGE-L**: 0.3+ is good for summarization

## Recommendations
Based on the evaluation results:

1. Consider the semantic metrics (BERTScore) alongside traditional metrics
2. Monitor text quality through human evaluation
3. Use multiple metrics for comprehensive evaluation
4. Regular re-evaluation with fresh data
5. Focus on educational appropriateness for English learning

## venify/googleT5 vs Standard T5
- venify/googleT5 is likely fine-tuned for educational content
- Should show better performance on English learning tasks
- May have specialized vocabulary for educational contexts
""")

    return "".join(sections)

# Create the report, print it and save it when evaluation results exist
if 'eval_results' not in globals() or not eval_results:
    print("No evaluation results available. Run the evaluation first.")
else:
    evaluation_report = create_evaluation_report(eval_results)
    print(evaluation_report)

    # Write the report to a file in the current working directory
    with open('t5_evaluation_report.md', 'w', encoding='utf-8') as f:
        f.write(evaluation_report)

    print("\n✅ Evaluation report saved to 't5_evaluation_report.md'")
