# In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import shutil

# Load the pretrained T5-small tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Register '<start>' and '<end>' as additional special tokens
tokenizer.add_special_tokens({'additional_special_tokens': ['<start>', '<end>']})

# Added tokens get ids at the end of the vocabulary. If the tokenizer now
# holds more entries than the model's input embedding matrix has rows, grow
# the matrix so those ids can be looked up. The guard leaves a checkpoint
# whose embedding matrix is already large enough untouched.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Plain-text file holding the extra data for continued training
additional_dataset_path = "/content/drive/MyDrive/content.txt"

# Wrap that file in a TextDataset, tokenized with the tokenizer loaded above
# (block_size=128; overwrite_cache=True presumably forces the cache to be rebuilt)
additional_dataset = TextDataset(
    file_path=additional_dataset_path,
    tokenizer=tokenizer,
    overwrite_cache=True,
    block_size=128,
)

# Batch collator for language modeling with masked-LM mode switched off
# NOTE(review): with mlm=False the collator presumably uses the inputs
# themselves as labels, which would train this seq2seq model to reproduce
# its input; confirm that objective is intended.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Directory that receives the training output and the final model/tokenizer
fine_tuned_model_path = "/content/drive/MyDrive/t5-fine-tuned"

# Hyperparameters for the fine-tuning run
training_args = TrainingArguments(
    output_dir=fine_tuned_model_path,
    overwrite_output_dir=True,
    learning_rate=2e-5,
    num_train_epochs=1,  # Adjust as needed
    per_device_train_batch_size=8,
    logging_steps=500,
    save_steps=2500,
    save_total_limit=3,
)

# Wire model, arguments, dataset and collator into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=additional_dataset,
    data_collator=data_collator,
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side
model.save_pretrained(fine_tuned_model_path)
tokenizer.save_pretrained(fine_tuned_model_path)




from transformers import T5ForConditionalGeneration, T5Tokenizer

# Directory holding the saved fine-tuned model and tokenizer
fine_tuned_model_path = "/content/drive/MyDrive/t5-fine-tuned"

# Restore both from that directory
model = T5ForConditionalGeneration.from_pretrained(fine_tuned_model_path)
tokenizer = T5Tokenizer.from_pretrained(fine_tuned_model_path)

# Generation parameters:
#   max_length           - maximum number of tokens in each generated sequence
#   num_return_sequences - number of independent sequences to generate
max_length, num_return_sequences = 50, 1

# Interactive loop: read a prompt, generate a reply, print it.
# Runs until input ends (EOF) or the user interrupts (Ctrl-C / kernel stop).
while True:
    # Get user input
    try:
        prompt_text = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Finish the prompt line and leave cleanly instead of with a traceback
        print()
        break

    # Nothing to generate from on a blank line; ask again
    if not prompt_text.strip():
        continue

    # Tokenize prompt text into a tensor of token ids
    input_ids = tokenizer.encode(prompt_text, return_tensors="pt")

    # Generate text. early_stopping is deliberately not passed: it is a
    # beam-search option and no num_beams is set here, so it had no effect.
    # Generation is still bounded by max_length.
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
    )

    # Decode generated sequences, dropping special tokens from the text
    generated_texts = [
        tokenizer.decode(sequence, skip_special_tokens=True)
        for sequence in output_sequences
    ]

    # Print generated text
    for text in generated_texts:
        print(f"Model: {text}")
