<a href="https://colab.research.google.com/github/Deepersensor/NaijaGPT/blob/master/naijagpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: build a python notebook that attempts to train and build a text conversational model from scratch, also using data (.parquet files) in the data/ folder

# Install necessary libraries
!pip install transformers datasets evaluate rouge_score pandas pyarrow

# Import libraries
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
)

# Load data from Parquet files (adjust paths as needed)
def load_data_from_parquet(data_dir):
  """Load the train/validation Parquet splits stored in ``data_dir``.

  Args:
    data_dir: Directory (string or path-like) that contains
      ``train.parquet`` and ``validation.parquet``. A trailing path
      separator is optional.

  Returns:
    A ``DatasetDict`` keyed by split name ('train', 'validation').
  """
  # os.path.join adds the separator only when it is missing, so 'data' and
  # 'data/' both resolve to the same files (plain `+` would produce
  # 'datatrain.parquet' for the former).
  data_files = {
      'train': os.path.join(data_dir, 'train.parquet'),
      'validation': os.path.join(data_dir, 'validation.parquet'),
      # Add test data if available: 'test': os.path.join(data_dir, 'test.parquet'),
  }
  # A split -> file mapping is the documented input of
  # DatasetDict.from_parquet; Dataset.from_parquet targets a single split.
  return DatasetDict.from_parquet(data_files)

# Point `data_dir` at the folder that actually holds your Parquet files
dataset = load_data_from_parquet(data_dir='data/')

# Preprocess data (customize based on your dataset structure)
def preprocess_function(examples):
  """Tokenize a batch of input/output text pairs for seq2seq training.

  Args:
    examples: Batch mapping with 'input_text' and 'output_text' columns
      (this function is applied through ``dataset.map(..., batched=True)``).

  Returns:
    The tokenizer's encoding of the inputs, with an added "labels" entry
    holding the token ids of the target texts.
  """
  model_inputs = tokenizer(examples['input_text'], max_length=1024, truncation=True)
  # `text_target` tokenizes the targets directly; it replaces the deprecated
  # `tokenizer.as_target_tokenizer()` context manager.
  labels = tokenizer(text_target=examples['output_text'], max_length=1024, truncation=True)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

# Pre-trained sequence-to-sequence checkpoint to fine-tune
# ("t5-base" is another option)
model_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)

# Tokenize the loaded data in batches
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)

# Evaluation metric (swap for one that suits your task)
metric = evaluate.load(path="rouge")

def compute_metrics(eval_pred):
  """Compute ROUGE scores (as percentages) for generated predictions.

  Args:
    eval_pred: ``(predictions, labels)`` pair of token-id arrays handed
      over by the trainer during evaluation.

  Returns:
    Dict mapping each ROUGE key to its score multiplied by 100 and rounded
    to 4 decimals.
  """
  predictions, labels = eval_pred
  # Some models return extra outputs next to the generated ids.
  if isinstance(predictions, tuple):
    predictions = predictions[0]
  # -100 marks ignored/padded positions and is not a valid token id, so it
  # must be swapped for the pad token before decoding.
  predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  # `evaluate`'s ROUGE returns plain floats; legacy aggregate score objects
  # expose the F-measure under `.mid.fmeasure`. Accept both.
  scores = {}
  for key, value in result.items():
    fmeasure = value.mid.fmeasure if hasattr(value, "mid") else value
    scores[key] = round(float(fmeasure) * 100, 4)
  return scores

# Define training arguments
# `predict_with_generate` is only accepted by Seq2SeqTrainingArguments;
# plain TrainingArguments rejects it with a TypeError.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # Evaluate on generated token ids so compute_metrics can decode them.
    predict_with_generate=True,
    # Mixed precision requires a CUDA device; fall back to fp32 otherwise.
    # NOTE(review): T5-family checkpoints are reported to be unstable under
    # fp16 -- consider bf16 or fp32 if the loss turns into NaN.
    fp16=torch.cuda.is_available(),
)

# Initialize the Trainer
# Seq2SeqTrainer honours `predict_with_generate`; the seq2seq collator pads
# both inputs and labels per batch (labels are padded with -100 so padding is
# ignored by the loss).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the trained model
trainer.save_model("./trained_model")

# Example of how to use the trained model for inference
input_text = "How are you doing?"
# Move the inputs to whichever device the model lives on instead of assuming
# a GPU is present (a hard-coded "cuda" crashes on CPU-only runtimes).
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Bare expression so the notebook cell displays the generated reply.
decoded_output