In [19]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import ast


In [20]:
# Load data CSV
df = pd.read_csv("data1.csv")

# Jika kolom 'options' disimpan sebagai string (e.g. "['A', 'B', 'C', 'D']"), parse ke list:
df["options"] = df["options"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)


In [21]:
# Fungsi untuk buat input-output format
def make_input_output(row):
    opts = row["options"]
    option_labels = ['A', 'B', 'C', 'D']
    options_text = " ".join([f"{label}. {text}" for label, text in zip(option_labels, opts)])

    input_text = (
        f"Passage: {row['article']}\n"
        f"Question: {row['question']}\n"
        f"Options: {options_text}\n"
        f"Answer: {row['answer']}\n"
        f"Instruction: Berikan penjelasan dalam Bahasa Indonesia mengapa jawaban berikut benar.."
    )
    return {"input": input_text, "output": row["feedback"]}

# Terapkan ke seluruh dataframe
formatted = df.apply(make_input_output, axis=1)
dataset = Dataset.from_pandas(pd.DataFrame(formatted.tolist()))

In [22]:
# Load tokenizer dan model FLAN-T5
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")


In [23]:
# Tokenisasi
def tokenize(example):
    model_input = tokenizer(example["input"], truncation=True, padding="max_length", max_length=512)
    label = tokenizer(example["output"], truncation=True, padding="max_length", max_length=512)
    model_input["labels"] = label["input_ids"]
    return model_input

tokenized = dataset.map(tokenize)

Map:   0%|          | 0/596 [00:00<?, ? examples/s]

In [24]:
# Training args
training_args = TrainingArguments(
    output_dir="./flan-t5-toefl1",
    per_device_train_batch_size=2,
    num_train_epochs=36,
    logging_dir="./logs",
    save_total_limit=1,
    report_to=[]  # ⬅️ ini nonaktifin logging ke wandb
)


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
)

# Train model
trainer.train()

Step,Training Loss
500,1.7431
1000,0.8446
1500,0.7709
2000,0.7207
2500,0.6827
3000,0.6491
3500,0.6285
4000,0.5976
4500,0.5804
5000,0.5641


TrainOutput(global_step=10728, training_loss=0.6263203475121506, metrics={'train_runtime': 9729.6669, 'train_samples_per_second': 2.205, 'train_steps_per_second': 1.103, 'total_flos': 1.4692153754124288e+16, 'train_loss': 0.6263203475121506, 'epoch': 36.0})

In [25]:
model.save_pretrained("./flan-t5-toefl1")
tokenizer.save_pretrained("./flan-t5-toefl1")


('./flan-t5-toefl1/tokenizer_config.json',
 './flan-t5-toefl1/special_tokens_map.json',
 './flan-t5-toefl1/spiece.model',
 './flan-t5-toefl1/added_tokens.json')

In [27]:
# Load model dan tokenizer hasil fine-tuning (atau langsung pakai 'model' dari RAM)
model_path = "./flan-t5-toefl1"  # <- ganti dengan folder hasil fine-tune kamu
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Siapkan contoh input
passage = """"Do you like playing soccer? If you do, you may want to join the school soccer team. Read on and see how to join the school soccer team.
The first step is to know the information  about the team. You can find more information by asking these questions. How many students are there in the soccer team? How many new players do they need? How often do they play soccer?
The second step is to practice. Practice soccer with your friends or your family. Only good players can join the school soccer team. So you need to practice your ""skill.""
The third step is to study hard. If you always get bad grades in the exam, your teachers and parents won't let you join the soccer team.
The last step is to relax. Don't be too nervous . You can take a deep breath  and keep smiling. Believe in yourself and you can be the best player."""
question = "Which step is about studying hard?"
options = ['A. The first step.', 'B. The second step.', 'C.The third step.' 'D. The last step.']
answer = "C"

# Format prompt-nya
input_text = f"""Passage: {passage}
Question: {question}
Options: {" | ".join(options)}
Answer: {answer}
Instruction: Berikan penjelasan dalam Bahasa Indonesia mengapa jawaban berikut benar."""

# Tokenisasi
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

# Generate jawaban
output_ids = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=256,
    num_beams=4
)

# Decode hasilnya
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("🧠 Feedback dari model:\n", output_text)


🧠 Feedback dari model:
 Pilihan A:nJawaban ini kurang tepat karena tidak ada informasi bahwa tujuan utama adalah pria tua.nnPilihan B:nJawaban ini kurang tepat karena tidak ada informasi bahwa tujuan utama adalah pria tua.nnPilihan C:nJawaban ini benar karena teks menyatakan bahwa tujuan utama adalah pria tua yang mengajarkan informasi saat tim sepak bola.nnPilihan D:nJawaban ini kurang tepat karena tidak ada informasi bahwa
