In [None]:
!pip install transformers datasets scikit-learn nltk



In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=db673ece42061c9edd374f4e9f3d3d920edc16351399ef032f933327fa941818
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
pip install sacrebleu rouge-score


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


In [None]:
import os
import pandas as pd
import nltk
import string
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    AutoTokenizer,
    BartForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
import torch
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer

# Disable W&B tracking
os.environ['WANDB_DISABLED'] = 'true'

# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load and clean data
df = pd.read_csv("/content/AI.csv")
df.dropna(subset=["Question", "Answer"], inplace=True)

# NLP tools setup
stop_words = set(stopwords.words('english'))

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess(text: str) -> str:
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return ' '.join(tokens)

# Metrics function
def compute_metrics(preds, refs):
    bleu = corpus_bleu(preds, [refs])
    bleu_score = bleu.score

    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
    rouge1 = rouge2 = rougeL = 0.0
    for p, r in zip(preds, refs):
        scores = scorer.score(r, p)
        rouge1 += scores['rouge1'].fmeasure
        rouge2 += scores['rouge2'].fmeasure
        rougeL  += scores['rougeL'].fmeasure
    n = len(preds)
    return {
        'bleu': bleu_score,
        'rouge1': rouge1 / n,
        'rouge2': rouge2 / n,
        'rougeL': rougeL / n
    }

# Prepare text columns
df['input_text'] = df['Question'].apply(preprocess)
df['target_text'] = df['Answer']

# TF-IDF setup
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['input_text'])

# Split data
train_df = df.sample(frac=0.9, random_state=42)
test_df  = df.drop(train_df.index)
train_ds = Dataset.from_pandas(train_df[['input_text','target_text']])
test_ds  = Dataset.from_pandas(test_df[['input_text','target_text']])

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
model     = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_func(examples):
    inputs = tokenizer(
        examples['input_text'],
        truncation=True,
        padding='max_length',
        max_length=128
    )
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(
            examples['target_text'],
            truncation=True,
            padding='max_length',
            max_length=128
        )
    inputs['labels'] = targets['input_ids']
    return inputs

train_ds = train_ds.map(tokenize_func, batched=True, remove_columns=['input_text','target_text'])
test_ds  = test_ds.map(tokenize_func,  batched=True, remove_columns=['input_text','target_text'])

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    run_name='bart_base_finetune',
    report_to=['none'],
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=50,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss'
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    tokenizer=tokenizer
)

def generate_answer(question: str, threshold: float = 0.4):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    q_clean = preprocess(question)
    q_vec   = tfidf_vectorizer.transform([q_clean])
    sims    = cosine_similarity(q_vec, X_tfidf)[0]
    idx     = sims.argmax()
    max_sim = sims[idx]

    if max_sim >= threshold:
        ans  = df.loc[idx, 'Answer']
        expl = f"Maximum similarity = {max_sim:.2f} ≥ {threshold}. Answer retrieved from data."
    else:
        inputs = tokenizer(question, return_tensors='pt').to(device)
        outputs = model.generate(
            **inputs,
            max_length=100,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=3,
            num_return_sequences=1
        )
        ans  = tokenizer.decode(outputs[0], skip_special_tokens=True)
        expl = f"Maximum similarity = {max_sim:.2f} < {threshold}. Answer generated by BART-base."
        if ans.strip().lower() == question.strip().lower():
            expl += " ⚠️ The generated answer is identical to the question — likely poor generation."
    return ans, expl

if __name__ == '__main__':
    print("Starting fine-tuning BART-base...")
    train_result = trainer.train()
    print(f"Training loss: {train_result.training_loss:.4f}")

    print("Evaluating model...")
    eval_results = trainer.evaluate()
    print(f"Validation loss: {eval_results['eval_loss']:.4f}")

    print("Computing BLEU and ROUGE on the test set...")
    preds, refs = [], []
    for q, true_a in zip(test_df['Question'], test_df['Answer']):
        gen_a, _ = generate_answer(q, threshold=0.0)
        preds.append(gen_a)
        refs.append(true_a)

    metrics = compute_metrics(preds, refs)
    print(f"BLEU score: {metrics['bleu']:.2f}")
    print(f"ROUGE-1: {metrics['rouge1']:.2f}, ROUGE-2: {metrics['rouge2']:.2f}, ROUGE-L: {metrics['rougeL']:.2f}")

    user_q = input("Enter your question: ")
    ans, expl = generate_answer(user_q)
    print("Question:", user_q)
    print("Answer:", ans)
    print("Explanation:", expl)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Map:   0%|          | 0/453 [00:00<?, ? examples/s]



Map:   0%|          | 0/50 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Starting fine-tuning BART-base...


Epoch,Training Loss,Validation Loss
1,0.5831,0.413691
2,0.4185,0.379291
3,0.3597,0.369081
4,0.3206,0.366687
5,0.2985,0.366465


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Training loss: 0.3811
Evaluating model...


Validation loss: 0.3665
Computing BLEU and ROUGE on the test set...
BLEU score: 90.44
ROUGE-1: 0.96, ROUGE-2: 0.87, ROUGE-L: 0.96
Enter your question: What sources was drawn on the formation of AI?
Question: What sources was drawn on the formation of AI?
Answer: knowledge of the basic physiology and function of neurons in the brain; a formal analysis of propositional logic due to Russell and Whitehead; and Turing's theory of computation.

Explanation: Maximum similarity = 0.73 ≥ 0.4. Answer retrieved from data.
