<a href="https://colab.research.google.com/github/Dhanya-Zac/Multilingual-LLM-hallucination-test/blob/main/trivia_tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import random
import datasets
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, GenerationConfig
import csv


In [None]:
from google.colab import files

In [None]:
!pip install safetensors




In [None]:
!pip install -q transformers datasets accelerate

In [None]:

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:


class KnowledgeDataset:
    """Build a filtered question/answer dataset and save it as JSONL and CSV.

    Constructing an instance does all the work: it seeds the random number
    generators, loads the tokenizer and causal LM for ``model_name``, builds
    the filtered ``[prompt, answer]`` examples for ``dataset_name`` and writes
    them under ``path_to_knowledge_dataset``.

    Instance attributes set by ``__init__``: ``model_name``, ``dataset_name``,
    ``tok`` (tokenizer) and ``model`` (causal LM).
    """

    def __init__(self, model_name="deepseek-ai/deepseek-llm-7b-base",
                 path_to_knowledge_dataset="datasets/", dataset_name="triviaqa"):
        """Load the tokenizer/model, build the filtered dataset and save it.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained`` for the
                tokenizer, the model and the generation config.
            path_to_knowledge_dataset: Directory the JSONL/CSV files are
                written to (created if missing).
            dataset_name: ``"triviaqa"`` selects the TriviaQA loader; any other
                value is routed to the Natural Questions loader.

        Raises:
            NotImplementedError: If ``dataset_name`` is not ``"triviaqa"``
                (the Natural Questions loader is only a placeholder).
        """
        set_seed(42)
        torch.manual_seed(42)
        # The final subsampling uses random.sample (Python's global RNG), so
        # seed it explicitly here as well to keep the sampled subset repeatable.
        random.seed(42)

        self.model_name = model_name
        self.dataset_name = dataset_name

        # Load the tokenizer from the same checkpoint as the model so that a
        # non-default model_name does not end up paired with a mismatched
        # tokenizer (it was previously hard-coded to the DeepSeek checkpoint).
        self.tok = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)

        # NOTE(review): nothing in this class reads self.model; it is still
        # loaded because code outside this class may rely on the attribute.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            offload_folder="./offload"
        )
        self.model.generation_config = GenerationConfig.from_pretrained(self.model_name)
        # Reuse EOS as the padding token for generation and for the tokenizer.
        self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id

        self.tok.padding_side = "left"
        self.tok.pad_token = self.tok.eos_token

        # Load dataset
        if self.dataset_name == "triviaqa":
            initial_dataset = self.create_initial_dataset_for_trivia_qa()
        else:  # natural questions
            initial_dataset = self.create_initial_dataset_for_natural_questions()

        # --- Filter and save dataset ---
        self.create_knowledge_dataset(initial_dataset, path_to_knowledge_dataset)

    # --- Initial dataset loader for TriviaQA ---
    def create_initial_dataset_for_trivia_qa(self, max_examples=70000, max_answer_tokens=5):
        """Load the TriviaQA ``rc`` train split, filter it and subsample it.

        Args:
            max_examples: Upper bound on the number of examples returned; a
                random sample of this size is drawn when more rows survive.
            max_answer_tokens: Maximum length of the tokenized answer.

        Returns:
            A list of ``[prompt, answer]`` pairs in random order.
        """
        print("Creating initial dataset for TriviaQA...")
        dataset = datasets.load_dataset("trivia_qa", 'rc')
        data = self._filter_trivia_rows(dataset["train"], max_answer_tokens)

        # Randomly sample up to max_examples examples
        data = random.sample(data, min(max_examples, len(data)))
        print(f"Finished creating initial dataset with {len(data)} examples")
        return data

    def _filter_trivia_rows(self, rows, max_answer_tokens=5):
        """Turn raw rows into unique ``[prompt, answer]`` pairs that pass the filters.

        Each row is read as ``row["question"]`` and ``row["answer"]["value"]``.
        """
        data = []
        # Set of prompts already accepted: O(1) membership instead of
        # rebuilding a list of all previous prompts for every row.
        seen_prompts = set()

        for row in rows:
            prompt = "question: " + row["question"] + "\nanswer:"
            answer = row["answer"]["value"]

            # --- Normalization / filtering rules ---
            # 1. Normalize (not drop) all-uppercase answers: keep the first
            #    character and lowercase the rest.
            if answer.isupper() and len(answer) > 1:
                answer = answer[0] + answer[1:].lower()
            # 2. Must not contain "." or "/"
            if "." in answer or "/" in answer:
                continue
            # 3. Must not contain digits
            if any(char.isdigit() for char in answer):
                continue
            # 4. Tokenized answer must not exceed max_answer_tokens.
            # NOTE(review): the count is len(input_ids) as returned by the
            # tokenizer call, so any special tokens it adds by default count
            # toward the budget — confirm this is intended.
            tokens = self.tok(answer)["input_ids"]
            if len(tokens) > max_answer_tokens:
                continue
            # 5. Prompt must be unique (first occurrence wins)
            if prompt in seen_prompts:
                continue

            seen_prompts.add(prompt)
            data.append([prompt, answer])

        return data

    # --- Placeholder for Natural Questions dataset ---
    def create_initial_dataset_for_natural_questions(self):
        """Placeholder loader; always raises until a real loader is added.

        Defined so the non-TriviaQA branch in ``__init__`` fails with a clear
        message instead of an ``AttributeError`` on a missing method.
        """
        raise NotImplementedError("Add your Natural Questions loader here if needed.")

    # --- Save dataset to JSONL and CSV ---
    def create_knowledge_dataset(self, data, path):
        """Write ``data`` to ``<dataset_name>_filtered.jsonl`` and ``.csv`` in ``path``.

        Args:
            data: Iterable of ``[prompt, answer]`` pairs.
            path: Output directory, with or without a trailing separator. An
                empty string means the current working directory.
        """
        # os.makedirs("") raises, so only create the directory when one is given.
        if path:
            os.makedirs(path, exist_ok=True)

        # JSONL save: one {"prompt", "answer"} object per line.
        # os.path.join keeps the file inside `path` even without a trailing slash.
        out_path_jsonl = os.path.join(path, f"{self.dataset_name}_filtered.jsonl")
        with open(out_path_jsonl, "w", encoding="utf-8") as f:
            for d in data:
                json.dump({"prompt": d[0], "answer": d[1]}, f)
                f.write("\n")

        # CSV save: header row followed by one row per example.
        out_path_csv = os.path.join(path, f"{self.dataset_name}_filtered.csv")
        with open(out_path_csv, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["prompt", "answer"])
            for d in data:
                writer.writerow([d[0], d[1]])

        print(f"Knowledge dataset saved at:\n- {out_path_jsonl}\n- {out_path_csv}")


In [None]:
# Build and save the filtered TriviaQA knowledge dataset. The arguments spell
# out the constructor's defaults so the model, output directory and dataset
# used for this run are visible at the call site.
knowledge_dataset = KnowledgeDataset(
    model_name="deepseek-ai/deepseek-llm-7b-base",
    path_to_knowledge_dataset="datasets/",
    dataset_name="triviaqa",
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Creating initial dataset for TriviaQA...


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/26 [00:00<?, ?files/s]

train-00000-of-00026.parquet:   0%|          | 0.00/308M [00:00<?, ?B/s]

train-00001-of-00026.parquet:   0%|          | 0.00/298M [00:00<?, ?B/s]

train-00002-of-00026.parquet:   0%|          | 0.00/290M [00:00<?, ?B/s]

train-00003-of-00026.parquet:   0%|          | 0.00/444M [00:00<?, ?B/s]

train-00004-of-00026.parquet:   0%|          | 0.00/461M [00:00<?, ?B/s]

train-00005-of-00026.parquet:   0%|          | 0.00/474M [00:00<?, ?B/s]

train-00006-of-00026.parquet:   0%|          | 0.00/404M [00:00<?, ?B/s]

train-00007-of-00026.parquet:   0%|          | 0.00/324M [00:00<?, ?B/s]

train-00008-of-00026.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

train-00009-of-00026.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

train-00010-of-00026.parquet:   0%|          | 0.00/400M [00:00<?, ?B/s]

train-00011-of-00026.parquet:   0%|          | 0.00/370M [00:00<?, ?B/s]

train-00012-of-00026.parquet:   0%|          | 0.00/341M [00:00<?, ?B/s]

train-00013-of-00026.parquet:   0%|          | 0.00/327M [00:00<?, ?B/s]

train-00014-of-00026.parquet:   0%|          | 0.00/310M [00:00<?, ?B/s]

train-00015-of-00026.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

train-00016-of-00026.parquet:   0%|          | 0.00/136M [00:00<?, ?B/s]

train-00017-of-00026.parquet:   0%|          | 0.00/159M [00:00<?, ?B/s]

train-00018-of-00026.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00019-of-00026.parquet:   0%|          | 0.00/180M [00:00<?, ?B/s]

train-00020-of-00026.parquet:   0%|          | 0.00/150M [00:00<?, ?B/s]

train-00021-of-00026.parquet:   0%|          | 0.00/153M [00:00<?, ?B/s]

train-00022-of-00026.parquet:   0%|          | 0.00/147M [00:00<?, ?B/s]

train-00023-of-00026.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

train-00024-of-00026.parquet:   0%|          | 0.00/154M [00:00<?, ?B/s]

train-00025-of-00026.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

validation-00000-of-00004.parquet:   0%|          | 0.00/327M [00:00<?, ?B/s]

validation-00001-of-00004.parquet:   0%|          | 0.00/296M [00:00<?, ?B/s]

validation-00002-of-00004.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

validation-00003-of-00004.parquet:   0%|          | 0.00/129M [00:00<?, ?B/s]

test-00000-of-00004.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

test-00001-of-00004.parquet:   0%|          | 0.00/288M [00:00<?, ?B/s]

test-00002-of-00004.parquet:   0%|          | 0.00/171M [00:00<?, ?B/s]

test-00003-of-00004.parquet:   0%|          | 0.00/128M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/138384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17944 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17210 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

Finished creating initial dataset with 64328 examples
Knowledge dataset saved at:
- datasets/triviaqa_filtered.jsonl
- datasets/triviaqa_filtered.csv


In [None]:
# Hand the filtered CSV written by the previous cell to the Colab download helper.
filtered_csv_path = "datasets/triviaqa_filtered.csv"
files.download(filtered_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>