In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='tqdm')


In [4]:
import json,glob,os
import pandas as pd
from datasets import Dataset
from collections import defaultdict


In [5]:
train_path=r"C:\Users\thaku\VS_code\Langchain\Fact_check\data\Fever\train.jsonl"
val_path=r"C:\Users\thaku\VS_code\Langchain\Fact_check\data\Fever\shared_task_dev.jsonl"

In [6]:
def load_json(path):
    with open(path,"r",encoding="utf-8") as f:
        return [json.loads(line) for line in f]
    
train_data=load_json(train_path)
print(f"loaded {len(train_data)} claims")

print(json.dumps(train_data[0],indent=2))

loaded 145449 claims
{
  "id": 75397,
  "verifiable": "VERIFIABLE",
  "label": "SUPPORTS",
  "claim": "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.",
  "evidence": [
    [
      [
        92206,
        104971,
        "Nikolaj_Coster-Waldau",
        7
      ],
      [
        92206,
        104971,
        "Fox_Broadcasting_Company",
        0
      ]
    ]
  ]
}


In [7]:
from collections import Counter
labels=[d["label"] for d in train_data]
print(Counter(labels))

Counter({'SUPPORTS': 80035, 'NOT ENOUGH INFO': 35639, 'REFUTES': 29775})


In [8]:
from datasets import Features,Value,Sequence
features=Features({
    "id":Value("int64"),
    "verifiable":Value("string"),
    "label":Value("string"),
    "claim": Value("string"),
    "evidence":Sequence(Sequence(Sequence(Value("string"))))
})

import ast

for ex in train_data:
    new_evidence = []
    for ele in ex["evidence"]:          # ele = list of evidence lines
        converted_group = []
        for group in ele:               # group = may already be a list OR a string like "['92206', ...]"
            # if group is a string (stringified list), parse it safely
            if isinstance(group, str):
                try:
                    group = ast.literal_eval(group)
                except Exception:
                    group = [group]
            # ensure all elements are strings
            converted_line = [str(item) for item in group]
            converted_group.append(converted_line)
        new_evidence.append(converted_group)
    ex["evidence"] = new_evidence


In [9]:
val_data=load_json(val_path)

for ex in val_data:
    new_evidence = []
    for ele in ex["evidence"]:          # ele = list of evidence lines
        converted_group = []
        for group in ele:               # group = may already be a list OR a string like "['92206', ...]"
            # if group is a string (stringified list), parse it safely
            if isinstance(group, str):
                try:
                    group = ast.literal_eval(group)
                except Exception:
                    group = [group]
            # ensure all elements are strings
            converted_line = [str(item) for item in group]
            converted_group.append(converted_line)
        new_evidence.append(converted_group)
    ex["evidence"] = new_evidence

In [10]:

train_ds=Dataset.from_list(train_data,features=features)
val_ds=Dataset.from_list(val_data)

In [11]:
train_ds[0]["evidence"]

[[['92206', '104971', 'Nikolaj_Coster-Waldau', '7'],
  ['92206', '104971', 'Fox_Broadcasting_Company', '0']]]

In [12]:
wiki_path=r"C:\Users\thaku\VS_code\Langchain\Fact_check\data\Fever\wiki-pages"

In [13]:
from tqdm import tqdm
def load_fever_wiki(wiki_dir):
    """
    Load all wiki-pages-XXX.jsonl files and build:
      wiki_lookup[page_title][line_index] = sentence
      wiki_text[page_title] = full text
    """
    wiki_lookup = {}
    wiki_text = {}
    files = [f for f in os.listdir(wiki_dir) if f.endswith(".jsonl")]
    for file in tqdm(sorted(files), desc="Loading Wikipedia pages"):
        with open(os.path.join(wiki_dir, file), "r", encoding="utf8") as f:
            for line in f:
                try:
                    page = json.loads(line)
                    page_id = page["id"]
                    page_lines = {}
                    for line_item in page["lines"].split("\n"):
                        if not line_item.strip():
                            continue
                        parts = line_item.split("\t", 1)
                        if len(parts) == 2:
                            idx, sent = parts
                            if sent.strip():
                                page_lines[int(idx)] = sent.strip()
                    if page_lines:
                        wiki_lookup[page_id] = page_lines
                    if page.get("text"):
                        wiki_text[page_id] = page["text"]
                except Exception as e:
                    continue
    print(f"Loaded {len(wiki_lookup):,} Wikipedia pages.")
    return wiki_lookup, wiki_text

In [14]:
wiki_lookup, wiki_text=load_fever_wiki(wiki_path)

Loading Wikipedia pages: 100%|██████████| 109/109 [04:15<00:00,  2.35s/it]

Loaded 5,395,683 Wikipedia pages.





In [15]:
import re
from tqdm import tqdm

def clean_wiki_text(text):
    """Clean a single sentence or text block from FEVER wiki dump artifacts."""
    if not text or not isinstance(text, str):
        return ""

    # 1️⃣ Remove tab-separated parts (entity links)
    text = text.split("\t")[0]

    # 2️⃣ Replace bracket placeholders
    text = text.replace("-LRB-", "(").replace("-RRB-", ")")

    # 3️⃣ Replace double dashes with em-dash or space
    text = text.replace("--", "–")

    # 4️⃣ Normalize spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


def clean_wiki_data(wiki_lookup, wiki_text):
    """Clean all sentences in wiki_lookup and full page text in wiki_text."""
    clean_lookup = {}
    clean_text = {}

    print("🧹 Cleaning wiki_lookup sentences...")
    for page, lines in tqdm(wiki_lookup.items()):
        cleaned_lines = {}
        for idx, sent in lines.items():
            cleaned_lines[idx] = clean_wiki_text(sent)
        clean_lookup[page] = cleaned_lines

    print("🧹 Cleaning wiki_text pages...")
    for page, txt in tqdm(wiki_text.items()):
        clean_text[page] = clean_wiki_text(txt)

    return clean_lookup, clean_text
wiki_lookup, wiki_text = clean_wiki_data(wiki_lookup, wiki_text)


🧹 Cleaning wiki_lookup sentences...


  0%|          | 0/5395683 [00:00<?, ?it/s]

100%|██████████| 5395683/5395683 [09:28<00:00, 9496.56it/s] 


🧹 Cleaning wiki_text pages...


100%|██████████| 5395683/5395683 [09:55<00:00, 9056.74it/s] 


In [16]:
wiki_text["The_Hunger_Games_-LRB-film-RRB-"]

"The Hunger Games is a 2012 American dystopian science fiction adventure film directed by Gary Ross and based on the novel of the same name by Suzanne Collins . It is the first installment in The Hunger Games film series and was produced by Nina Jacobson and Jon Kilik , with a screenplay by Ross , Collins , and Billy Ray . The film stars Jennifer Lawrence , Josh Hutcherson , Liam Hemsworth , Woody Harrelson , Elizabeth Banks , Lenny Kravitz , Stanley Tucci , and Donald Sutherland . The story takes place in a dystopian post-apocalyptic future in the nation of Panem , where boys and girls between the ages of 12 and 18 must take part in the Hunger Games , a televised annual event in which the `` tributes '' are required to fight to the death until there is only one survivor . Katniss Everdeen ( Lawrence ) volunteers to take her younger sister 's place . Joined by her district 's male tribute , Peeta Mellark ( Hutcherson ) , Katniss travels to the Capitol to train for the Hunger Games unde

In [17]:
wiki_lookup["The_Hunger_Games_-LRB-film-RRB-"]

{0: 'The Hunger Games is a 2012 American dystopian science fiction adventure film directed by Gary Ross and based on the novel of the same name by Suzanne Collins .',
 1: 'It is the first installment in The Hunger Games film series and was produced by Nina Jacobson and Jon Kilik , with a screenplay by Ross , Collins , and Billy Ray .',
 2: 'The film stars Jennifer Lawrence , Josh Hutcherson , Liam Hemsworth , Woody Harrelson , Elizabeth Banks , Lenny Kravitz , Stanley Tucci , and Donald Sutherland .',
 3: "The story takes place in a dystopian post-apocalyptic future in the nation of Panem , where boys and girls between the ages of 12 and 18 must take part in the Hunger Games , a televised annual event in which the `` tributes '' are required to fight to the death until there is only one survivor .",
 4: "Katniss Everdeen ( Lawrence ) volunteers to take her younger sister 's place .",
 5: "Joined by her district 's male tribute , Peeta Mellark ( Hutcherson ) , Katniss travels to the Cap

In [18]:
for l in train_data[12]["evidence"]:
    for group in l:
        print(group[2])

The_Hunger_Games_-LRB-film-RRB-
The_Hunger_Games_-LRB-film-RRB-
The_Hunger_Games_-LRB-film-RRB-
The_Hunger_Games_-LRB-film-RRB-


In [19]:
train_data[290]

{'id': 198918,
 'verifiable': 'VERIFIABLE',
 'label': 'SUPPORTS',
 'claim': 'International Relations includes communication.',
 'evidence': [[['233882', '236772', 'International_relations', '9']]]}

In [None]:
for i in range(len(train_data)):
    # first extract topic name and correponding row number to look inside wiki lookup

    topics=defaultdict(set)
    for l in train_data[i]["evidence"]:
        for group in l:
            if group[2]!=None:
                if group[3]!=None:
                    topics[group[2]].add(int(group[3]))
                else:
                    topics[group[2]].add(-1)
    evidence=[]
    for topic in topics:
        for line_number in topics[topic]:
            evidence.append(wiki_lookup[topic][line_number])
    

In [20]:
topics=defaultdict(set)
for l in train_data[290]["evidence"]:
    for group in l:
        if group[2]!=None:
            if group[3]!=None:
                topics[group[2]].add(int(group[3]))
            else:
                topics[group[2]].add(-1)
evidence=[]
for topic in topics:
    for line_number in topics[topic]:
        evidence.append(wiki_lookup[topic][line_number])
print(evidence)

['For example , international relations draws from the fields of : technology and engineering , economics , communication studies , history , international law , demography , philosophy , geography , social work , sociology , anthropology , criminology , psychology , gender studies , cultural studies , culturology , and diplomacy .']


In [21]:
from tqdm import tqdm
from collections import Counter

def evaluate_sentence_page_alignment_no_norm(dataset, wiki_lookup):
    """
    Evaluate FEVER dataset coverage using exact page_id matching (no normalization).
    Handles int, str, or None sentence indices.
    """

    stats = []
    total = len(dataset)

    for ex in tqdm(dataset, desc="Evaluating coverage (no normalization)"):
        has_page = False
        has_sentence = False
        has_page_no_sentence = False

        for group in ex.get("evidence", []):
            for line in group:
                if len(line) < 4:
                    continue

                page = line[2]       # exact FEVER page_id
                sent_idx = line[3]   # may be int, str, or None

                if not page:
                    continue

                # ✅ Page exists?
                if page in wiki_lookup:
                    has_page = True

                    # ✅ Sentence index given?
                    if sent_idx is None:
                        # Page cited but no specific sentence reference
                        has_page_no_sentence = True
                        continue

                    # ✅ Try to convert to integer safely
                    try:
                        sent_idx_int = int(sent_idx)
                    except (ValueError, TypeError):
                        sent_idx_int = None

                    # ✅ Sentence index exists?
                    if sent_idx_int is not None and sent_idx_int in wiki_lookup[page]:
                        has_sentence = True
                    else:
                        has_page_no_sentence = True

        stats.append({
            "id": ex.get("id"),
            "has_page": has_page,
            "has_sentence": has_sentence,
            "has_page_no_sentence": has_page_no_sentence
        })

    # Aggregate results
    c = Counter()
    for s in stats:
        if s["has_page"]:
            c["page"] += 1
        if s["has_sentence"]:
            c["sentence"] += 1
        if s["has_page_no_sentence"]:
            c["page_but_no_sentence"] += 1
        if not s["has_page"]:
            c["no_page"] += 1

    coverage_summary = {
        "total_examples": total,
        "page_exists_%": round(100 * c["page"] / total, 2),
        "sentence_exists_%": round(100 * c["sentence"] / total, 2),
        "page_but_no_sentence_%": round(100 * c["page_but_no_sentence"] / total, 2),
        "no_page_%": round(100 * c["no_page"] / total, 2)
    }

    return coverage_summary, stats


In [22]:
coverage_summary, detailed_stats = evaluate_sentence_page_alignment_no_norm(train_data, wiki_lookup)

print("\n📊 Coverage Summary (handles None, str, int):")
for k, v in coverage_summary.items():
    print(f"{k:30s}: {v}")


Evaluating coverage (no normalization):   0%|          | 0/145449 [00:00<?, ?it/s]

Evaluating coverage (no normalization): 100%|██████████| 145449/145449 [00:12<00:00, 11847.99it/s]



📊 Coverage Summary (handles None, str, int):
total_examples                : 145449
page_exists_%                 : 99.54
sentence_exists_%             : 75.03
page_but_no_sentence_%        : 24.5
no_page_%                     : 0.46


In [23]:
from collections import defaultdict

def extract_sentence_and_page_evidence(example, wiki_lookup, wiki_text):
    """
    Extract both sentence-level and page-level evidence for one FEVER example.
    Returns two strings: (sentence_evidence, page_evidence)
    """

    topics = defaultdict(set)
    for group in example.get("evidence", []):
        for line in group:
            if len(line) >= 4:
                page = line[2]
                sent_idx = line[3]
                if page is not None:
                    if sent_idx is not None:
                        try:
                            topics[page].add(int(sent_idx))
                        except ValueError:
                            topics[page].add(-1)
                    else:
                        topics[page].add(-1)

    sentence_evidence = []
    page_evidence = []

    for topic, line_nums in topics.items():
        # sentence evidence: specific lines
        if topic in wiki_lookup:
            for line_number in line_nums:
                if line_number >= 0 and line_number in wiki_lookup[topic]:
                    sentence_evidence.append(wiki_lookup[topic][line_number])

        # page evidence: entire text when -1 present
        if topic in wiki_text:
            page_evidence.append(wiki_text[topic])

    sentence_evidence_text = " ".join(sentence_evidence)
    page_evidence_text = " ".join(page_evidence)

    return sentence_evidence_text, page_evidence_text


for ex in train_data:
    sent_evi, page_evi = extract_sentence_and_page_evidence(ex, wiki_lookup, wiki_text)
    ex["sentence_evidence"] = sent_evi
    ex["page_evidence"] = page_evi
    # optional combined field
    # ex["evidence_text"] = f"{sent_evi} {page_evi}".strip()


In [24]:
train_data[12]

{'id': 76253,
 'verifiable': 'VERIFIABLE',
 'label': 'SUPPORTS',
 'claim': 'There is a movie called The Hunger Games.',
 'evidence': [[['93100', '106004', 'The_Hunger_Games_-LRB-film-RRB-', '0']],
  [['93100', '106005', 'The_Hunger_Games_-LRB-film-RRB-', '1']],
  [['93100', '106006', 'The_Hunger_Games_-LRB-film-RRB-', '2']],
  [['93100', '106007', 'The_Hunger_Games_-LRB-film-RRB-', '16']]],
 'sentence_evidence': 'The Hunger Games is a 2012 American dystopian science fiction adventure film directed by Gary Ross and based on the novel of the same name by Suzanne Collins . It is the first installment in The Hunger Games film series and was produced by Nina Jacobson and Jon Kilik , with a screenplay by Ross , Collins , and Billy Ray . The film stars Jennifer Lawrence , Josh Hutcherson , Liam Hemsworth , Woody Harrelson , Elizabeth Banks , Lenny Kravitz , Stanley Tucci , and Donald Sutherland . The film was released on March 21 , 2012 , in some European countries and in the US on March 23 ,

In [25]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.replace("-LSB-", "[")
    text = text.replace("-RSB-", "]")
    text = text.replace("-LRB-", "(")
    text = text.replace("-RRB-", ")")
    text = text.replace("--", "–")          # replace double dash with en dash
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [26]:
rag_rows = []

for ex in train_data:
    claim = ex.get("claim", "")
    label = ex.get("label", "")
    sentence_evidence = clean_text(ex.get("sentence_evidence", ""))
    page_evidence = clean_text(ex.get("page_evidence", ""))

    # Combined context for RAG retrieval
    combined_evidence = f"{sentence_evidence} {page_evidence}".strip()

    rag_rows.append({
        "id": ex.get("id"),
        "claim": claim,
        "label": label,
        "sentence_evidence": sentence_evidence,
        "page_evidence": page_evidence
    })

rag_df = pd.DataFrame(rag_rows)

In [62]:
pd.set_option('display.max_colwidth', 100)
rag_df.sample(5)

Unnamed: 0,id,claim,label,sentence_evidence,page_evidence
90525,161307,The Kingdom of Georgia only lasted thirty years.,REFUTES,It reached its Golden Age of political and economic strength during the reign of King David IV a...,"The Kingdom of Georgia , also known as the Georgian Empire , was a medieval monarchy which emerg..."
88786,106588,Marshall McLuhan taught English at University of Toronto as a Professor of English.,NOT ENOUGH INFO,,"None may refer to : Zero , the mathematical concept of the quantity `` none '' The empty set , t..."
83303,87157,Dead Man Down features a Swedish actress born in 1979 in a lead role.,SUPPORTS,"The film stars Colin Farrell , Noomi Rapace , Dominic Cooper , and Terrence Howard , and was rel...",Dead Man Down is an 2013 American neo-noir crime thriller film written by J.H. Wyman and directe...
99252,9529,Robert Duvall received the National Medal of Arts in June of 2005.,SUPPORTS,He received the National Medal of Arts in 2005 .,"Robert Selden Duvall ( [ duːˈvɔːl ] born January 5 , 1931 ) is an American actor and filmmaker ...."
27811,156594,Source Code had Jake Gyllenhaal in it.,SUPPORTS,"Gyllenhaal received further recognition for roles in Zodiac ( 2007 ) , Brothers ( 2009 ) , Princ...","Jacob Benjamin Gyllenhaal ( [ ˈdʒɪlənhɑːl ] ; born December 19 , 1980 ) is an American actor . A..."


In [28]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                          chunk_overlap=100,
                                          length_function=len)

docs=[]
for i,row in tqdm(rag_df.iterrows(),desc="Loading all rows"):
    for chunk in splitter.split_text(row["page_evidence"]):
        docs.append({
            "id":row["id"],
            "claim":row["claim"],
            "label":row["label"],
            "chunk":chunk
        })


Loading all rows: 145449it [02:26, 995.11it/s] 


In [29]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from tqdm import tqdm

lc_docs = [
    Document(page_content=d["chunk"],metadata={"id":d["id"],"claim":d["claim"],"label":d["label"]})
    for d in tqdm(docs,desc="Preparing Langchain Documents")
]

embedding_fn=HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-V1")

vector_store=FAISS.from_documents(lc_docs,embedding_fn)


Preparing Langchain Documents: 100%|██████████| 651929/651929 [00:29<00:00, 21944.81it/s]


In [33]:
vector_store.save_local("./fever_faiss_store")
print("✅ FAISS vectorstore created and saved successfully!")


✅ FAISS vectorstore created and saved successfully!


In [34]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
vectorstore = FAISS.load_local(
    "./fever_faiss_store",
    embedding_fn,
    allow_dangerous_deserialization=True
)

In [1]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

CUDA available: True
GPU Name: NVIDIA GeForce RTX 4050 Laptop GPU


In [44]:
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
load_dotenv()
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",  # or gemini-1.5-pro for better reasoning
    temperature=0.3,
    max_output_tokens=512
)


In [63]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})

# Output parser (returns a string)
parser = StrOutputParser()

# Define prompt
prompt = PromptTemplate(
    template="""
You are a fact-checking assistant.
Use the provided evidence to verify the claim and respond with one of:
- SUPPORTS
- REFUTES
- NOT ENOUGH INFO

Claim: {question}

Evidence:
{context}

Your answer and brief reasoning:
""",
    input_variables=["question", "context"],
)

# ✅ retrieve_context should only use retriever.invoke, no retriever pipe
def retrieve_context(question: str) -> str:
    docs = retriever.invoke(question)
    return "\n\n".join(d.page_content for d in docs)

# Parallel branch: question passthrough + retrieved context
parallel_chain = RunnableParallel({
    "question": RunnablePassthrough(),
    "context": RunnableLambda(retrieve_context)
})

# Main RAG chain
main_chain = parallel_chain | prompt | llm | parser

# Query
query = "Source Code had Jake Gyllenhaal in it.	?"

# Invoke
result = main_chain.invoke(query)

print("🧠 Gemini Answer:\n", result)


🧠 Gemini Answer:
 SUPPORTS

The evidence repeatedly states that "Source Code is a 2011 American-French science fiction thriller film directed by Duncan Jones... It stars Jake Gyllenhaal as a U.S. Army captain..."
