In [None]:
%%bash
# Download the HotpotQA dev (distractor) JSON into the cell's current working directory.
# NOTE(review): the loading cell reads it from /kaggle/working — confirm that is the cwd here.
wget http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json

In [None]:
import pandas as pd

# Location of the downloaded HotpotQA JSON file.
json_path = '/kaggle/working/hotpot_dev_distractor_v1.json'

# Parse the file into a DataFrame and display its (rows, columns) shape.
df = pd.read_json(path_or_buf=json_path)
df.shape

In [None]:
# Shuffle all rows reproducibly (seed 42), renumber the index from 0,
# and keep only the first 1300 rows of the shuffled frame.
df_shuffled = (
    df.sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .head(1300)
)
df_shuffled.shape

In [None]:
# Inspect the raw 'context' and 'supporting_facts' of the first sampled row.
first = df_shuffled.iloc[0]
print(first['context'])
print(first['supporting_facts'])

# Context Is What We Need

In [None]:
def extract_contexts(supporting_facts, context_list):
    """
    Collect the context sentences that the supporting facts point at.

    supporting_facts: list of [entity_name, sent_idx] pairs, e.g. [['VIVA Media', 0], ...]
    context_list: list of [entity_name, [sent1, sent2, ...]] pairs, e.g. [['VIVA Media', [...]], ...]
    Returns: list of the referenced sentences, in supporting-fact order.
             Facts naming an unknown entity or an out-of-range sentence index are skipped.
    """
    # Map each entity name to its sentence list (a repeated name keeps the last entry).
    sentences_by_entity = {name: sentences for name, sentences in context_list}

    matched = []
    for name, position in supporting_facts:
        if name not in sentences_by_entity:
            continue
        sentences = sentences_by_entity[name]
        if not (0 <= position < len(sentences)):
            continue
        matched.append(sentences[position])
    return matched

# Row by row, gather the supporting sentences into a list stored under 'evidence'.
df_shuffled['evidence'] = df_shuffled.apply(
    lambda record: extract_contexts(record['supporting_facts'], record['context']),
    axis=1,
)

# Flatten each sentence list into one space-separated string ('' when the value is None).
df_shuffled['evidence'] = df_shuffled['evidence'].apply(
    lambda sentences: '' if sentences is None else ' '.join(sentences)
)

df_shuffled.shape

In [None]:
from typing import Optional, List, Tuple, Dict, Union
from pydantic import BaseModel, Field

class Entity(BaseModel):
    """One dataset record (a row of ``df_shuffled``) validated into a typed model.

    Input keys ``_id`` and ``type`` are accepted through aliases and exposed
    as the attributes ``id`` and ``question_type``.
    """

    # pydantic treats annotated names with a leading underscore as private
    # attributes rather than fields, so a plain `_id: str` is never populated
    # from the input. Expose the value as `id` and read it from the '_id' key.
    id: str = Field(..., alias='_id')
    answer: str
    question: str
    # (entity_name, sentence_index) pairs.
    supporting_facts: List[Tuple[str,int]]
    # (entity_name, [sentence, ...]) pairs.
    context: List[Tuple[str,List[str]]]
    question_type: str = Field(..., alias='type')  # Renaming 'type' to 'question_type'
    level: str
    # Supporting sentences joined into a single string.
    evidence: str

class ReasoningPath(BaseModel):
    """One inference result for a single question at a single temperature."""

    # Question text and the evidence string given to the model.
    question: str
    context: str
    # Raw model output, and the answer extracted from it (may be absent).
    reason: str
    answer: Optional[str] = None
    # Reference answer from the dataset.
    ground_truth: str
    # Sampling temperature and the token count reported for the call.
    temperature: float
    tokens: int
    score: float

# Validate every DataFrame row (as a dict) into an Entity and report how many were built.
entities = list(map(Entity.model_validate, df_shuffled.to_dict(orient='records')))
print(len(entities))

In [None]:
import re

from sketch_of_thought import SoT
from doraemon import Doraemon
from relaxed_fda import RelaxedFDA

# Notebook-wide logger; records are written to hotpot_qa_dataset_builder.log.
logger = Doraemon.get_logger(
    name=__name__,
    logfile="hotpot_qa_dataset_builder.log",
)


def process_entity(args) -> Optional[ReasoningPath]:
    """Run one inference for a single (entity, paradigm, temperature) task.

    Args:
        args: 3-tuple ``(et, paradigm, temperature)``. The values are packed
            into one argument so the function can be passed to ``map``-style
            executors that supply a single item per call.

    Returns:
        A ``ReasoningPath`` holding the question, evidence, raw model output,
        extracted answer, ground truth, temperature and token count, with
        ``score`` initialised to 0.0; or ``None`` if any step raised, in which
        case the error is logged.
    """
    et, paradigm, temperature = args

    try:
        # Build the few-shot prompt from the question and its evidence string.
        prompt = SoT.few_short_support(
            task_name='HOTPOTQA',
            prompt=SoT.get_initialized_prompt(paradigm=paradigm),
            few_short=SoT.get_few_short(task_name='HOTPOTQA'),
            question=f"Question:{et.question}\nContext:{et.evidence}"
        )
        r_s, tokens = Doraemon.inference(logger=logger, messages=prompt, temperature=temperature)
        return ReasoningPath(
            question=str(et.question),
            context=et.evidence,
            reason=str(r_s),
            answer=RelaxedFDA.get_answer(r_s),
            ground_truth=str(et.answer),
            temperature=float(temperature),
            tokens=int(tokens),
            score=0.0,
        )
    except Exception as e:
        # Best effort: a failed task is logged and skipped so the batch keeps going.
        logger.error(f"Error processing question {et.question} at temperature {temperature} with exception {e}")
        return None


# Classify the first question only; the resulting paradigm is logged and
# checked against the expected "conceptual_chaining" label.
paradigm = SoT.classify_question(entities[0].question)
logger.info(paradigm)
assert str(paradigm) == "conceptual_chaining"

In [None]:
# Nine temperatures from 0.0 to 2.0 in steps of 0.25.
temperatures = [step * 0.25 for step in range(9)]

# One (entity, paradigm, temperature) task per entity/temperature pair,
# ordered entity-major so each entity's temperatures are adjacent.
tasks = [(entity, paradigm, temp) for entity in entities for temp in temperatures]
logger.info(len(tasks))

In [None]:
from tqdm import tqdm
import concurrent.futures

# Run every task through a single-worker process pool, in task order,
# with a progress bar over the whole task list.
with concurrent.futures.ProcessPoolExecutor(max_workers=1) as pool:
    outcome_iter = pool.map(process_entity, tasks)
    results = list(tqdm(outcome_iter, total=len(tasks)))

In [10]:
# Keep only the tasks that produced a result and convert each to a plain dict.
final_list: List[Dict] = []
for rp in results:
    if rp is not None:
        final_list.append(rp.model_dump())

# Tabulate the results and persist them as a pickle file.
final_df = pd.DataFrame(final_list)
final_df.to_pickle('sots_df.pkl')

In [None]:
import matplotlib.pyplot as plt

# Histogram of the 'tokens' column across all collected results.
fig, ax = plt.subplots()
final_df['tokens'].hist(ax=ax)
ax.set_title("Tokens per Example")
ax.set_xlabel("tokens")
ax.set_ylabel("count")
plt.show()

In [12]:
# Remove the downloaded dataset file from the working directory once the run is finished.
!rm -rf /kaggle/working/hotpot_dev_distractor_v1.json