In [16]:
import pandas as pd
from scipy import sparse
import numpy as np
import shutil

# ROSIE-MIND

In [17]:
path_v1 = "../../data/mind_runs/rosie/v1/annotated/rosie_mind_v1_annotated.xlsx"
path_v2 = "../../data/mind_runs/rosie/v2/results/annotated/rosie_mind_v3_annotated.xlsx"
path_all_rosie = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/26_jan/df_1.parquet"


In [18]:
df_rosie = pd.read_parquet(path_all_rosie)

## v1

In [19]:
df_v1 = pd.read_excel(path_v1)

# add anchor_passage_id by merging with df_rosie (text) with df_v1 (anchor_passage)
df_v1 = df_v1.merge(df_rosie[['text', 'doc_id']], left_on='anchor_passage', right_on='text', how='left')

# drop "text" column
df_v1 = df_v1.drop(columns=['text'])

# rename columns
df_v1 = df_v1.rename(
    columns={
        'doc_id': 'anchor_passage_id',
        "a_s": "anchor_answer",
        "a_t": "comparison_answer",
        "comp_passage_id": "comparison_passage_id"
    }
)

# order columns
df_v1 = df_v1[['question_id', 'question', 'anchor_passage', 'anchor_passage_id', 'anchor_answer', 'comparison_passage', 'comparison_passage_id', 'comparison_answer', 'label', 'final_label', 'reason']]

# replace all apprearances of "c_c" in reason column with "comparison_passage", "c_a" with "anchor_passage", "a_a" with "anchor_answer" and "a_c" with "comparison_answer"
df_v1['reason'] = df_v1['reason'].str.replace('c_c', 'comparison_passage')
df_v1['reason'] = df_v1['reason'].str.replace('c_a', 'anchor_passage')
df_v1['reason'] = df_v1['reason'].str.replace('a_a', 'anchor_answer')
df_v1['reason'] = df_v1['reason'].str.replace('a_c', 'comparison_answer')

# add Notes and secondary_label columns with empty strings
df_v1['Notes'] = ""
df_v1['secondary_label'] = ""

df_v1.to_parquet("../../../upload_hf_mind/rosie_mind/v1.parquet", index=False)

df_v1.head(2)

Unnamed: 0,question_id,question,anchor_passage,anchor_passage_id,anchor_answer,comparison_passage,comparison_passage_id,comparison_answer,label,final_label,reason,Notes,secondary_label
0,TPC_11_1697,Is there a single standard for how much weigh...,Pregnancy weight-gain guidelines: There's no o...,EN_1175294_224986-2,"NO, there is no one-size-fits-all approach to ...",¿Cuánto aumento de peso es adecuado: La mayorí...,ES_905053_120100-1,"YES, there is a standard range for weight gain...",CONTRADICTION,CONTRADICTION,The answers present conflicting information ab...,,
1,TPC_11_1086,Is a baby born before 37 weeks considered pret...,A live-birth delivery was defined as a birth o...,EN_259065_52578-24,"YES, preterm birth is defined as a gestational...",A término precoz se refiere a un bebé que nace...,ES_797021_103544-11,"NO, early term refers to a baby born from week...",CONTRADICTION,CONTRADICTION,The two answers provide conflicting informatio...,,


## v2

In [20]:
df_v2 = pd.read_excel(path_v2)

# remove "topic". "Column 1"
df_v2 = df_v2.drop(columns=['topic', 'Column 1'])

# keep rows where contextualization_failed != "Yes"
df_v2 = df_v2[df_v2['contextualization_failed'] != 'yes']

# rename columns
df_v2 = df_v2.rename(
    columns={
        "a_s": "anchor_answer",
        "a_c": "comparison_answer",
        "comp_passage_id": "comparison_passage_id"
    }
)

# order columns
df_v2 = df_v2[['question_id', 'question', 'anchor_passage', 'anchor_passage_id', 'anchor_answer', 'comparison_passage', 'comparison_passage_id', 'comparison_answer', 'label', 'final_label', 'reason', 'Notes', 'secondary_label']]

# replace nans in "secondary_label" and Notes with empty string
df_v2['secondary_label'] = df_v2['secondary_label'].fillna('')
df_v2['Notes'] = df_v2['Notes'].fillna('')

df_v2.to_parquet("../../../upload_hf_mind/rosie_mind/v2.parquet", index=False)

df_v2.head(2)

Unnamed: 0,question_id,question,anchor_passage,anchor_passage_id,anchor_answer,comparison_passage,comparison_passage_id,comparison_answer,label,final_label,reason,Notes,secondary_label
0,TPC_11_487,Is it recommended for men to father a child wh...,Do not become pregnant while taking this medic...,EN_693335_108983-21,"NO, it is not recommended for men to father a ...",Las mujeres no deben estar embarazadas ni lact...,ES_972028_125922-11,I cannot answer the question given the context,CULTURAL_DISCREPANCY,NOT_ENOUGH_INFO,Comparison chunk is not about Decitabine but t...,"Orginal a_c: NO, it is not recommended for men...",
1,TPC_11_592,Do membranes typically rupture after labor or ...,What is premature rupture of membranes: Premat...,EN_764034_114750-0,"YES, membranes typically rupture after labor o...","Algunas veces, las membranas se rompen antes d...",ES_904272_120031-3,I cannot answer the question given the context,CONTRADICTION,NOT_ENOUGH_INFO,c_a states the typical pattern; c_c describes ...,"Original a_c: No, membranes do not typically r...",


In [21]:
len(df_v2)

652

## Metadata file

In [None]:
import json

metadata = {
  "name": "rosie_mind",
  "version": "1.0.1",
  "license": "mit",
  "language": ["en", "es"],
  "pretty_name": "ROSIE-MIND",
  "description": "Collection of English-Spanish QA pairs labeled for discrepancies (Contradiction, Cultural Discrepancy, No Discrepancy or Not Enough Information) between two answers based on two different passages (anchor and comparison) in different languages given a question posed in and generated from the anchor passage. This dataset contains two related Parquet files (v1 and v2) generated using slightly different version of the MIND pipeline on the same source data.",
  "size_categories": ["n<1K"],
  "splits": {
    "v1": {
      "filename": "v1.parquet",
      "description": "Initial MIND run using TB-ENN-W-D with quora-distilbert-multilingual embeddings and qwen:32b LLM. Generated from 4000 anchor passages across topics t12 (Pregnancy), t16 (Infant Care), and t25 (Pediatric Healthcare), detecting various discrepancy types (C, CD, ND) with ~42K NEI instances per topic. Contains 80 manually refined triplets."
    },
    "v2": {
      "filename": "v2.parquet",
      "description": "Enhanced version using improved MIND pipeline with BAAI/bge-m3 embeddings and llama3.3:70b LLM. Applied to 500 anchor passages across topics t12 (Pregnancy), t16 (Infant Care), and t25 (Pediatric Healthcare). Includes 652 manually refined triplets with additional secondary labels and notes for annotation nuances not captured by main labels."
    }
  },
  "features": df_v2.dtypes.apply(lambda x: x.name).to_dict(),
  "creator": "MIND Research Team",
  "source": {
    "citation": "Heran Y Mane, Amara Channell Doig, Francia Ximena Marin Gutierrez, Michelle Jasczynski, Xiaohe Yue, Neha Pundlik Srikanth, Sourabh Mane, Abby Sun, Rachel Ann Moats, Pragat Patel, and others. 2023. Practical guidance for the development of ROSIE, a health education question-and-answer chatbot for new mothers. Journal of Public Health Management and Practice, 29(5):663-670.",
    "link": "https://doi.org/10.1097/PHH.0000000000001851"
  },
  "derived_from": "lcalvobartolome/rosie_mind_topics",
  "dataset_type": "question_answering_discrepancy_detection",
  "tags": ["parquet", "text", "multilingual", "discrepancy-detection", "question-answering", "maternal-health", "infant-health"],
  "citation": "@inproceedings{calvo-bartolome2025discrepancy,\n  title={{D}iscrepancy {D}etection at the {D}ata {L}evel: {T}oward {C}onsistent {M}ultilingual {Q}uestion {A}nswering},\n  author={Lorena Calvo-Bartolom{\\'e} and Val{\\'e}rie Aldana and Karla Cantarero and Alonso Madro{\\~n}al de Mesa and Jer{\\'o}nimo Arenas-Garc{\\'\\i}a and Jordan Lee Boyd-Graber},\n  booktitle={Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},\n  month={nov},\n  year={2025},\n  address={Suzhou, China},\n  publisher={Association for Computational Linguistics}\n}\n\nIf you use this dataset, please also cite the original source:\nMane, H. Y., Channell Doig, A., Marin Gutierrez, F. X., Jasczynski, M., Yue, X., Pundlik Srikanth, N., Mane, S., Sun, A., Moats, R. A., Patel, P., et al. (2023). Practical guidance for the development of ROSIE, a health education question-and-answer chatbot for new mothers. *Journal of Public Health Management and Practice*, 29(5): 663-670."

}

# Save metadata to JSON file
with open("../../../upload_hf_mind/rosie_mind/metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)

print("Metadata saved successfully!")
print(json.dumps(metadata, indent=2))

Metadata saved successfully!
{
  "name": "rosie_mind",
  "version": "1.0.1",
  "license": "mit",
  "language": [
    "en",
    "es"
  ],
  "pretty_name": "ROSIE-MIND",
  "description": "Collection of English-Spanish QA pairs labeled for discrepancies (Contradiction, Cultural Discrepancy, No Discrepancy or Not Enough Information) between two answers based on two different passages (anchor and comparison) in different languages given a question posed in and generated from the anchor passage. This dataset contains two related Parquet files (v1 and v2) generated using slightly different version of the MIND pipeline on the same source data.",
  "size_categories": [
    "n<1K"
  ],
  "splits": {
    "v1": {
      "filename": "v1.parquet",
      "description": "Initial MIND run using TB-ENN-W-D with quora-distilbert-multilingual embeddings and qwen:32b LLM. Generated from 4000 anchor passages across topics t12 (Pregnancy), t16 (Infant Care), and t25 (Pediatric Healthcare), detecting various d

## ROSIE

In [23]:
path_thetas_en = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/28_jan/poly_rosie_1_30/mallet_output/thetas_EN.npz"
path_thetas_es = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/28_jan/poly_rosie_1_30/mallet_output/thetas_ES.npz"

In [24]:
def get_doc_top_tpcs(doc_distr, topn=10):
        sorted_tpc_indices = np.argsort(doc_distr)[::-1]
        top = sorted_tpc_indices[:topn].tolist()
        return [(k, float(doc_distr[k])) for k in top if doc_distr[k] > 0]

In [None]:
# en
thetas_en = sparse.load_npz(path_thetas_en).toarray()
df_en = df_rosie[df_rosie.lang == "EN"].copy()
df_en["thetas"] = list(thetas_en)
df_en["top_k"] = df_en["thetas"].apply(lambda x: get_doc_top_tpcs(x, topn=10))
df_en["main_topic_thetas"] = df_en["thetas"].apply(lambda x: int(np.argmax(x)))

# es
thetas_es = sparse.load_npz(path_thetas_es).toarray()
df_es = df_rosie[df_rosie.lang == "ES"].copy()
df_es["thetas"] = list(thetas_es)
df_es["top_k"] = df_es["thetas"].apply(lambda x: get_doc_top_tpcs(x, topn=10))
df_es["main_topic_thetas"] = df_es["thetas"].apply(lambda x: int(np.argmax(x)))

# concat back
df_rosie_ = pd.concat([df_en, df_es], ignore_index=True)

df_rosie_.to_parquet("../../../upload_hf_mind/rosie_mind_topics/rosie_preprpc_30tpc_model.parquet", index=False)

df_rosie_.head(2)

Unnamed: 0,id_top,doc_id,id_preproc,document_id,text,len,full_doc,human_labeled,lemmas,lemmas_tr,...,final_label,common_id,thetas,main_topic,doc_score,predicted_probability,label,method,top_k,main_topic_thetas
0,0,EN_607595_99219-9,0,99219,In order to avoid future allergic reactions – ...,24,"Published on For 12-year-old Rani, living with...",True,order avoid future allergic_reaction range vom...,evitar futuro reacción_alérgico vómito urticar...,...,"[1, 1, 0, 0]",1,"[0.10000000037252903, 0.0, 0.0, 0.0, 0.0, 0.33...",11,0.043793,0.795818,1,predicted,"[(5, 0.3399999997764826), (9, 0.30000000856816...",5
1,3,EN_1361170_297168-6,3,297168,"Removing the catheter:\n- In the morning, remo...",41,Nighttime bladder emptying keeps bladder press...,False,remove morning remove water balloon place syri...,quitar mañana quitar sacar agua globo coloquir...,...,"[1, 0, 0, 0]",4,"[0.012048192142722128, 0.0, 0.0, 0.0, 0.0, 0.0...",11,0.017128,0.728664,1,predicted,"[(8, 0.4939759002033487), (22, 0.2650602420410...",8


In [26]:
df_rosie_.columns

Index(['id_top', 'doc_id', 'id_preproc', 'document_id', 'text', 'len',
       'full_doc', 'human_labeled', 'lemmas', 'lemmas_tr', 'text_tr', 'lang',
       'final_label', 'common_id', 'thetas', 'main_topic', 'doc_score',
       'predicted_probability', 'label', 'method', 'top_k',
       'main_topic_thetas'],
      dtype='object')

In [27]:
len(df_rosie_)

875230

In [None]:
# save also tpc keys and tpc labels
source_dir = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/28_jan/poly_rosie_1_30/mallet_output/"
dest_dir = "../../../upload_hf_mind/rosie_mind_topics/"

shutil.copy2(source_dir + "keys_EN.txt", dest_dir + "keys_EN.txt")
shutil.copy2(source_dir + "keys_ES.txt", dest_dir + "keys_ES.txt") 
shutil.copy2(source_dir + "tpc_labels.txt", dest_dir + "tpc_labels.txt")

print("Files copied successfully!")

Files copied successfully!


In [29]:
metadata_rosie_topics = {
  "name": "rosie_mind_topics",
  "version": "1.0.0",
  "license": "mit",
  "language": ["en", "es"],
  "pretty_name": "ROSIE-MIND-Topics",
  "description": "Bilingual (English-Spanish) dataset with topic modeling information used as the input to the MIND pipeline. Each record contains a document, its language, lemmas (both in the corresponding language and its translated version), and topic features derived from training a PLTM model with 30 topics.",
  "size_categories": ["100K<n<1M"],
  "instances": 25148,
  "features": df_rosie_.dtypes.apply(lambda x: x.name).to_dict(),
  "extra_files": {
    "keys_EN.txt": "Topic words in English for each topic",
    "keys_ES.txt": "Topic words in Spanish for each topic",
    "tpc_labels.txt": "Descriptive labels summarizing each topic"
  },
  "creator": "MIND Research Team",
  "source": {
    "citation": "Heran Y Mane et al. (2023). Practical guidance for the development of ROSIE, a health education question-and-answer chatbot for new mothers. Journal of Public Health Management and Practice, 29(5):663-670.",
    "link": "https://doi.org/10.1097/PHH.0000000000001851"
  },
  "dataset_type": "text_with_topics",
  "tags": ["parquet", "text", "topic-modeling", "health", "multilingual"],
  "citation": "@inproceedings{calvo-bartolome2025discrepancy,\n  title={{D}iscrepancy {D}etection at the {D}ata {L}evel: {T}oward {C}onsistent {M}ultilingual {Q}uestion {A}nswering},\n  author={Lorena Calvo-Bartolom{\\'e} et al.},\n  booktitle={Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},\n  month={nov},\n  year={2025},\n  address={Suzhou, China},\n  publisher={Association for Computational Linguistics}\n}"
}

# Save metadata to JSON file
with open("../../../upload_hf_mind/rosie/metadata.json", "w") as f:
    json.dump(metadata_rosie_topics, f, indent=2)

print("Metadata saved successfully!")
print(json.dumps(metadata_rosie_topics, indent=2))


Metadata saved successfully!
{
  "name": "rosie_mind_topics",
  "version": "1.0.0",
  "license": "mit",
  "language": [
    "en",
    "es"
  ],
  "pretty_name": "ROSIE-MIND-Topics",
  "description": "Bilingual (English-Spanish) dataset with topic modeling information used as the input to the MIND pipeline. Each record contains a document, its language, lemmas (both in the corresponding language and its translated version), and topic features derived from training a PLTM model with 30 topics.",
  "size_categories": [
    "100K<n<1M"
  ],
  "instances": 25148,
  "features": {
    "id_top": "int64",
    "doc_id": "object",
    "id_preproc": "int64",
    "document_id": "int64",
    "text": "object",
    "len": "int64",
    "full_doc": "object",
    "human_labeled": "bool",
    "lemmas": "object",
    "lemmas_tr": "object",
    "text_tr": "object",
    "lang": "object",
    "final_label": "object",
    "common_id": "int64",
    "thetas": "object",
    "main_topic": "int64",
    "doc_score":

## ENDE

In [30]:
source_path = "/export/usuarios_ml4ds/lbartolome/Repos/alonso_mind/Data/polylingual_df.parquet"
SRC_THETAS_PATH="/export/usuarios_ml4ds/lbartolome/Repos/umd/mind/data/models/wiki/ende/poly_en_de_05_09_25/mallet_output/thetas_EN.npz"
TGT_THETAS_PATH="/export/usuarios_ml4ds/lbartolome/Repos/umd/mind/data/models/wiki/ende/poly_en_de_05_09_25/mallet_output/thetas_DE.npz"

In [31]:
df_ende  = pd.read_parquet(source_path)
thetas_en = sparse.load_npz(SRC_THETAS_PATH).toarray()
thetas_de = sparse.load_npz(TGT_THETAS_PATH).toarray() 

df_ende_en = df_ende[df_ende.lang == "EN"].copy()
df_ende_en["thetas"] = list(thetas_en)
df_ende_en["top_k"] = df_ende_en["thetas"].apply(lambda x: get_doc_top_tpcs(x, topn=10))
df_ende_en["main_topic_thetas"] = df_ende_en["thetas"].apply(lambda x: int(np.argmax(x))) 

df_ende_de = df_ende[df_ende.lang == "DE"].copy()
df_ende_de["thetas"] = list(thetas_de)
df_ende_de["top_k"] = df_ende_de["thetas"].apply(lambda x: get_doc_top_tpcs(x, topn=10))
df_ende_de["main_topic_thetas"] = df_ende_de["thetas"].apply(lambda x: int(np.argmax(x)))

df_ende_ = pd.concat([df_ende_en, df_ende_de], ignore_index=True)

# remove summary and equivalence columns
df_ende_ = df_ende_.drop(columns=['summary', 'equivalence'])

df_ende_.to_parquet("../../../upload_hf_mind/ende/ende_preprpc_25tpc_model.parquet", index=False)

df_ende_.head(2)

Unnamed: 0,chunk_id,text,doc_id,full_doc,lang,title,url,id,index,lemmas,lemmas_tr,id_preproc,thetas,top_k,main_topic_thetas
0,EN_0_0,"George Washington (February 22, 1732 [O.S. Feb...",0.0,"George Washington (February 22, 1732 [O.S. Feb...",EN,George Washington,https://en.wikipedia.org/wiki/George_Washington,0.0,,george_washington february february december f...,george_washington februar februar_ebenda präsi...,EN_0_0,"[0.0, 0.019607843319867167, 0.0, 0.0, 0.0, 0.0...","[(10, 0.21568627093060352), (7, 0.196078429473...",10
1,EN_0_1,"Born in the Colony of Virginia, Washington bec...",1.0,"George Washington (February 22, 1732 [O.S. Feb...",EN,George Washington,https://en.wikipedia.org/wiki/George_Washington,1.0,,bear colony virginia washington commander virg...,gebor kolonie virginia washington französisch_...,EN_0_1,"[0.0, 0.015384615277155088, 0.0, 0.0, 0.0, 0.0...","[(17, 0.48717948408701867), (16, 0.12307692221...",17


In [32]:
df_ende_.columns

Index(['chunk_id', 'text', 'doc_id', 'full_doc', 'lang', 'title', 'url', 'id',
       'index', 'lemmas', 'lemmas_tr', 'id_preproc', 'thetas', 'top_k',
       'main_topic_thetas'],
      dtype='object')

In [33]:
len(df_ende_)

25148

In [34]:
df_ende_.dtypes

chunk_id              object
text                  object
doc_id                object
full_doc              object
lang                  object
title                 object
url                   object
id                   float64
index                float64
lemmas                object
lemmas_tr             object
id_preproc            object
thetas                object
top_k                 object
main_topic_thetas      int64
dtype: object

In [None]:
# save also tpc keys and tpc labels
source_dir = "/export/usuarios_ml4ds/lbartolome/Repos/umd/mind/data/models/wiki/ende/poly_en_de_05_09_25/mallet_output/"
dest_dir = "../../../upload_hf_mind/ende_mind_topics/"

shutil.copy2(source_dir + "keys_EN.txt", dest_dir + "keys_EN.txt")
shutil.copy2(source_dir + "keys_DE.txt", dest_dir + "keys_DE.txt") 
shutil.copy2(source_dir + "tpc_labels.txt", dest_dir + "tpc_labels.txt")

print("Files copied successfully!")

Files copied successfully!


In [None]:
metadata_ende_topics = {
  "name": "ende_mind_topics",
  "version": "1.0.0",
  "license": "cc-by-sa-4.0",
  "language": ["en", "de"],
  "pretty_name": "ENDE-MIND-Topics",
  "description": "Bilingual (English-German) document corpus scraped from Wikipedia and enriched with topic-modeling features. Each record contains a document, its language, lemmas (both in the corresponding language and its translated version), and topic features derived from training a PLTM model with 25 topics.",
  "size_categories": ["10K<n<100K"],
  "instances": 25148,
  "features": df_ende_.dtypes.apply(lambda x: x.name).to_dict(),
  "extra_files": {
    "keys_EN.txt": "Topic keywords in English per topic (one line per topic).",
    "keys_DE.txt": "Topic keywords in German per topic (one line per topic).",
    "tpc_labels.txt": "Human-readable labels for each topic index (one line per topic)."
  },
  "creator": "MIND Research Team",
  "source": {
    "provider": "Wikipedia",
    "type": "web",
    "note": "Pages scraped for research under MIT license.",
    "scrape_command": "python3 -m wikipedia.generate_dtset --output-path test",
    "repository": "https://github.com/lcalvobartolome/mind"
  },
  "dataset_type": "text_with_topics",
  "tags": [
    "parquet",
    "text",
    "topic-modeling",
    "multilingual",
    "english",
    "german",
    "wikipedia",
    "mind"
  ],
  "citation": "@inproceedings{calvo-bartolome2025discrepancy,\n  title={{D}iscrepancy {D}etection at the {D}ata {L}evel: {T}oward {C}onsistent {M}ultilingual {Q}uestion {A}nswering},\n  author={Lorena Calvo-Bartolom{\\'e} and Val{\\'e}rie Aldana and Karla Cantarero and Alonso Madro{\\~n}al de Mesa and Jer{\\'o}nimo Arenas-Garc{\\'\\i}a and Jordan Lee Boyd-Graber},\n  booktitle={Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},\n  month={nov},\n  year={2025},\n  address={Suzhou, China},\n  publisher={Association for Computational Linguistics}\n}\n\nPlease also attribute Wikipedia as the source (CC BY-SA 4.0)."
}

# Save metadata to JSON file
with open("../../../upload_hf_mind/ende_mind_topics/metadata.json", "w") as f:
    json.dump(metadata_ende_topics, f, indent=2)

print("Metadata saved successfully!")
print(json.dumps(metadata_ende_topics, indent=2))

Metadata saved successfully!
{
  "name": "ende_mind_topics",
  "version": "1.0.0",
  "license": "cc-by-sa-4.0",
  "language": [
    "en",
    "de"
  ],
  "pretty_name": "ENDE-MIND-Topics",
  "description": "Bilingual (English-German) document corpus scraped from Wikipedia and enriched with topic-modeling features. Each record contains a document, its language, lemmas (both in the corresponding language and its translated version), and topic features derived from training a PLTM model with 25 topics.",
  "size_categories": [
    "10K<n<100K"
  ],
  "instances": 25148,
  "features": {
    "chunk_id": "object",
    "text": "object",
    "doc_id": "object",
    "full_doc": "object",
    "lang": "object",
    "title": "object",
    "url": "object",
    "id": "float64",
    "index": "float64",
    "lemmas": "object",
    "lemmas_tr": "object",
    "id_preproc": "object",
    "thetas": "object",
    "top_k": "object",
    "main_topic_thetas": "int64"
  },
  "extra_files": {
    "keys_EN.txt":

## FEVER-DPLACE-Q

In [37]:
feverdplaceq = pd.read_csv("../../../upload_hf_mind/fever_dplace_q/FEVER-DPLACE-Q_v3_discp.csv")

feverdplaceq

Unnamed: 0,claim,evidence,label,question,answer1,answer2,annotation,discp_qwen:32b,reason_qwen:32b,discp_llama3.3:70b,reason_llama3.3:70b,discp_gpt-4o-2024-08-06,reason_gpt-4o-2024-08-06
0,Floor level of the prevailing type of dwelling.,"['Subterranean or semi-subterranean, ignoring ...",CULTURAL_DISCREPANCY,Is the primary living space typically located ...,"No, it is often subterranean or semi-subterran...","Yes, it is typically at ground level.",1,CULTURAL_DISCREPANCY,The answers reflect differences in cultural pr...,CULTURAL_DISCREPANCY,The answers reflect differences that can stem ...,CULTURAL_DISCREPANCY,The answers reflect differences in cultural pr...
1,Age or occupational specialization in the gath...,"['Junior age specialization, i.e., the activit...",CULTURAL_DISCREPANCY,Is the gathering of wild plants and small land...,"No, it is primarily carried out by boys and gi...","Yes, it is mainly done by older adults beyond ...",1,CULTURAL_DISCREPANCY,The answers reflect differences that stem from...,CULTURAL_DISCREPANCY,The answers provide directly opposing informat...,CULTURAL_DISCREPANCY,The answers reflect differences in cultural pr...
2,Following the inheritance rule for movable pro...,['Equal or relatively equal distribution among...,CULTURAL_DISCREPANCY,Is movable property typically inherited equall...,"Yes, it is distributed equally or relatively e...","No, it is predominantly inherited by the membe...",1,CULTURAL_DISCREPANCY,The answers reflect differences in cultural no...,CULTURAL_DISCREPANCY,The answers provide differing perspectives on ...,CULTURAL_DISCREPANCY,The answers reflect differences in cultural pr...
3,Secondary type of caste differentiation. See a...,"['Caste distinctions absent or insignificant',...",CULTURAL_DISCREPANCY,Is there a significant presence of despised oc...,"No, caste distinctions are absent or insignifi...","Yes, there are one or more despised occupation...",1,CONTRADICTION,The two answers directly contradict each other...,CONTRADICTION,The answers provide directly opposing factual ...,CULTURAL_DISCREPANCY,The answers reflect differences in cultural pe...
4,Specialization by sex in the manufacture of tr...,"['Males alone perform the activity, female par...",CULTURAL_DISCREPANCY,Is the manufacture of true cloth on a loom pre...,"Yes, males alone perform the activity, with ne...","No, both sexes participate, but males do appre...",1,CULTURAL_DISCREPANCY,The answers reflect differences in the distrib...,CULTURAL_DISCREPANCY,The answers present different perspectives on ...,CULTURAL_DISCREPANCY,The answers reflect differences in cultural pr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,Wall material of secondary or alternative hous...,"['Missing data', 'Stone, stucco, concrete, or ...",NOT_ENOUGH_INFO,"Is stone, stucco, concrete, or fired brick com...",The data is missing.,"Yes, stone, stucco, concrete, or fired brick a...",1,NOT_ENOUGH_INFO,ANSWER_1 states that there is no available dat...,NOT_ENOUGH_INFO,"ANSWER_1 states that the data is missing, whic...",NOT_ENOUGH_INFO,ANSWER_1 does not provide any information abou...
181,Presence/absence of culturally patterned forms...,"['Missing data', 'Trance behavior is known to ...",NOT_ENOUGH_INFO,Are altered states of consciousness typically ...,There is no available data on this.,"No, trance behavior occurs without a belief in...",1,NOT_ENOUGH_INFO,The first answer states that there is no avail...,NOT_ENOUGH_INFO,The question asks if altered states of conscio...,NOT_ENOUGH_INFO,ANSWER_1 states that there is no available dat...
182,Degree of rigidity in social structures. Consi...,"['Missing data', 'Rigid, characterized as: non...",NOT_ENOUGH_INFO,Is the social structure characterized by flexi...,No data is available to answer this question.,"No, it is rigid with non-egalitarian, ascripti...","old E184 = ""No, it is rigid with non-egalitari...",NOT_ENOUGH_INFO,The first answer states that there is no data ...,NOT_ENOUGH_INFO,The first answer indicates a lack of informati...,NOT_ENOUGH_INFO,ANSWER_1 indicates a lack of data to address t...
183,"Specialization by sex in hunting, including tr...","['Males alone perform the activity, female par...",CULTURAL_DISCREPANCY,Is hunting typically an activity performed exc...,"Yes, males alone perform the activity, with ne...","No, both sexes participate, but males do appre...",1,CULTURAL_DISCREPANCY,The answers reflect variations in hunting prac...,CULTURAL_DISCREPANCY,The answers present different perspectives on ...,CULTURAL_DISCREPANCY,The answers reflect differences in cultural no...


In [38]:
feverdplaceq.columns

Index(['claim', 'evidence', 'label', 'question', 'answer1', 'answer2',
       'annotation', 'discp_qwen:32b', 'reason_qwen:32b', 'discp_llama3.3:70b',
       'reason_llama3.3:70b', 'discp_gpt-4o-2024-08-06',
       'reason_gpt-4o-2024-08-06'],
      dtype='object')

In [39]:
metadata_feverdplaceq = {
  "name": "fever_dplace_q",
  "version": "1.0.0",
  "license": "mit",
  "language": ["en"],
  "pretty_name": "FEVER-DPLACE-Q",
  "description": "A controlled dataset of question-answer triplets combining FEVER (Thorne et al., 2018) and D-PLACE (Kirby et al., 2016) sources. Each sample contains pairs of answers with explicit entailments and discrepancies generated using GPT-4o and manually reviewed. The dataset is designed for research on factual consistency, cultural discrepancy detection, and model evaluation in multilingual reasoning.",
  "size_categories": ["n<1K"],
  "instances": 185,
  "features": feverdplaceq.dtypes.apply(lambda x: x.name).to_dict(),
  "creator": "MIND Research Team",
  "source": {
    "datasets": ["FEVER v1 (Thorne et al., 2018)", "D-PLACE (Kirby et al., 2016)"],
    "note": "FEVER and D-PLACE items were used as templates for generating questions and controlled discrepancy instances with GPT-4o."
  },
  "dataset_type": "question_answering_discrepancy_detection",
  "tags": [
    "text",
    "question-answering",
    "reasoning",
    "entailment",
    "discrepancy-detection",
    "fever",
    "d-place",
    "gpt-4o",
    "llama3.3",
    "qwen"
  ],
  "citation": "@inproceedings{calvo-bartolome2025discrepancy,\n  title={{D}iscrepancy {D}etection at the {D}ata {L}evel: {T}oward {C}onsistent {M}ultilingual {Q}uestion {A}nswering},\n  author={Lorena Calvo-Bartolom{\\'e} and Val{\\'e}rie Aldana and Karla Cantarero and Alonso Madro{\\~n}al de Mesa and Jer{\\'o}nimo Arenas-Garc{\\'\\i}a and Jordan Lee Boyd-Graber},\n  booktitle={Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},\n  month={nov},\n  year={2025},\n  address={Suzhou, China},\n  publisher={Association for Computational Linguistics}\n}\n\nIf you use this dataset, please also cite FEVER (Thorne et al., 2018) and D-PLACE (Kirby et al., 2016)."
}

# Save metadata to JSON file
with open("../../../upload_hf_mind/fever_dplace_q/metadata.json", "w") as f:
    json.dump(metadata_feverdplaceq, f, indent=2)

print("Metadata saved successfully!")
print(json.dumps(metadata_feverdplaceq, indent=2))

Metadata saved successfully!
{
  "name": "fever_dplace_q",
  "version": "1.0.0",
  "license": "mit",
  "language": [
    "en"
  ],
  "pretty_name": "FEVER-DPLACE-Q",
  "description": "A controlled dataset of question-answer triplets combining FEVER (Thorne et al., 2018) and D-PLACE (Kirby et al., 2016) sources. Each sample contains pairs of answers with explicit entailments and discrepancies generated using GPT-4o and manually reviewed. The dataset is designed for research on factual consistency, cultural discrepancy detection, and model evaluation in multilingual reasoning.",
  "size_categories": [
    "n<1K"
  ],
  "instances": 185,
  "features": {
    "claim": "object",
    "evidence": "object",
    "label": "object",
    "question": "object",
    "answer1": "object",
    "answer2": "object",
    "annotation": "object",
    "discp_qwen:32b": "object",
    "reason_qwen:32b": "object",
    "discp_llama3.3:70b": "object",
    "reason_llama3.3:70b": "object",
    "discp_gpt-4o-2024-08-0