In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

In [None]:
import json
import pandas as pd
from tqdm import tqdm

def create_dataframe(dataset):
    paper_ids, questions, yes_nos, answers, evidences, highlighted_evidences = [], [], [], [], [], []
    for item in dataset:
        paper_ids.append(item['paper_id'])
        questions.append(item['question'])
        yes_nos.append(item['yes_no'])
        answers.append(item['answer'])
        evidences.append('\n'.join(item['evidence']))
        highlighted_evidences.append('\n'.join(item['highlighted_evidence']))

    df = pd.DataFrame({
        'Paper ID': paper_ids,
        'Question': questions,
        'Yes/No Question': yes_nos,
        'Answer': answers,
        'Evidence': evidences,
        'Highlighted Evidence': highlighted_evidences
    })
    return df

datasets = []
for mode in ['train', 'dev']:  # Adjust these as per the available JSON files
    with open(f'qasper-{mode}-v0.3.json', 'r') as file:
        data = json.load(file)

    dataset = []
    for paper_id, paper_data in tqdm(data.items()):
        for qa in paper_data['qas']:
            question = qa['question']
            for answer in qa['answers']:
                ans = answer['answer']  # Corrected this line to access the answer dictionary directly
                if not ans['unanswerable']:
                    dataset.append({
                        'paper_id': paper_id,
                        'question': question,
                        'yes_no': ans['yes_no'],
                        'answer': ans['free_form_answer'] if ans['free_form_answer'] else "Yes" if ans['yes_no'] else "No",
                        'evidence': ans['evidence'],
                        'highlighted_evidence': ans['highlighted_evidence']
                    })
    datasets.extend(dataset)

# Remove duplicate questions
question_set = set()
unique_datasets = [item for item in datasets if item['question'] not in question_set and not question_set.add(item['question'])]

df = create_dataframe(unique_datasets)
print(df.head())

# Save to CSV
# df.to_csv('qa_dataset.csv', index=False)


100%|██████████| 888/888 [00:00<00:00, 180487.59it/s]
100%|██████████| 281/281 [00:00<00:00, 78317.46it/s]

     Paper ID                                           Question  \
0  1909.00694                          What is the seed lexicon?   
1  1909.00694                              What are the results?   
2  1909.00694      How are relations used to propagate polarity?   
3  1909.00694                      How big is the Japanese data?   
4  1909.00694  What are labels available in dataset for super...   

  Yes/No Question                                             Answer  \
0            None  a vocabulary of positive and negative predicat...   
1            None  Using all data to train: AL -- BiGRU achieved ...   
2            None  based on the relation between events, the sugg...   
3            None  7000000 pairs of events were extracted from th...   
4            None                                                 No   

                                            Evidence  \
0  The seed lexicon consists of positive and nega...   
1  FLOAT SELECTED: Table 3: Performance of var




In [None]:
df.head()


Unnamed: 0,Paper ID,Question,Yes/No Question,Answer,Evidence,Highlighted Evidence
0,1909.00694,What is the seed lexicon?,,a vocabulary of positive and negative predicat...,The seed lexicon consists of positive and nega...,The seed lexicon consists of positive and nega...
1,1909.00694,What are the results?,,Using all data to train: AL -- BiGRU achieved ...,FLOAT SELECTED: Table 3: Performance of variou...,FLOAT SELECTED: Table 3: Performance of variou...
2,1909.00694,How are relations used to propagate polarity?,,"based on the relation between events, the sugg...","In this paper, we propose a simple and effecti...","As illustrated in Figure FIGREF1, our key idea..."
3,1909.00694,How big is the Japanese data?,,7000000 pairs of events were extracted from th...,"As a raw corpus, we used a Japanese web corpus...","As a raw corpus, we used a Japanese web corpus..."
4,1909.00694,What are labels available in dataset for super...,,No,Affective events BIBREF0 are events that typic...,"In this paper, we work on recognizing the pola..."


In [None]:
df.shape

(5, 281)

In [None]:
# To download and collect the QASPER dataset from Hugging Face, you can use the following Python script:


# Import the google.colab library.
# Use the google.colab.drive.mount() function to mount your Google Drive.
# Save the dataset to your Google Drive instead of saving it to your local computer.

import datasets
import google.colab

# Mount your Google Drive
google.colab.drive.mount('/content/drive')

# Download the QASPER dataset
dataset = datasets.load_dataset("allenai/qasper")

# Save the dataset to your Google Drive
dataset.save_to_disk('/content/drive/My Drive/QASPER_dataset')



Mounted at /content/drive


Downloading builder script:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.14k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.64k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/10.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.87M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/888 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/281 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/416 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/888 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/281 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/416 [00:00<?, ? examples/s]

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
print("""
DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
    }),

    test: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],

    })
})
""")



DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
    }),

    test: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],

    })
})



In [None]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [None]:
import json
import pandas as pd

# Load the QASPER train JSON file
with open("/content/drive/MyDrive/qasper-dev-v0.3.json", "r") as f:
    data = json.load(f)

# Create a Pandas DataFrame from the JSON data
df = pd.DataFrame(data)

# Check the column names of the DataFrame
print(df.columns)
df

Index(['1912.01214', '1810.08699', '1609.00425', '1801.05147', '1811.00383',
       '1909.09067', '1704.06194', '1909.00512', '2003.03106', '1708.01464',
       ...
       '1808.09029', '2004.04721', '1905.07791', '2002.04181', '1908.06264',
       '1709.10367', '1909.03582', '1908.06267', '1701.05574', '1907.01468'],
      dtype='object', length=281)


Unnamed: 0,1912.01214,1810.08699,1609.00425,1801.05147,1811.00383,1909.09067,1704.06194,1909.00512,2003.03106,1708.01464,...,1808.09029,2004.04721,1905.07791,2002.04181,1908.06264,1709.10367,1909.03582,1908.06267,1701.05574,1907.01468
title,Cross-lingual Pre-training Based Transfer for ...,pioNER: Datasets and Baselines for Armenian Na...,Identifying Dogmatism in Social Media: Signals...,Adversarial Learning for Chinese NER from Crow...,Addressing word-order Divergence in Multilingu...,A Corpus for Automatic Readability Assessment ...,Improved Neural Relation Detection for Knowled...,How Contextual are Contextualized Word Represe...,Sensitive Data Detection and Classification in...,Massively Multilingual Neural Grapheme-to-Phon...,...,Pyramidal Recurrent Unit for Language Modeling,Translation Artifacts in Cross-lingual Transfe...,Predicting Annotation Difficulty to Improve Ta...,Performance Comparison of Crowdworkers and NLP...,EmotionX-IDEA: Emotion BERT -- an Affectional ...,Structured Embedding Models for Grouped Data,Clickbait? Sensational Headline Generation wit...,Message Passing Attention Networks for Documen...,Harnessing Cognitive Features for Sarcasm Dete...,How we do things with words: Analyzing text as...
abstract,Transfer learning between different language p...,"In this work, we tackle the problem of Armenia...",We explore linguistic and behavioral features ...,"To quickly obtain new labeled data, we can cho...",Transfer learning approaches for Neural Machin...,"In this paper, we present a corpus for use in ...",Relation detection is a core component for man...,Replacing static word embeddings with contextu...,Massive digital data processing provides a wid...,Grapheme-to-phoneme conversion (g2p) is necess...,...,LSTMs are powerful tools for modeling contextu...,Both human and machine translation play a cent...,Modern NLP systems require high-quality annota...,We report results of a comparison of the accur...,"In this paper, we investigate the emotion reco...",Word embeddings are a powerful approach for an...,Sensational headlines are headlines that captu...,Graph neural networks have recently emerged as...,"In this paper, we propose a novel mechanism fo...",In this article we describe our experiences wi...
full_text,"[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...",...,"[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'..."
qas,[{'question': 'which multilingual approaches d...,[{'question': 'what ner models were evaluated?...,[{'question': 'what are the topics pulled from...,[{'question': 'What accuracy does the proposed...,[{'question': 'How do they match words before ...,[{'question': 'Which information about text st...,[{'question': 'On which benchmarks they achiev...,[{'question': 'What experiments are proposed t...,[{'question': 'What is the performance of BERT...,[{'question': 'how is model compactness measur...,...,"[{'question': 'what data did they use?', 'ques...",[{'question': 'What are examples of these arti...,[{'question': 'How much higher quality is the ...,"[{'question': 'Who are the crowdworkers?', 'qu...","[{'question': 'what were the baselines?', 'que...",[{'question': 'Do they evaluate on English onl...,"[{'question': 'What is future work planed?', '...",[{'question': 'Which component is the least im...,[{'question': 'What other evaluation metrics a...,[{'question': 'What approaches do they use tow...
figures_and_tables,"[{'file': '1-Figure1-1.png', 'caption': 'Figur...","[{'file': '2-Figure1-1.png', 'caption': 'Fig. ...","[{'file': '2-Figure1-1.png', 'caption': 'Figur...","[{'file': '3-Figure1-1.png', 'caption': 'Figur...","[{'file': '2-Table1-1.png', 'caption': 'Table ...","[{'file': '3-Table1-1.png', 'caption': 'Table ...","[{'file': '2-Figure1-1.png', 'caption': 'Figur...","[{'file': '5-Figure1-1.png', 'caption': 'Figur...","[{'file': '2-Table1-1.png', 'caption': 'Table ...","[{'file': '3-Table1-1.png', 'caption': 'Table ...",...,"[{'file': '1-Figure1-1.png', 'caption': 'Figur...","[{'file': '4-Table1-1.png', 'caption': 'Table ...","[{'file': '3-Table1-1.png', 'caption': 'Table ...","[{'file': '3-Figure1-1.png', 'caption': 'Figur...","[{'file': '1-Table1-1.png', 'caption': 'Table ...","[{'file': '2-Figure1-1.png', 'caption': 'Figur...","[{'file': '3-Figure1-1.png', 'caption': 'Figur...","[{'file': '5-Table1-1.png', 'caption': 'Table ...","[{'file': '3-Table1-1.png', 'caption': 'Table ...",[]


# **Data Transformation on QAS column**

In [None]:
import json
import pandas as pd
from tqdm import tqdm

def create_dataframe(dataset):
    paper_ids, questions, yes_nos, answers, evidences, highlighted_evidences = [], [], [], [], [], []
    for item in dataset:
        paper_ids.append(item['paper_id'])
        questions.append(item['question'])
        yes_nos.append(item['yes_no'])
        answers.append(item['answer'])
        evidences.append('\n'.join(item['evidence']))
        highlighted_evidences.append('\n'.join(item['highlighted_evidence']))

    df = pd.DataFrame({
        'Paper ID': paper_ids,
        'Question': questions,
        'Yes/No Question': yes_nos,
        'Answer': answers,
        'Evidence': evidences,
        'Highlighted Evidence': highlighted_evidences
    })
    return df

datasets = []
for mode in ['train', 'dev']:  # Adjust these as per the available JSON files
    with open(f'qasper-{mode}-v0.3.json', 'r') as file:
        data = json.load(file)

    dataset = []
    for paper_id, paper_data in tqdm(data.items()):
        for qa in paper_data['qas']:
            question = qa['question']
            for answer in qa['answers']:
                ans = answer['answer']  # Corrected this line to access the answer dictionary directly
                if not ans['unanswerable']:
                    dataset.append({
                        'paper_id': paper_id,
                        'question': question,
                        'yes_no': ans['yes_no'],
                        'answer': ans['free_form_answer'] if ans['free_form_answer'] else "Yes" if ans['yes_no'] else "No",
                        'evidence': ans['evidence'],
                        'highlighted_evidence': ans['highlighted_evidence']
                    })
    datasets.extend(dataset)



100%|██████████| 888/888 [00:00<00:00, 118835.49it/s]
100%|██████████| 281/281 [00:00<00:00, 116095.29it/s]


In [None]:
df


Unnamed: 0,1912.01214,1810.08699,1609.00425,1801.05147,1811.00383,1909.09067,1704.06194,1909.00512,2003.03106,1708.01464,...,1808.09029,2004.04721,1905.07791,2002.04181,1908.06264,1709.10367,1909.03582,1908.06267,1701.05574,1907.01468
title,Cross-lingual Pre-training Based Transfer for ...,pioNER: Datasets and Baselines for Armenian Na...,Identifying Dogmatism in Social Media: Signals...,Adversarial Learning for Chinese NER from Crow...,Addressing word-order Divergence in Multilingu...,A Corpus for Automatic Readability Assessment ...,Improved Neural Relation Detection for Knowled...,How Contextual are Contextualized Word Represe...,Sensitive Data Detection and Classification in...,Massively Multilingual Neural Grapheme-to-Phon...,...,Pyramidal Recurrent Unit for Language Modeling,Translation Artifacts in Cross-lingual Transfe...,Predicting Annotation Difficulty to Improve Ta...,Performance Comparison of Crowdworkers and NLP...,EmotionX-IDEA: Emotion BERT -- an Affectional ...,Structured Embedding Models for Grouped Data,Clickbait? Sensational Headline Generation wit...,Message Passing Attention Networks for Documen...,Harnessing Cognitive Features for Sarcasm Dete...,How we do things with words: Analyzing text as...
abstract,Transfer learning between different language p...,"In this work, we tackle the problem of Armenia...",We explore linguistic and behavioral features ...,"To quickly obtain new labeled data, we can cho...",Transfer learning approaches for Neural Machin...,"In this paper, we present a corpus for use in ...",Relation detection is a core component for man...,Replacing static word embeddings with contextu...,Massive digital data processing provides a wid...,Grapheme-to-phoneme conversion (g2p) is necess...,...,LSTMs are powerful tools for modeling contextu...,Both human and machine translation play a cent...,Modern NLP systems require high-quality annota...,We report results of a comparison of the accur...,"In this paper, we investigate the emotion reco...",Word embeddings are a powerful approach for an...,Sensational headlines are headlines that captu...,Graph neural networks have recently emerged as...,"In this paper, we propose a novel mechanism fo...",In this article we describe our experiences wi...
full_text,"[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...",...,"[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'...","[{'section_name': 'Introduction', 'paragraphs'..."
qas,[{'question': 'which multilingual approaches d...,[{'question': 'what ner models were evaluated?...,[{'question': 'what are the topics pulled from...,[{'question': 'What accuracy does the proposed...,[{'question': 'How do they match words before ...,[{'question': 'Which information about text st...,[{'question': 'On which benchmarks they achiev...,[{'question': 'What experiments are proposed t...,[{'question': 'What is the performance of BERT...,[{'question': 'how is model compactness measur...,...,"[{'question': 'what data did they use?', 'ques...",[{'question': 'What are examples of these arti...,[{'question': 'How much higher quality is the ...,"[{'question': 'Who are the crowdworkers?', 'qu...","[{'question': 'what were the baselines?', 'que...",[{'question': 'Do they evaluate on English onl...,"[{'question': 'What is future work planed?', '...",[{'question': 'Which component is the least im...,[{'question': 'What other evaluation metrics a...,[{'question': 'What approaches do they use tow...
figures_and_tables,"[{'file': '1-Figure1-1.png', 'caption': 'Figur...","[{'file': '2-Figure1-1.png', 'caption': 'Fig. ...","[{'file': '2-Figure1-1.png', 'caption': 'Figur...","[{'file': '3-Figure1-1.png', 'caption': 'Figur...","[{'file': '2-Table1-1.png', 'caption': 'Table ...","[{'file': '3-Table1-1.png', 'caption': 'Table ...","[{'file': '2-Figure1-1.png', 'caption': 'Figur...","[{'file': '5-Figure1-1.png', 'caption': 'Figur...","[{'file': '2-Table1-1.png', 'caption': 'Table ...","[{'file': '3-Table1-1.png', 'caption': 'Table ...",...,"[{'file': '1-Figure1-1.png', 'caption': 'Figur...","[{'file': '4-Table1-1.png', 'caption': 'Table ...","[{'file': '3-Table1-1.png', 'caption': 'Table ...","[{'file': '3-Figure1-1.png', 'caption': 'Figur...","[{'file': '1-Table1-1.png', 'caption': 'Table ...","[{'file': '2-Figure1-1.png', 'caption': 'Figur...","[{'file': '3-Figure1-1.png', 'caption': 'Figur...","[{'file': '5-Table1-1.png', 'caption': 'Table ...","[{'file': '3-Table1-1.png', 'caption': 'Table ...",[]


**Remove Duplicates for the generated Questions and Answers**

In [None]:

# Remove duplicate questions
question_set = set()
unique_datasets = [item for item in datasets if item['question'] not in question_set and not question_set.add(item['question'])]

df = create_dataframe(unique_datasets)
print(df.head())

     Paper ID                                           Question  \
0  1909.00694                          What is the seed lexicon?   
1  1909.00694                              What are the results?   
2  1909.00694      How are relations used to propagate polarity?   
3  1909.00694                      How big is the Japanese data?   
4  1909.00694  What are labels available in dataset for super...   

  Yes/No Question                                             Answer  \
0            None  a vocabulary of positive and negative predicat...   
1            None  Using all data to train: AL -- BiGRU achieved ...   
2            None  based on the relation between events, the sugg...   
3            None  7000000 pairs of events were extracted from th...   
4            None                                                 No   

                                            Evidence  \
0  The seed lexicon consists of positive and nega...   
1  FLOAT SELECTED: Table 3: Performance of var

In [None]:
df.head()

Unnamed: 0,Paper ID,Question,Yes/No Question,Answer,Evidence,Highlighted Evidence
0,1909.00694,What is the seed lexicon?,,a vocabulary of positive and negative predicat...,The seed lexicon consists of positive and nega...,The seed lexicon consists of positive and nega...
1,1909.00694,What are the results?,,Using all data to train: AL -- BiGRU achieved ...,FLOAT SELECTED: Table 3: Performance of variou...,FLOAT SELECTED: Table 3: Performance of variou...
2,1909.00694,How are relations used to propagate polarity?,,"based on the relation between events, the sugg...","In this paper, we propose a simple and effecti...","As illustrated in Figure FIGREF1, our key idea..."
3,1909.00694,How big is the Japanese data?,,7000000 pairs of events were extracted from th...,"As a raw corpus, we used a Japanese web corpus...","As a raw corpus, we used a Japanese web corpus..."
4,1909.00694,What are labels available in dataset for super...,,No,Affective events BIBREF0 are events that typic...,"In this paper, we work on recognizing the pola..."


In [None]:
df.shape

(3051, 6)

In [None]:
!pip install ipdb

Collecting ipdb
  Downloading ipdb-0.13.13-py3-none-any.whl (12 kB)
Installing collected packages: ipdb
Successfully installed ipdb-0.13.13


In [None]:
import json
from tqdm import tqdm
import ipdb

def remove_duplicate(d):
    question_set = set()
    dataset = {}
    counter = 0
    for sample in d:
        if sample['question'] not in question_set:
            question_set.add(sample['question'])
            sample['answer'] = [sample['answer']]
            dataset[sample['question']] = sample
        else:
            dataset[sample['question']]['answer'].append(sample['answer'])
            counter += 1
    print(f'[!] remove {counter} samples and save {len(dataset)} samples')
    dataset = [dataset[key] for key in dataset]
    return dataset

if __name__ == "__main__":
    yes_no_datasets, datasets = [], []
    num_free_form, num_extract, num_yes_no = 0, 0, 0
    for mode in ['train', 'dev']:
        data = json.load(open(f'qasper-{mode}-v0.3.json'))
        yes_no_dataset, dataset = [], []
        for paper_id in tqdm(data):
            paper = data[paper_id]
            for qa in paper['qas']:
                question = qa['question']
                for a in qa['answers']:
                    a = a['answer']
                    if a['unanswerable'] is False:
                        evidence = a['evidence'] + a['highlighted_evidence']
                        if a['free_form_answer']:
                            answer = a['free_form_answer']
                            dataset.append({
                                'question': question,
                                'answer': answer,
                                'evidence': '\n'.join(evidence),
                                'yes_no': False,
                                'paper_id': paper_id
                            })
                            num_free_form += 1
                        elif a['extractive_spans']:
                            answer = ''.join([f'* {i}\n' for idx, i in enumerate(a['extractive_spans'])])
                            answer = 'The answers are shown as follows:\n' + answer
                            dataset.append({
                                'question': question,
                                'answer': answer,
                                'evidence': '\n'.join(evidence),
                                'yes_no': False,
                                'paper_id': paper_id
                            })
                            num_extract += 1
                        else:
                            if a['yes_no'] is not None and evidence:
                                if a['yes_no'] is True:
                                    answer = 1
                                else:
                                    answer = 0
                                yes_no_dataset.append({
                                    'question': question,
                                    'answer': answer,
                                    'evidence': '\n'.join(evidence),
                                    'yes_no': True,
                                    'paper_id': paper_id
                                })
                                dataset.append({
                                    'question': question,
                                    'answer': 'Yes.' if answer else 'No.',
                                    'evidence': '\n'.join(evidence),
                                    'yes_no': True,
                                    'paper_id': paper_id
                                })
                                num_yes_no += 1
                            else:
                                # raise Exception(f'[!] something wrong')
                                pass

        print(f'[!] collect {len(dataset)} samples')

        datasets.append(dataset)
        yes_no_datasets.append(yes_no_dataset)
    # train_dataset = datasets[0] + datasets[1]
    train_dataset = datasets[0]
    test_dataset = datasets[1]
    json.dump(train_dataset, open(f'qasper_train_sft.json', 'w'), indent=4, ensure_ascii=False)
    test_dataset = remove_duplicate(test_dataset)
    json.dump(test_dataset, open(f'qasper_test_sft.json', 'w'), indent=4, ensure_ascii=False)




    train_dataset = yes_no_datasets[0] + yes_no_datasets[1]
    test_dataset = yes_no_datasets[1]
    json.dump(train_dataset, open(f'qasper_yes_no_train_sft.json', 'w'), indent=4, ensure_ascii=False)
    json.dump(test_dataset, open(f'qasper_yes_no_test_sft.json', 'w'), indent=4, ensure_ascii=False)


    print(f'[!] Generation: {num_free_form}; Extraction: {num_extract}; Yes or No: {num_yes_no}')

100%|██████████| 888/888 [00:00<00:00, 63977.98it/s]


[!] collect 2314 samples


100%|██████████| 281/281 [00:00<00:00, 42544.11it/s]


[!] collect 1553 samples
[!] remove 678 samples and save 875 samples
[!] Generation: 1053; Extraction: 2325; Yes or No: 489


In [None]:
import json
from tqdm import tqdm
import pandas as pd  # Added to use DataFrame

def create_dataframe(datasets):
    questions, answers = [], []
    for dataset in datasets:
        for item in dataset:
            questions.append(item['question'])
            answers.append(item['answer'])  # Adjust this line according to the structure of your answers
    df = pd.DataFrame({'Question': questions, 'Answer': answers})
    return df

if __name__ == "__main__":
    yes_no_datasets, datasets = [], []
    num_free_form, num_extract, num_yes_no = 0, 0, 0

    for mode in ['train', 'dev']:
        data = json.load(open(f'qasper-{mode}-v0.3.json'))
        dataset = []
        for paper_id in tqdm(data):
            paper = data[paper_id]

            for qa in paper['qas']:
                question = qa['question']

                for a in qa['answers']:
                    a = a['answer']
                    if a['unanswerable'] is False:
                        evidence = a['evidence'] + a['highlighted_evidence']
                        answer = None  # Initialize answer

                        if a['free_form_answer']:
                            answer = a['free_form_answer']
                            num_free_form += 1
                        elif a['extractive_spans']:
                            answer = ''.join([f'* {i}\n' for idx, i in enumerate(a['extractive_spans'])])
                            answer = 'The answers are shown as follows:\n' + answer
                            num_extract += 1
                        elif a['yes_no'] is not None and evidence:
                            answer = 'Yes.' if a['yes_no'] else 'No.'
                            num_yes_no += 1

                        if answer:  # Only add if there's an answer
                            dataset.append({
                                'question': question,
                                'answer': answer,
                                'evidence': '\n'.join(evidence),
                                'yes_no': a['yes_no'] if 'yes_no' in a else None,
                                'paper_id': paper_id
                            })

        datasets.append(dataset)
        print(f'[!] collect {len(dataset)} samples')

    # Creating DataFrame for the questions and answers
    df = create_dataframe(datasets)
    print(df.head())  # Print the first few rows of the DataFrame

    # Optionally, you can save this DataFrame to a CSV file
    # df.to_csv('questions_answers.csv', index=False)


100%|██████████| 888/888 [00:00<00:00, 65801.14it/s]


[!] collect 2314 samples


100%|██████████| 281/281 [00:00<00:00, 30080.12it/s]

[!] collect 1553 samples
                                        Question  \
0                      What is the seed lexicon?   
1                      What is the seed lexicon?   
2                          What are the results?   
3  How are relations used to propagate polarity?   
4  How are relations used to propagate polarity?   

                                              Answer  
0  a vocabulary of positive and negative predicat...  
1  The answers are shown as follows:\n* seed lexi...  
2  Using all data to train: AL -- BiGRU achieved ...  
3  based on the relation between events, the sugg...  
4  cause relation: both events in the relation sh...  





In [None]:
df

Unnamed: 0,Question,Answer
0,What is the seed lexicon?,a vocabulary of positive and negative predicat...
1,What is the seed lexicon?,The answers are shown as follows:\n* seed lexi...
2,What are the results?,Using all data to train: AL -- BiGRU achieved ...
3,How are relations used to propagate polarity?,"based on the relation between events, the sugg..."
4,How are relations used to propagate polarity?,cause relation: both events in the relation sh...
...,...,...
3862,What is the best reported system?,The answers are shown as follows:\n* the MILR ...
3863,What cognitive features are used?,"Readability (RED), Number of Words (LEN), Avg..."
3864,What approaches do they use towards text analy...,The answers are shown as follows:\n* Domain ex...
3865,What approaches do they use towards text analy...,Modeling considerations: the variables (both ...


In [None]:
import json
from tqdm import tqdm
import pandas as pd  # Added to use DataFrame

def create_dataframe(datasets):
    questions, answers, paper_ids = [], [], []  # Added paper_ids list
    for dataset in datasets:
        for item in dataset:
            questions.append(item['question'])
            answers.append(item['answer'])
            paper_ids.append(item['paper_id'])  # Added this line to store paper IDs
    df = pd.DataFrame({'Question': questions, 'Answer': answers, 'Paper ID': paper_ids})  # Added 'Paper ID' column
    return df

if __name__ == "__main__":
    yes_no_datasets, datasets = [], []
    num_free_form, num_extract, num_yes_no = 0, 0, 0

    for mode in ['train', 'dev']:
        data = json.load(open(f'qasper-{mode}-v0.3.json'))
        dataset = []
        for paper_id in tqdm(data):
            paper = data[paper_id]

            for qa in paper['qas']:
                question = qa['question']

                for a in qa['answers']:
                    a = a['answer']
                    if a['unanswerable'] is False:
                        evidence = a['evidence'] + a['highlighted_evidence']
                        answer = None  # Initialize answer

                        if a['free_form_answer']:
                            answer = a['free_form_answer']
                            num_free_form += 1
                        elif a['extractive_spans']:
                            answer = ''.join([f'* {i}\n' for idx, i in enumerate(a['extractive_spans'])])
                            answer = 'The answers are shown as follows:\n' + answer
                            num_extract += 1
                        elif a['yes_no'] is not None and evidence:
                            answer = 'Yes.' if a['yes_no'] else 'No.'
                            num_yes_no += 1

                        if answer:  # Only add if there's an answer
                            dataset.append({
                                'question': question,
                                'answer': answer,
                                'evidence': '\n'.join(evidence),
                                'yes_no': a['yes_no'] if 'yes_no' in a else None,
                                'paper_id': paper_id
                            })

        datasets.append(dataset)
        print(f'[!] collect {len(dataset)} samples')

    # Creating DataFrame for the questions, answers, and paper IDs
    df = create_dataframe(datasets)
    print(df.head())  # Print the first few rows of the DataFrame

    # Optionally, you can save this DataFrame to a CSV file
    df.to_csv('questions_answers_paper_ids.csv', index=False)


100%|██████████| 888/888 [00:00<00:00, 51025.32it/s]


[!] collect 2314 samples


100%|██████████| 281/281 [00:00<00:00, 35916.48it/s]

[!] collect 1553 samples
                                        Question  \
0                      What is the seed lexicon?   
1                      What is the seed lexicon?   
2                          What are the results?   
3  How are relations used to propagate polarity?   
4  How are relations used to propagate polarity?   

                                              Answer    Paper ID  
0  a vocabulary of positive and negative predicat...  1909.00694  
1  The answers are shown as follows:\n* seed lexi...  1909.00694  
2  Using all data to train: AL -- BiGRU achieved ...  1909.00694  
3  based on the relation between events, the sugg...  1909.00694  
4  cause relation: both events in the relation sh...  1909.00694  





In [None]:
import json
import pandas as pd
from tqdm import tqdm

def create_dataframe(dataset):
    paper_ids, questions, yes_nos, answers, evidences, highlighted_evidences = [], [], [], [], [], []
    for item in dataset:
        paper_ids.append(item['paper_id'])
        questions.append(item['question'])
        yes_nos.append(item['yes_no'])
        answers.append(item['answer'])
        evidences.append('\n'.join(item['evidence']))
        highlighted_evidences.append('\n'.join(item['highlighted_evidence']))

    df = pd.DataFrame({
        'Paper ID': paper_ids,
        'Question': questions,
        'Yes/No Question': yes_nos,
        'Answer': answers,
        'Evidence': evidences,
        'Highlighted Evidence': highlighted_evidences
    })
    return df

datasets = []
for mode in ['train', 'dev']:  # Adjust these as per the available JSON files
    with open(f'qasper-{mode}-v0.3.json', 'r') as file:
        data = json.load(file)

    dataset = []
    for paper_id, paper_data in tqdm(data.items()):
        for qa in paper_data['qas']:
            question = qa['question']
            for answer in qa['answers']:
                ans = answer['answer']  # Corrected this line to access the answer dictionary directly
                if not ans['unanswerable']:
                    dataset.append({
                        'paper_id': paper_id,
                        'question': question,
                        'yes_no': ans['yes_no'],
                        'answer': ans['free_form_answer'] if ans['free_form_answer'] else "Yes" if ans['yes_no'] else "No",
                        'evidence': ans['evidence'],
                        'highlighted_evidence': ans['highlighted_evidence']
                    })
    datasets.extend(dataset)

# Remove duplicate questions
question_set = set()
unique_datasets = [item for item in datasets if item['question'] not in question_set and not question_set.add(item['question'])]

df = create_dataframe(unique_datasets)
print(df.head())

# Save to CSV
# df.to_csv('qa_dataset.csv', index=False)


100%|██████████| 888/888 [00:00<00:00, 245148.55it/s]
100%|██████████| 281/281 [00:00<00:00, 5116.18it/s]

     Paper ID                                           Question  \
0  1909.00694                          What is the seed lexicon?   
1  1909.00694                              What are the results?   
2  1909.00694      How are relations used to propagate polarity?   
3  1909.00694                      How big is the Japanese data?   
4  1909.00694  What are labels available in dataset for super...   

  Yes/No Question                                             Answer  \
0            None  a vocabulary of positive and negative predicat...   
1            None  Using all data to train: AL -- BiGRU achieved ...   
2            None  based on the relation between events, the sugg...   
3            None  7000000 pairs of events were extracted from th...   
4            None                                                 No   

                                            Evidence  \
0  The seed lexicon consists of positive and nega...   
1  FLOAT SELECTED: Table 3: Performance of var




In [None]:
df

Unnamed: 0,Paper ID,Question,Yes/No Question,Answer,Evidence,Highlighted Evidence
0,1909.00694,What is the seed lexicon?,,a vocabulary of positive and negative predicat...,The seed lexicon consists of positive and nega...,The seed lexicon consists of positive and nega...
1,1909.00694,What are the results?,,Using all data to train: AL -- BiGRU achieved ...,FLOAT SELECTED: Table 3: Performance of variou...,FLOAT SELECTED: Table 3: Performance of variou...
2,1909.00694,How are relations used to propagate polarity?,,"based on the relation between events, the sugg...","In this paper, we propose a simple and effecti...","As illustrated in Figure FIGREF1, our key idea..."
3,1909.00694,How big is the Japanese data?,,7000000 pairs of events were extracted from th...,"As a raw corpus, we used a Japanese web corpus...","As a raw corpus, we used a Japanese web corpus..."
4,1909.00694,What are labels available in dataset for super...,,No,Affective events BIBREF0 are events that typic...,"In this paper, we work on recognizing the pola..."
...,...,...,...,...,...,...
3046,1701.05574,What is the best reported system?,,Gaze Sarcasm using Multi Instance Logistic Reg...,FLOAT SELECTED: Table 3: Classification result...,FLOAT SELECTED: Table 3: Classification result...
3047,1701.05574,What cognitive features are used?,,"Readability (RED), Number of Words (LEN), Avg...",FLOAT SELECTED: Table 2: The complete set of f...,FLOAT SELECTED: Table 2: The complete set of f...
3048,1907.01468,What approaches do they use towards text analy...,,No,This contrasts with much of the work in comput...,The approaches we use and what we mean by `suc...
3049,1907.01468,Do they demonstrate why interdisciplinary insi...,False,No,,


In [None]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)

print("Train DataFrame shape:", train_df.shape)
print("Test DataFrame shape:", test_df.shape)


Train DataFrame shape: (2593, 6)
Test DataFrame shape: (458, 6)


In [None]:
train_df

Unnamed: 0,Paper ID,Question,Yes/No Question,Answer,Evidence,Highlighted Evidence
809,1612.05310,Do they experiment with the dataset?,True,Yes,The overall Total Accuracy score reported in t...,The overall Total Accuracy score reported in t...
56,1904.09678,how is quality measured?,,Accuracy and the macro-F1 (averaged F1 over po...,FLOAT SELECTED: Table 1: Comparison of manuall...,FLOAT SELECTED: Table 1: Comparison of manuall...
2887,2002.07306,Is the system tested on low-resource languages?,True,Yes,We evaluate our approach for six target langua...,"French and Russian, and Arabic can be regarded..."
1590,1710.11154,Did they experiment on this corpus?,False,No,,
1373,1910.02339,What is the performance proposed model achieve...,,Operation accuracy: 71.89\nExecution accuracy:...,"Given a natural-language math problem, we need...",Our model outperforms both the original seq2pr...
...,...,...,...,...,...,...
1638,2003.08370,What classifiers were used in experiments?,,No,The Bi-LSTM model consists of a Bi-LSTM layer ...,The Bi-LSTM model consists of a Bi-LSTM layer ...
1095,2002.02562,How big is LibriSpeech dataset?,,No,We evaluated the proposed model using the publ...,We evaluated the proposed model using the publ...
1130,2003.00639,How does framework automatically chooses diffe...,,No,The adaptive multi-curricula learning framewor...,The adaptive multi-curricula learning framewor...
1294,1904.04019,In which domains is sarcasm conveyed in differ...,,No,We now discuss the relations among the results...,"From the in-corpus experiments, we obtain good..."


In [None]:
test_df

Unnamed: 0,Paper ID,Question,Yes/No Question,Answer,Evidence,Highlighted Evidence
270,1910.11769,How many annotators were there?,,No,We required all annotators have a `master' MTu...,Each passage was labelled by 3 unique annotat...
544,1606.05320,What kind of information do the HMMs learn tha...,,The HMM can identify punctuation or pick up on...,We interpret the HMM and LSTM states in the hy...,We see some examples where the HMM and LSTM co...
2494,1707.06806,Which pretrained word vectors did they use?,,No,"Since the input of our method is textual data,...","Since the input of our method is textual data,..."
554,1809.10644,what was the baseline?,,No,All of our results are produced from 10-fold c...,We trained a logistic regression baseline mode...
2322,1802.07862,Does their NER model learn NER from both text ...,True,Yes,(proposed) Bi-LSTM/CRF + Bi-CharLSTM with moda...,(proposed) Bi-LSTM/CRF + Bi-CharLSTM with moda...
...,...,...,...,...,...,...
2308,1912.03804,How id Depechemood trained?,,By multiplying crowd-annotated document-emotio...,Depechemood is a lexicon-based emotion detecti...,Depechemood is a lexicon-based emotion detecti...
70,1712.09127,Which GAN do they use?,,No,"We assume that for each corpora INLINEFORM0 , ...","We assume that for each corpora INLINEFORM0 , ..."
1709,1910.06061,Did they evaluate against baseline?,True,Yes,Our contributions are as follows: We propose t...,We evaluate our newly proposed models and rela...
2909,1909.01383,by how much did the BLEU score improve?,,On average 0.64,The BLEU scores are provided in Table TABREF24...,The BLEU scores are provided in Table TABREF24...


In [None]:
# Save to CSV
# df.to_csv('qa_dataset.csv', index=False)

In [None]:
import requests
import pandas as pd

# Define the URL
url = "https://datasets-server.huggingface.co/first-rows?dataset=allenai%2Fqasper&config=qasper&split=train"

# Make the GET request
response = requests.get(url)
response.raise_for_status()  # This will raise an exception for HTTP errors

# Convert the returned JSON data into a DataFrame
data = response.json()
df = pd.DataFrame(data)

# Optionally, print the first few rows to verify
print(df.head())

# Save to CSV or any other format if needed
# df.to_csv('filename.csv', index=False)


ValueError: ignored

In [None]:
df = pd.DataFrame(data['rows'])
df

Unnamed: 0,row_idx,row,truncated_cells
0,0,"{'id': '1909.00694', 'title': 'Minimally Super...",[]
1,1,"{'id': '2003.07723', 'title': 'PO-EMO: Concept...",[]
2,2,"{'id': '1705.09665', 'title': 'Community Ident...",[]
3,3,"{'id': '1908.06606', 'title': 'Question Answer...","[abstract, full_text, qas, figures_and_tables]"
4,4,"{'id': '1811.00942', 'title': 'Progress and Tr...","[abstract, full_text, qas, figures_and_tables]"
5,5,"{'id': '1805.02400', 'title': 'Stay On-Topic: ...","[abstract, full_text, qas, figures_and_tables]"
6,6,"{'id': '1907.05664', 'title': 'Saliency Maps G...","[abstract, full_text, qas, figures_and_tables]"
7,7,"{'id': '1910.14497', 'title': 'Probabilistic B...","[abstract, full_text, qas, figures_and_tables]"
8,8,"{'id': '1912.02481', 'title': 'Massive vs. Cur...","[abstract, full_text, qas, figures_and_tables]"
9,9,"{'id': '1810.04528', 'title': 'Is there Gender...","[abstract, full_text, qas, figures_and_tables]"


In [None]:
print(data['rows'][:5])  # Display the first 5 rows


[{'row_idx': 0, 'row': {'id': '1909.00694', 'title': 'Minimally Supervised Learning of Affective Events Using Discourse Relations', 'abstract': 'Recognizing affective events that trigger positive or negative sentiment has a wide range of natural language processing applications but remains a challenging problem mainly because the polarity of an event is not necessarily predictable from its constituent words. In this paper, we propose to propagate affective polarity using discourse relations. Our method is simple and only requires a very small seed lexicon and a large raw corpus. Our experiments using Japanese data show that our method learns affective events effectively without manually labeled data. It also improves supervised learning results when labeled data are small.', 'full_text': {'section_name': ['Introduction', 'Related Work', 'Proposed Method', 'Proposed Method ::: Polarity Function', 'Proposed Method ::: Discourse Relation-Based Event Pairs', 'Proposed Method ::: Discourse 

In [None]:
import requests
import pandas as pd

# Define the URL
url = "https://datasets-server.huggingface.co/first-rows?dataset=allenai%2Fqasper&config=qasper&split=train"

# Make the GET request
response = requests.get(url)
response.raise_for_status()  # This will raise an exception for HTTP errors

# Convert the returned JSON data into a DataFrame
data = response.json()
df = pd.DataFrame(data['rows'])


# Optionally, print the first few rows to verify
print(df.head())

# Save to CSV or any other format if needed
# df.to_csv('filename.csv', index=False)


   row_idx                                                row  \
0        0  {'id': '1909.00694', 'title': 'Minimally Super...   
1        1  {'id': '2003.07723', 'title': 'PO-EMO: Concept...   
2        2  {'id': '1705.09665', 'title': 'Community Ident...   
3        3  {'id': '1908.06606', 'title': 'Question Answer...   
4        4  {'id': '1811.00942', 'title': 'Progress and Tr...   

                                  truncated_cells  
0                                              []  
1                                              []  
2                                              []  
3  [abstract, full_text, qas, figures_and_tables]  
4  [abstract, full_text, qas, figures_and_tables]  


In [None]:
df

Unnamed: 0,row_idx,row,truncated_cells
0,0,"{'id': '1909.00694', 'title': 'Minimally Super...",[]
1,1,"{'id': '2003.07723', 'title': 'PO-EMO: Concept...",[]
2,2,"{'id': '1705.09665', 'title': 'Community Ident...",[]
3,3,"{'id': '1908.06606', 'title': 'Question Answer...","[abstract, full_text, qas, figures_and_tables]"
4,4,"{'id': '1811.00942', 'title': 'Progress and Tr...","[abstract, full_text, qas, figures_and_tables]"
5,5,"{'id': '1805.02400', 'title': 'Stay On-Topic: ...","[abstract, full_text, qas, figures_and_tables]"
6,6,"{'id': '1907.05664', 'title': 'Saliency Maps G...","[abstract, full_text, qas, figures_and_tables]"
7,7,"{'id': '1910.14497', 'title': 'Probabilistic B...","[abstract, full_text, qas, figures_and_tables]"
8,8,"{'id': '1912.02481', 'title': 'Massive vs. Cur...","[abstract, full_text, qas, figures_and_tables]"
9,9,"{'id': '1810.04528', 'title': 'Is there Gender...","[abstract, full_text, qas, figures_and_tables]"


In [None]:
from datasets import load_dataset
import pandas as pd

# Load the dataset
dataset = load_dataset("allenai/qasper")

# Convert the 'train' split to a DataFrame (you can do the same for 'validation' or 'test' if available)
df_train = pd.DataFrame(dataset['train'])

# Display the first few rows
print(df_train.head())


           id                                              title  \
0  1909.00694  Minimally Supervised Learning of Affective Eve...   
1  2003.07723  PO-EMO: Conceptualization, Annotation, and Mod...   
2  1705.09665  Community Identity and User Engagement in a Mu...   
3  1908.06606  Question Answering based Clinical Text Structu...   
4  1811.00942   Progress and Tradeoffs in Neural Language Models   

                                            abstract  \
0  Recognizing affective events that trigger posi...   
1  Most approaches to emotion analysis regarding ...   
2  A community's identity defines and shapes its ...   
3  Clinical text structuring is a critical and fu...   
4  In recent years, we have witnessed a dramatic ...   

                                           full_text  \
0  {'section_name': ['Introduction', 'Related Wor...   
1  {'section_name': ['', ' ::: ', ' :::  ::: ', '...   
2  {'section_name': ['Introduction', 'A typology ...   
3  {'section_name': ['Introduc

In [None]:
print(dataset['train'][0])  # This will print the first example from the training set.


{'id': '1909.00694', 'title': 'Minimally Supervised Learning of Affective Events Using Discourse Relations', 'abstract': 'Recognizing affective events that trigger positive or negative sentiment has a wide range of natural language processing applications but remains a challenging problem mainly because the polarity of an event is not necessarily predictable from its constituent words. In this paper, we propose to propagate affective polarity using discourse relations. Our method is simple and only requires a very small seed lexicon and a large raw corpus. Our experiments using Japanese data show that our method learns affective events effectively without manually labeled data. It also improves supervised learning results when labeled data are small.', 'full_text': {'section_name': ['Introduction', 'Related Work', 'Proposed Method', 'Proposed Method ::: Polarity Function', 'Proposed Method ::: Discourse Relation-Based Event Pairs', 'Proposed Method ::: Discourse Relation-Based Event Pa

In [None]:


# Extract relevant fields
ids = [item['id'] for item in dataset['train']]
titles = [item['title'] for item in dataset['train']]
abstracts = [item['abstract'] for item in dataset['train']]


full_text = [item['full_text'] for item in dataset['train']]

qas = [item['qas'] for item in dataset['train']]

figures_and_tables = [item['figures_and_tables'] for item in dataset['train']]


# Construct a DataFrame
df = pd.DataFrame({
    'id': ids,
    'title': titles,
    'abstract': abstracts,
    'full_text': full_text,
    'qas': qas,
    'figures_and_tables': figures_and_tables
})

df


Unnamed: 0,id,title,abstract,full_text,qas,figures_and_tables
0,1909.00694,Minimally Supervised Learning of Affective Eve...,Recognizing affective events that trigger posi...,"{'section_name': ['Introduction', 'Related Wor...","{'question': ['What is the seed lexicon?', 'Wh...",{'caption': ['Figure 1: An overview of our met...
1,2003.07723,"PO-EMO: Conceptualization, Annotation, and Mod...",Most approaches to emotion analysis regarding ...,"{'section_name': ['', ' ::: ', ' ::: ::: ', '...",{'question': ['Does the paper report macro F1?...,{'caption': ['Figure 1: Temporal distribution ...
2,1705.09665,Community Identity and User Engagement in a Mu...,A community's identity defines and shapes its ...,"{'section_name': ['Introduction', 'A typology ...",{'question': ['Do they report results only on ...,{'caption': ['Figure 1: A: Within a community ...
3,1908.06606,Question Answering based Clinical Text Structu...,Clinical text structuring is a critical and fu...,"{'section_name': ['Introduction', 'Related Wor...",{'question': ['What data is the language model...,{'caption': ['Fig. 1. An illustrative example ...
4,1811.00942,Progress and Tradeoffs in Neural Language Models,"In recent years, we have witnessed a dramatic ...","{'section_name': ['Introduction', 'Background ...",{'question': ['What aspects have been compared...,{'caption': ['Table 1: Comparison of neural la...
...,...,...,...,...,...,...
883,1702.03274,Hybrid Code Networks: practical and efficient ...,End-to-end learning of recurrent neural networ...,"{'section_name': ['Introduction', 'Model descr...",{'question': ['Does the latent dialogue state ...,{'caption': ['Figure 1: Operational loop. Trap...
884,1610.03112,Leveraging Recurrent Neural Networks for Multi...,Social norms are shared rules that govern and ...,{'section_name': ['Introduction and Related Wo...,{'question': ['Does this paper propose a new t...,{'caption': ['Table 1: Statistics of the corpu...
885,1607.03542,Open-Vocabulary Semantic Parsing with both Dis...,Traditional semantic parsers map language onto...,"{'section_name': ['Introduction', 'Open vocabu...",{'question': ['What knowledge base do they use...,{'caption': ['Figure 1: Overview of the compon...
886,1812.10860,Can You Tell Me How to Get Past Sesame Street?...,Natural language understanding has recently se...,"{'section_name': ['Introduction', 'Related Wor...",{'question': ['Do some pretraining objectives ...,{'caption': ['Figure 1: Our common model desig...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  888 non-null    object
 1   title               888 non-null    object
 2   abstract            888 non-null    object
 3   full_text           888 non-null    object
 4   qas                 888 non-null    object
 5   figures_and_tables  888 non-null    object
dtypes: object(6)
memory usage: 41.8+ KB


'full_text' contains dictionaries.
'qas' contains dictionaries.
'figures_and_tables' contains dictionaries.


In [None]:
import json
print()

for column in df.columns:
    if any(isinstance(cell, dict) for cell in df[column]):
        print(f"'{column}' contains dictionaries.")
print()
print('-----------------------------------------------')
print()
df['full_text_string'] = df['full_text'].apply(json.dumps)
duplicates = df[['id', 'title', 'abstract', 'full_text_string']].duplicated()
duplicates


'full_text' contains dictionaries.
'qas' contains dictionaries.
'figures_and_tables' contains dictionaries.

-----------------------------------------------



0      False
1      False
2      False
3      False
4      False
       ...  
883    False
884    False
885    False
886    False
887    False
Length: 888, dtype: bool

# New Section

In [None]:
# Expand the 'question' list from each dictionary into its own series
qas_expanded = df['qas'].apply(lambda x: pd.Series(x['question',]))

# Stack the DataFrame to get a single column of questions and reset the index
qas_df = qas_expanded.stack().reset_index(drop=True).to_frame(name='question')

# Display the resulting DataFrame
print(qas_df)


                                               question
0                             What is the seed lexicon?
1                                 What are the results?
2         How are relations used to propagate polarity?
3                         How big is the Japanese data?
4     What are labels available in dataset for super...
...                                                 ...
2588                     What task do they evaluate on?
2589  Do some pretraining objectives perform better ...
2590  Did the authors try stacking multiple convolut...
2591  How many feature maps are generated for a give...
2592  How does the number of parameters compare to o...

[2593 rows x 1 columns]


In [None]:
qas_df = df[['qas']]
print(qas_df)


                                                   qas
0    {'question': ['What is the seed lexicon?', 'Wh...
1    {'question': ['Does the paper report macro F1?...
2    {'question': ['Do they report results only on ...
3    {'question': ['What data is the language model...
4    {'question': ['What aspects have been compared...
..                                                 ...
883  {'question': ['Does the latent dialogue state ...
884  {'question': ['Does this paper propose a new t...
885  {'question': ['What knowledge base do they use...
886  {'question': ['Do some pretraining objectives ...
887  {'question': ['Did the authors try stacking mu...

[888 rows x 1 columns]
