In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the QASPER
# JSON files read by later cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import pandas as pd
from tqdm import tqdm

# Define the create_dataframe function
def create_dataframe(dataset):
    """Turn flattened QA records into a DataFrame with one row per record.

    Every record must be a mapping providing the keys 'paper_id',
    'paper_title', 'question', 'yes_no', 'answer', 'evidence' and
    'highlighted_evidence'. The two evidence entries are iterables of
    strings and are collapsed into one newline-separated string each.
    The final column combines the paper title and the question into a
    single sentence.

    Returns a DataFrame whose columns are, in order: 'Paper ID',
    'Paper Title', 'Question', 'Yes/No', 'Answer', 'Evidence',
    'Highlighted Evidence', 'Merged Title and Question'.
    """
    # Materialise once so a one-shot iterable can still feed every column.
    records = list(dataset)

    columns = {
        'Paper ID': [rec['paper_id'] for rec in records],
        'Paper Title': [rec['paper_title'] for rec in records],
        'Question': [rec['question'] for rec in records],
        'Yes/No': [rec['yes_no'] for rec in records],
        'Answer': [rec['answer'] for rec in records],
        'Evidence': ['\n'.join(rec['evidence']) for rec in records],
        'Highlighted Evidence': ['\n'.join(rec['highlighted_evidence']) for rec in records],
        'Merged Title and Question': [
            f"In the paper {rec['paper_title']} , ({rec['question']})" for rec in records
        ],
    }
    return pd.DataFrame(columns)

# Load the QASPER train and dev splits and flatten every answerable answer
# into one record per (paper, question, answer).
datasets = []  # one list of records per split, in loop order
for mode in ['train', 'dev']:
    file_path = f'/content/drive/MyDrive/qasper-{mode}-v0.3.json'

    # JSON is UTF-8 by specification; being explicit keeps non-ASCII text
    # intact regardless of the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    dataset = []
    for paper_id, paper_data in tqdm(data.items()):
        # The title has to be read for each paper; otherwise the name is either
        # undefined or carries a value left over from some other paper.
        paper_title = paper_data.get('title', 'No Title')
        for qa in paper_data['qas']:
            question = qa['question']
            for answer in qa['answers']:
                ans = answer['answer']
                if ans['unanswerable']:
                    continue

                # Pick the textual answer. Extractive answers have an empty
                # free-form answer and a null yes/no flag, so they must be
                # handled before the yes/no fallback or they would all be
                # recorded as "No".
                spans = ans.get('extractive_spans')
                if ans['free_form_answer']:
                    answer_text = ans['free_form_answer']
                elif spans:
                    answer_text = ', '.join(spans)
                else:
                    answer_text = "Yes" if ans['yes_no'] else "No"

                dataset.append({
                    'paper_id': paper_id,
                    'paper_title': paper_title,
                    'question': question,
                    'yes_no': ans['yes_no'],
                    'answer': answer_text,
                    'evidence': ans['evidence'],
                    'highlighted_evidence': ans['highlighted_evidence']
                })

    # Store the split exactly once, after all of its papers were processed.
    datasets.append(dataset)


# Define a new function to filter out duplicate questions for each paper
def filter_unique_questions(dataset):
    """Keep only the first record seen for each (paper_id, question) pair.

    Records come back in the order in which their key first appeared in
    `dataset`; later records with an already-seen key are discarded.
    """
    first_seen = {}
    for record in dataset:
        # setdefault leaves an existing entry untouched, so duplicates are ignored.
        first_seen.setdefault((record['paper_id'], record['question']), record)
    return [*first_seen.values()]

# `dataset` is rebound for every split and therefore only holds the split
# that was loaded last; build the table from every split collected in
# `datasets` so that no split is silently dropped.
all_records = [record for split in datasets for record in split]

# Keep one record per (paper, question) pair; this also removes any repeats
# that arise from combining the splits.
unique_dataset = filter_unique_questions(all_records)

# Build the DataFrame from the de-duplicated records and preview it.
unique_df = create_dataframe(unique_dataset)
unique_df.head()  # Display the first few rows of the dataframe

# Save to CSV
# df.to_csv('/content/drive/MyDrive/qa_dataset.csv', index=False)  # Adjust the path as necessary

100%|██████████| 888/888 [00:00<00:00, 103655.29it/s]
100%|██████████| 281/281 [00:00<00:00, 41822.48it/s]


Unnamed: 0,Paper ID,Paper Title,Question,Yes/No,Answer,Evidence,Highlighted Evidence,Merged Title and Question
0,1912.01214,How we do things with words: Analyzing text as...,which multilingual approaches do they compare ...,,No,Table TABREF19 and TABREF26 report zero-shot r...,We compare our approaches with related approac...,In the paper How we do things with words: Anal...
1,1912.01214,How we do things with words: Analyzing text as...,what are the pivot-based baselines?,,No,Table TABREF19 and TABREF26 report zero-shot r...,We compare our approaches with related approac...,In the paper How we do things with words: Anal...
2,1912.01214,How we do things with words: Analyzing text as...,which datasets did they experiment with?,,No,We evaluate our cross-lingual pre-training bas...,We evaluate our cross-lingual pre-training bas...,In the paper How we do things with words: Anal...
3,1912.01214,How we do things with words: Analyzing text as...,what language pairs are explored?,,"De-En, En-Fr, Fr-En, En-Es, Ro-En, En-De, Ar-E...","For MultiUN corpus, we use four languages: Eng...","For MultiUN corpus, we use four languages: Eng...",In the paper How we do things with words: Anal...
4,1810.08699,How we do things with words: Analyzing text as...,what ner models were evaluated?,,No,In this section we describe a number of experi...,In this section we describe a number of experi...,In the paper How we do things with words: Anal...


In [None]:
# Display the de-duplicated QA table built in the previous cell
# (pandas truncates the middle rows when rendering).
unique_df

Unnamed: 0,Paper ID,Paper Title,Question,Yes/No,Answer,Evidence,Highlighted Evidence,Merged Title and Question
0,1912.01214,How we do things with words: Analyzing text as...,which multilingual approaches do they compare ...,,No,Table TABREF19 and TABREF26 report zero-shot r...,We compare our approaches with related approac...,In the paper How we do things with words: Anal...
1,1912.01214,How we do things with words: Analyzing text as...,what are the pivot-based baselines?,,No,Table TABREF19 and TABREF26 report zero-shot r...,We compare our approaches with related approac...,In the paper How we do things with words: Anal...
2,1912.01214,How we do things with words: Analyzing text as...,which datasets did they experiment with?,,No,We evaluate our cross-lingual pre-training bas...,We evaluate our cross-lingual pre-training bas...,In the paper How we do things with words: Anal...
3,1912.01214,How we do things with words: Analyzing text as...,what language pairs are explored?,,"De-En, En-Fr, Fr-En, En-Es, Ro-En, En-De, Ar-E...","For MultiUN corpus, we use four languages: Eng...","For MultiUN corpus, we use four languages: Eng...",In the paper How we do things with words: Anal...
4,1810.08699,How we do things with words: Analyzing text as...,what ner models were evaluated?,,No,In this section we describe a number of experi...,In this section we describe a number of experi...,In the paper How we do things with words: Anal...
...,...,...,...,...,...,...,...,...
940,1701.05574,How we do things with words: Analyzing text as...,What is the best reported system?,,Gaze Sarcasm using Multi Instance Logistic Reg...,FLOAT SELECTED: Table 3: Classification result...,FLOAT SELECTED: Table 3: Classification result...,In the paper How we do things with words: Anal...
941,1701.05574,How we do things with words: Analyzing text as...,What cognitive features are used?,,"Readability (RED), Number of Words (LEN), Avg...",FLOAT SELECTED: Table 2: The complete set of f...,FLOAT SELECTED: Table 2: The complete set of f...,In the paper How we do things with words: Anal...
942,1907.01468,How we do things with words: Analyzing text as...,What approaches do they use towards text analy...,,No,This contrasts with much of the work in comput...,The approaches we use and what we mean by `suc...,In the paper How we do things with words: Anal...
943,1907.01468,How we do things with words: Analyzing text as...,Do they demonstrate why interdisciplinary insi...,False,No,,,In the paper How we do things with words: Anal...


In [None]:
import json
import pandas as pd
from tqdm import tqdm

def create_dataframe(dataset):
    """Turn flattened QA records into a DataFrame with one row per record.

    Every record must be a mapping providing the keys 'paper_id',
    'paper_title', 'question', 'yes_no', 'answer', 'evidence' and
    'highlighted_evidence'. The two evidence entries are iterables of
    strings and are joined with newlines. The paper title is not given a
    column of its own; it only appears inside 'Merged Title and Question'.

    Note: this notebook defines a function of the same name in an earlier
    cell (with a 'Paper Title' column); running this cell rebinds the name.

    Returns a DataFrame whose columns are, in order: 'Paper ID', 'Question',
    'Yes/No Question', 'Answer', 'Evidence', 'Highlighted Evidence',
    'Merged Title and Question'.
    """
    # Materialise once so a one-shot iterable can still feed every column.
    rows = list(dataset)

    def joined(key):
        # Newline-join the list stored under `key` for every record.
        return ['\n'.join(row[key]) for row in rows]

    return pd.DataFrame({
        'Paper ID': [row['paper_id'] for row in rows],
        'Question': [row['question'] for row in rows],
        'Yes/No Question': [row['yes_no'] for row in rows],
        'Answer': [row['answer'] for row in rows],
        'Evidence': joined('evidence'),
        'Highlighted Evidence': joined('highlighted_evidence'),
        # Title and question merged into a single sentence.
        'Merged Title and Question': [
            f"In the paper {row['paper_title']}, {row['question']}" for row in rows
        ],
    })

# Collect the answerable QA records of the train and dev splits into a single
# flat list. The list is created here so the cell does not depend on (or keep
# growing) whatever an earlier cell or an earlier run left in `datasets`.
datasets = []
for mode in ['train', 'dev']:
    file_path = f'/content/drive/MyDrive/qasper-{mode}-v0.3.json'

    # JSON is UTF-8 by specification; being explicit keeps non-ASCII text
    # intact regardless of the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    dataset = []
    for paper_id, paper_data in tqdm(data.items()):
        paper_title = paper_data.get('title', 'No Title')  # fall back when a paper has no 'title' key
        for qa in paper_data['qas']:
            question = qa['question']
            for answer in qa['answers']:
                ans = answer['answer']
                if ans['unanswerable']:
                    continue

                # Pick the textual answer. Extractive answers have an empty
                # free-form answer and a null yes/no flag, so they must be
                # handled before the yes/no fallback or they would all be
                # recorded as "No".
                spans = ans.get('extractive_spans')
                if ans['free_form_answer']:
                    answer_text = ans['free_form_answer']
                elif spans:
                    answer_text = ', '.join(spans)
                else:
                    answer_text = "Yes" if ans['yes_no'] else "No"

                dataset.append({
                    'paper_id': paper_id,
                    'paper_title': paper_title,
                    'question': question,
                    'yes_no': ans['yes_no'],
                    'answer': answer_text,
                    'evidence': ans['evidence'],
                    'highlighted_evidence': ans['highlighted_evidence']
                })

    # Append this split's records to the combined flat list.
    datasets.extend(dataset)

# Drop repeated (paper, question) pairs, keeping the first record of each.
question_set = set()
unique_datasets = []
for item in datasets:
    pair = (item['paper_id'], item['question'])
    if pair in question_set:
        continue  # an earlier record already covers this pair
    question_set.add(pair)
    unique_datasets.append(item)

# Build the table from the de-duplicated records and preview it as text.
df = create_dataframe(unique_datasets)
print(df.head())

# Save to CSV
# df.to_csv('qa_dataset.csv', index=False)


100%|██████████| 888/888 [00:00<00:00, 62302.06it/s]
100%|██████████| 281/281 [00:00<00:00, 31056.64it/s]


     Paper ID                                           Question  \
0  1909.00694                          What is the seed lexicon?   
1  1909.00694                              What are the results?   
2  1909.00694      How are relations used to propagate polarity?   
3  1909.00694                      How big is the Japanese data?   
4  1909.00694  What are labels available in dataset for super...   

  Yes/No Question                                             Answer  \
0            None  a vocabulary of positive and negative predicat...   
1            None  Using all data to train: AL -- BiGRU achieved ...   
2            None  based on the relation between events, the sugg...   
3            None  7000000 pairs of events were extracted from th...   
4            None                                                 No   

                                            Evidence  \
0  The seed lexicon consists of positive and nega...   
1  FLOAT SELECTED: Table 3: Performance of var

In [None]:
# Display the combined, de-duplicated train + dev QA table
# (pandas truncates the middle rows when rendering).
df

Unnamed: 0,Paper ID,Question,Yes/No Question,Answer,Evidence,Highlighted Evidence,Merged Title and Question
0,1909.00694,What is the seed lexicon?,,a vocabulary of positive and negative predicat...,The seed lexicon consists of positive and nega...,The seed lexicon consists of positive and nega...,In the paper Minimally Supervised Learning of ...
1,1909.00694,What are the results?,,Using all data to train: AL -- BiGRU achieved ...,FLOAT SELECTED: Table 3: Performance of variou...,FLOAT SELECTED: Table 3: Performance of variou...,In the paper Minimally Supervised Learning of ...
2,1909.00694,How are relations used to propagate polarity?,,"based on the relation between events, the sugg...","In this paper, we propose a simple and effecti...","As illustrated in Figure FIGREF1, our key idea...",In the paper Minimally Supervised Learning of ...
3,1909.00694,How big is the Japanese data?,,7000000 pairs of events were extracted from th...,"As a raw corpus, we used a Japanese web corpus...","As a raw corpus, we used a Japanese web corpus...",In the paper Minimally Supervised Learning of ...
4,1909.00694,What are labels available in dataset for super...,,No,Affective events BIBREF0 are events that typic...,"In this paper, we work on recognizing the pola...",In the paper Minimally Supervised Learning of ...
...,...,...,...,...,...,...,...
3261,1701.05574,What is the best reported system?,,Gaze Sarcasm using Multi Instance Logistic Reg...,FLOAT SELECTED: Table 3: Classification result...,FLOAT SELECTED: Table 3: Classification result...,In the paper Harnessing Cognitive Features for...
3262,1701.05574,What cognitive features are used?,,"Readability (RED), Number of Words (LEN), Avg...",FLOAT SELECTED: Table 2: The complete set of f...,FLOAT SELECTED: Table 2: The complete set of f...,In the paper Harnessing Cognitive Features for...
3263,1907.01468,What approaches do they use towards text analy...,,No,This contrasts with much of the work in comput...,The approaches we use and what we mean by `suc...,In the paper How we do things with words: Anal...
3264,1907.01468,Do they demonstrate why interdisciplinary insi...,False,No,,,In the paper How we do things with words: Anal...


In [None]:
# Write the QA table to a CSV file in the current working directory,
# leaving out the DataFrame index column.
df.to_csv('new_qa_dataset.csv', index=False)

In [None]:
# Hand the CSV written above to the browser as a download (Colab-only helper).
from google.colab import files
files.download('new_qa_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>