In [None]:
import json
import os

import pandas as pd

In [None]:
# Story-level metadata table. The loader functions below read its
# 'filename', 'split' and 'origin' columns to decide which CSVs to open.
meta_df = pd.read_csv('story_meta.csv', encoding="utf-8")

In [None]:
# Get Aggregated DataFrame of Questions
# Params:
# - origins (list of strings): story origins to filter by
# - split (string): filter by "train", "test", or "val" split (default "" keeps all splits)
# Returns:
# A single pd.DataFrame with the aggregate questions

def get_question_df(origins=None, split=""):
  """Load and concatenate the question CSVs selected from `meta_df`.

  Args:
    origins: optional list of story origins to keep. ``None`` or an empty
      list keeps every origin.
    split: "train", "test" or "val" to keep a single split; the default ""
      keeps all splits.

  Returns:
    A single DataFrame holding the rows of every selected
    ``<filename>-questions.csv`` file, with ``filename``, ``split`` and
    ``origin`` columns added from the matching metadata row.
    If no story matches the filters, an empty DataFrame with just those
    three columns is returned. If `split` is not one of the accepted
    values, a message is printed and ``None`` is returned.
  """
  if split not in ["", "train", "test", "val"]:
    print('Incorrect split argument: expected "train", "test", "val", or default empty string.')
    return

  # Narrow `filtered_meta` step by step (never restart from `meta_df`) so the
  # split filter and the origin filter apply together when both are given.
  filtered_meta = meta_df
  if split != "":
    filtered_meta = filtered_meta[filtered_meta['split'] == split]
  if origins is not None and len(origins) != 0:
    filtered_meta = filtered_meta[filtered_meta['origin'].isin(origins)]

  def get_q_file(story_name, story_split, story_origin):
    # Questions live under data-by-train-split/questions/<split>/<name>-questions.csv
    path = os.path.join('data-by-train-split', 'questions', story_split, story_name + '-questions.csv')
    df = pd.read_csv(path)
    df['filename'] = story_name + '-questions.csv'
    df['split'] = story_split
    df['origin'] = story_origin
    return df

  qdfs = [
    get_q_file(story_name, story_split, story_origin)
    for story_name, story_split, story_origin in zip(
      filtered_meta['filename'].to_list(),
      filtered_meta['split'].to_list(),
      filtered_meta['origin'].to_list(),
    )
  ]

  # pd.concat raises on an empty list; return an empty frame instead so a
  # filter that matches nothing is not an error.
  if not qdfs:
    return pd.DataFrame(columns=['filename', 'split', 'origin'])

  return pd.concat(qdfs, ignore_index=True)

In [None]:
# Get Aggregated DataFrame of Stories
# Params:
# - origins (list of strings): story origins to filter by
# - split (string): filter by "train", "test", or "val" split (default "" keeps all splits)
# - sent_level (boolean): if True, return sentence-level stories; otherwise return section-level stories
# Returns:
# A single pd.DataFrame with the aggregate story sections or sentences

def get_story_df(origins=None, split="", sent_level=False):
  """Load and concatenate the story CSVs selected from `meta_df`.

  Args:
    origins: optional list of story origins to keep. ``None`` or an empty
      list keeps every origin.
    split: "train", "test" or "val" to keep a single split; the default ""
      keeps all splits.
    sent_level: if True read from ``sentence-stories/``, otherwise from
      ``section-stories/``.

  Returns:
    A single DataFrame holding the rows of every selected
    ``<filename>-story.csv`` file, with ``filename``, ``split`` and
    ``origin`` columns added from the matching metadata row.
    If no story matches the filters, an empty DataFrame with just those
    three columns is returned. If `split` is not one of the accepted
    values, a message is printed and ``None`` is returned.
  """
  if split not in ["", "train", "test", "val"]:
    print('Incorrect split argument: expected "train", "test", "val", or default empty string.')
    return

  # Narrow `filtered_meta` step by step (never restart from `meta_df`) so the
  # split filter and the origin filter apply together when both are given.
  filtered_meta = meta_df
  if split != "":
    filtered_meta = filtered_meta[filtered_meta['split'] == split]
  if origins is not None and len(origins) != 0:
    filtered_meta = filtered_meta[filtered_meta['origin'].isin(origins)]

  level_dir = 'sentence-stories' if sent_level else 'section-stories'

  def get_s_file(story_name, story_split, story_origin):
    # Stories live under data-by-train-split/<level>/<split>/<name>-story.csv
    path = os.path.join('data-by-train-split', level_dir, story_split, story_name + '-story.csv')
    df = pd.read_csv(path)
    # The '-questions.csv' suffix is deliberate: later cells match story rows
    # to question rows by comparing this column with the question frame's
    # 'filename' column, which carries that suffix.
    df['filename'] = story_name + '-questions.csv'
    df['split'] = story_split
    df['origin'] = story_origin
    return df

  sdfs = [
    get_s_file(story_name, story_split, story_origin)
    for story_name, story_split, story_origin in zip(
      filtered_meta['filename'].to_list(),
      filtered_meta['split'].to_list(),
      filtered_meta['origin'].to_list(),
    )
  ]

  # pd.concat raises on an empty list; return an empty frame instead so a
  # filter that matches nothing is not an error.
  if not sdfs:
    return pd.DataFrame(columns=['filename', 'split', 'origin'])

  return pd.concat(sdfs, ignore_index=True)

In [None]:
# Load the stories with the loader's defaults: no origin filter, no split
# filter, section-level files (sent_level=False).
story_df = get_story_df()
story_df

In [None]:
# Inspect column dtypes and non-null counts as read from the CSVs.
story_df.info()

In [None]:
# Let pandas choose the best nullable dtype for each column, then re-check
# the resulting schema.
story_df = story_df.convert_dtypes()
story_df.info()

In [None]:
# Load the questions with the loader's defaults: no origin filter, no split
# filter.
qa_df = get_question_df()
qa_df.head()

In [None]:
# Inspect column dtypes and non-null counts as read from the CSVs.
qa_df.info()

In [None]:
# Force 'cor_section' to plain strings: the export cells below call
# .split(',') on each value to obtain the list of section numbers.
# NOTE(review): astype('str') turns missing values into the literal 'nan',
# which the later int() conversion would reject — confirm the column has
# no missing values.
qa_df['cor_section'] = qa_df['cor_section'].astype('str')

In [None]:
# Let pandas choose the best nullable dtype for each column, then re-check
# the resulting schema.
qa_df = qa_df.convert_dtypes()
qa_df.info()

In [None]:
# Build one record per training context; each record lists the questions
# that point at that context and whose answer text occurs verbatim in it.
# NOTE(review): `train_contexts` and `train_questions_answers` are not
# defined in any visible cell — confirm where they come from, otherwise this
# cell raises NameError on a fresh kernel.
# NOTE(review): ids restart at "00001" for every context, so they are not
# unique across the whole dataset — confirm that is intended.
train_contexts_data = []

for ctx_idx, ctx_text in enumerate(train_contexts):
    # Pair each QA belonging to this context with the offset of its answer.
    located = (
        (item, ctx_text.find(item["answer"]))
        for item in train_questions_answers
        if item["context_index"] == ctx_idx
    )
    # str.find returns -1 when the answer is absent; those QAs are dropped.
    found = [(item, start) for item, start in located if start != -1]

    train_contexts_data.append({
        "context": ctx_text,
        "qas": [
            {
                "id": str(position).zfill(5),
                "is_impossible": False,
                "question": item["question"],
                "answers": [
                    {
                        "text": item["answer"],
                        "answer_start": start,
                    }
                ],
            }
            for position, (item, start) in enumerate(found, start=1)
        ],
    })

train_data = []
train_data.extend(train_contexts_data)

In [None]:
# Dry run of the JSONL export for a single question row, written to
# temp.jsonl: one JSON object per non-missing answer column.
index = 23
row_qa = qa_df.iloc[index]
filename = row_qa["filename"]
# 'cor_section' is a comma-separated string of section numbers.
context_list = list(map(int, row_qa["cor_section"].split(',')))
# The context is the text of every referenced section of the same story.
rows = story_df[(story_df["filename"] == filename) & (story_df["section"].isin(context_list))]
context = " ".join(rows["text"])
question = row_qa["question"]

# `with` guarantees the file is flushed and closed. json.dumps escapes
# quotes, backslashes and control characters, so every line is valid JSON
# (hand-formatting the string broke on any text containing a quote).
with open("temp.jsonl", "w", encoding="utf-8") as output:
    for i in range(1, 6):
        count = "answer" + str(i)
        if pd.isna(row_qa[count]):
            continue
        answer = row_qa[count]
        record = {
            "id": f"{index}_{i}",
            "context": context,
            "question": str(question),
            "answer": str(answer),
        }
        # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
        output.write(json.dumps(record, ensure_ascii=False) + "\n")

In [None]:
# Export every (question, answer) pair to qa.jsonl, one JSON object per line
# with the keys "id", "context", "question" and "answer".


def _single_line(text):
    """Replace every line break in `text` with a single space."""
    return ' '.join(text.splitlines())


# `with` guarantees the file is flushed and closed even if a row fails.
with open("qa.jsonl", "w", encoding="utf-8") as output:
    for index, row_qa in qa_df.iterrows():
        filename = row_qa["filename"]
        # 'cor_section' is a comma-separated string of section numbers.
        context_list = list(map(int, row_qa["cor_section"].split(',')))
        # The context is the text of every referenced section of the same story.
        rows = story_df[(story_df["filename"] == filename) & (story_df["section"].isin(context_list))]
        context = _single_line(" ".join(rows["text"]))
        question = _single_line(row_qa["question"])
        for i in range(1, 6):
            count = "answer" + str(i)
            if pd.isna(row_qa[count]):
                continue
            record = {
                "id": f"{index}_{i}",
                "context": context,
                "question": question,
                "answer": _single_line(row_qa[count]),
            }
            # json.dumps escapes quotes, backslashes and control characters,
            # so each line is valid JSON; escaping only '"' by hand was not.
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            output.write(json.dumps(record, ensure_ascii=False) + "\n")