In [None]:
import random
import pandas as pd
import re
from datetime import datetime, timedelta

In [None]:
# Load the raw event table; the next cell selects its 'Year' and 'Event' columns.
df_time_event = pd.read_csv("time_event.csv")

In [None]:
# Narrow the event table to the two columns the question builder reads.
df = df_time_event.loc[:, ['Year', 'Event']]

def _pick_distinct_distractors(draw, answer, count=2, max_attempts=200):
    """Collect `count` distractors that differ from `answer` and from each other.

    Args:
        draw: callable taking the 0-based slot being filled and returning one
            candidate distractor string.
        answer: the correct option; candidates equal to it are rejected.
        count: how many distractors to return.
        max_attempts: upper bound on draws, so the loop always terminates.

    Raises:
        ValueError: if `count` distinct distractors were not found in time.
    """
    chosen = []
    for _ in range(max_attempts):
        candidate = draw(len(chosen))
        if candidate != answer and candidate not in chosen:
            chosen.append(candidate)
            if len(chosen) == count:
                return chosen
    raise ValueError(f"could not build {count} distinct distractors for {answer!r}")


def generate_options(year):
    """Build a three-option multiple-choice row for one 'Year' value.

    The year string is matched against the known formats below; two distractors
    in a matching format are produced by adding a small random offset to the
    parsed number(s), and the original string is mixed in as the correct option.

    Args:
        year: the raw 'Year' cell, e.g. '3150 BCE', 'c. 60–44 BCE', '200s', '2022'.

    Returns:
        [option_a, option_b, option_c, answer_letter] with answer_letter in
        'A'/'B'/'C', or [None, None, None, None] when the value is not a string,
        matches no known format, or cannot be parsed.
    """
    no_options = [None] * 4

    # Missing cells (e.g. NaN) are not strings; `'–' in year` would raise
    # TypeError on them, which the ValueError handler below would not catch.
    if not isinstance(year, str):
        return no_options

    def jitter(spread):
        """Random integer offset in [-spread, spread]."""
        return random.randint(-spread, spread)

    try:
        has_bce = 'BCE' in year
        has_era = 'CE' in year  # also true for 'BCE', which contains 'CE'
        is_circa = 'c.' in year
        is_range = '–' in year

        # Range in BCE with 'c.' prefix (e.g., 'c. 60–44 BCE'). This must be tested
        # before the plain BCE range, which would otherwise claim the string and
        # fail on int('c. 60').
        if is_circa and is_range and has_bce:
            start, end = map(int, year.replace('BCE', '').replace('c.', '').strip().split('–'))
            draw = lambda slot: f"c. {start + jitter(10)}–{end + jitter(10)} BCE"

        # Range in BCE (e.g., '3000–2500 BCE')
        elif is_range and has_bce:
            start, end = map(int, year.replace('BCE', '').strip().split('–'))
            draw = lambda slot: f"{start + jitter(10)}–{end + jitter(10)} BCE"

        # Alternatives in BCE (e.g., '2334 or 2270 BCE'): one distractor per alternative.
        # Indexing by slot keeps the result at exactly two distractors.
        elif ' or ' in year and has_bce:
            bases = [int(part) for part in year.replace('BCE', '').strip().split(' or ')]
            draw = lambda slot: f"{bases[slot % len(bases)] + jitter(10)} BCE"

        # Slash-separated alternatives in BCE (e.g., '470/469 BCE')
        elif '/' in year and has_bce:
            bases = [int(part) for part in year.replace('BCE', '').strip().split('/')]
            draw = lambda slot: f"{bases[slot % len(bases)] + jitter(1)} BCE"

        # Single year with 'c.' prefix in BCE (e.g., 'c. 3200 BCE')
        elif is_circa and has_bce:
            base_year = int(year.replace('c.', '').replace('BCE', '').strip())
            draw = lambda slot: f"c. {base_year + jitter(10)} BCE"

        # Century in simple form (e.g., '200s'): shift by whole centuries
        elif year.endswith('s'):
            base_year = int(year[:-1])
            draw = lambda slot: f"{base_year + jitter(1) * 100}s"

        # Range with 'c.' prefix and no era marker (e.g., 'c. 1–50')
        elif is_circa and is_range and not has_era:
            start, end = map(int, year.replace('c.', '').strip().split('–'))
            draw = lambda slot: f"c. {start + jitter(10)}–{end + jitter(10)}"

        # Range with no era marker (e.g., '100–940')
        elif is_range and not has_era and not is_circa:
            start, end = map(int, year.strip().split('–'))
            draw = lambda slot: f"{start + jitter(10)}–{end + jitter(10)}"

        # Single year with 'c.' prefix and no era marker (e.g., 'c. 500')
        elif is_circa and not is_range and not has_era:
            base_year = int(year.replace('c.', '').strip())
            draw = lambda slot: f"c. {base_year + jitter(20)}"

        # Single year in BCE without 'c.' prefix (e.g., '3150 BCE')
        elif has_bce and not is_circa and not is_range:
            base_year = int(year.replace('BCE', '').strip())
            draw = lambda slot: f"{base_year + jitter(50)} BCE"

        # Plain year (e.g., '2022')
        elif year.isdigit():
            base_year = int(year)
            draw = lambda slot: f"{base_year + jitter(5)}"

        else:
            # Unknown format: no options can be generated
            return no_options

        # A zero offset would reproduce the answer and two draws can coincide,
        # so keep drawing until both distractors are distinct.
        distractors = _pick_distinct_distractors(draw, year)
    except ValueError:
        # Unparseable number, wrong number of range parts, or no distinct distractors
        return no_options

    # Shuffle the options and record which letter holds the correct answer
    options = distractors + [year]
    random.shuffle(options)
    return options + [chr(65 + options.index(year))]

# One [Option A, Option B, Option C, Answer] row per 'Year' value, in row order
options_data = [generate_options(year_value) for year_value in df['Year']]

# Turn the rows into a frame and attach it column-wise to the Year/Event frame
option_columns = ['Option A', 'Option B', 'Option C', 'Answer']
options_df = pd.DataFrame(options_data, columns=option_columns)
new_df = pd.concat([df, options_df], axis=1)

# Phrase each event as a question
new_df['Question'] = [f"In what year(s) did \"{event}\" occur?" for event in new_df['Event']]

In [None]:
# Hand-written options for three rows, keyed by row label:
# [Option A, Option B, Option C, Answer]
manual_option_rows = {
    141: ["100 BCE – 50 CE", "100 BCE – 100 CE", "200 BCE – 200 CE", "B"],
    142: ["100 BCE – 300 CE", "100 BCE – 100 CE", "100 BCE – 200 CE", "A"],
    146: ["c. 60–55 BCE", "c. 60–50 BCE", "c. 60–44 BCE", "C"],
}
manual_option_columns = ['Option A', 'Option B', 'Option C', 'Answer']
for row_label, row_values in manual_option_rows.items():
    new_df.loc[row_label, manual_option_columns] = row_values

In [None]:
# Keep only the question/option columns. .copy() gives an independent frame, so adding
# 'Category' below is not an assignment on a slice of new_df (SettingWithCopyWarning).
df_facts = new_df[['Question', 'Option A', 'Option B', 'Option C', 'Answer']].copy()
df_facts['Category'] = 'Facts'

In [None]:
# Expanded events with typical times (24-hour format)
# Maps an event description to a single integer hour. The comparison-question cell
# compares these integers to decide which of two events happens earlier, and treats
# equal hours as "around the same time".
event_times = {
    "breakfast": 8,
    "lunch": 13,
    "dinner": 20,
    "morning jog": 6,
    "evening walk": 18,
    "afternoon nap": 15,
    "farmer starting their day": 5,
    "baker baking bread": 4,
    "night guard starting their shift": 22,
    "office worker starting their day": 9,
    "matinee show": 14,
    "evening concert": 19,
    "morning market": 7,
    "afternoon parade": 16,
    "sunrise": 6,
    "sunset": 19,
    "morning dew": 5,
    "evening star appearance": 20,
    "morning rush hour": 7,
    "evening rush hour": 17,
    "school start": 8,
    "school end": 15,
    "university lectures start": 9,
    "nightclub opening": 22,
    "library opening": 10,
    "gym peak hours": 18,
    "first train": 5,
    "last train": 23,
    "newspaper delivery": 5,
    "mail delivery": 13,
    "construction work start": 8,
    "bank opening": 9,
    "stock market opening": 9,
    "stock market close": 16,
    "morning prayers": 5,
    "evening prayers": 19,
    "lunch break at offices": 12,
    "TV prime time": 20,
    "movie theater matinee": 13,
    "afternoon tea": 16,
    "bar peak hours": 21,
    "supermarket peak hours": 17,
    "fast food lunch rush": 12,
    "restaurant dinner rush": 20,
    "museum opening": 10,
    "park peak hours": 15,
    "zoo opening": 9,
    "beach peak hours": 14,
    "mountain sunrise view": 6,
    "forest evening ambiance": 18,
    "fishing activity start": 5,
    "campfire typical start": 19,
    "morning yoga": 6,
    "night meditation": 21,
    "gardening peak time": 7,
    "sundown gatherings": 19,
    "flea market opening": 8,
    "carnival evening events": 20,
    # NOTE(review): 24 is used here, while the other midnight entries ("midnight movie
    # screening", "midnight train", "midnight mass") use 0, so they compare as a full
    # day apart — confirm which convention is intended.
    "midnight snack": 24,
    "early bird cafe opening": 5,
    "night owl diner rush": 23,
    "spa opening": 10,
    "winery tour start": 11,
    "midnight movie screening": 0,
    "early morning radio show": 5,
    "lunchtime radio show": 12,
    "drive time radio show": 17,
    "late night radio show": 22,
    "baker's first batch": 4,
    "midnight train": 0,
    "dawn chorus": 4,
    "afternoon siesta": 14,
    "city council meeting": 18,
    "local pub opening": 16,
    "local pub last call": 23,
    "wedding ceremony typical start": 15,
    "funeral typical start": 10,
    "midnight mass": 0,
    "evening mass": 18,
    "ice cream parlor peak time": 14,
    "orchestra evening performance": 20,
    "breakfast at a diner": 6,
    "city park morning joggers": 6,
    "city park evening strollers": 19,
    "evening street performers": 18,
    "morning birdwatching": 6,
    "nighttime stargazing": 22,
    "early morning fish market": 4,
    "typical office coffee break": 10,
    "afternoon office meeting": 15,
    "fireworks display": 21,
    "evening carnival rides": 19,
    "midday town bell": 12,
    "sunrise on a mountain peak": 6,
    "sunset on a beach": 19,
    "night market opening": 18,
    "morning milk delivery": 5,
    "afternoon craft workshops": 14,
    "evening cooking classes": 18,
    "afternoon community gatherings": 15,
    "morning monastery prayers": 5,
    "noon temple bells": 12,
    "dusk temple rituals": 18,
    "evening desert safari": 17,
    "nighttime desert camp": 20,
    "mountain morning trek": 6,
    "afternoon mountain descent": 15,
    "rafting morning trips": 8,
    "night jungle safari": 19,
    "early morning surfing": 6,
    "dusk beach volleyball": 18,
    "evening lighthouse lighting": 19,
    "morning ferry rides": 7,
    "night ferry last ride": 21,
    "morning orchard harvest": 5,
    "evening vineyard tours": 17,
    "morning factory shift": 6,
    "night factory shift": 19
}

In [None]:
qa_data = []
num_questions_to_generate = 300  # Change this to generate desired number of questions

# Build the list of event names once instead of on every loop iteration
event_names = list(event_times)

# The loop below rejects repeated pairs, so it can only finish if enough distinct
# unordered pairs exist; fail fast instead of spinning forever.
max_unique_pairs = len(event_names) * (len(event_names) - 1) // 2
if num_questions_to_generate > max_unique_pairs:
    raise ValueError(
        f"Requested {num_questions_to_generate} questions, but only "
        f"{max_unique_pairs} unique event pairs are available."
    )

seen_combinations = set()

while len(qa_data) < num_questions_to_generate:
    # Randomly pick two different events; their order decides which is option A / B
    event1, event2 = random.sample(event_names, 2)

    # Order-independent key so (x, y) and (y, x) count as the same pair
    canonical_combination = tuple(sorted((event1, event2)))

    # Skip pairs that were already used
    if canonical_combination in seen_combinations:
        continue
    seen_combinations.add(canonical_combination)

    # The event with the smaller hour is the correct answer; equal hours map to option C
    if event_times[event1] < event_times[event2]:
        correct_answer = "A"
    elif event_times[event1] > event_times[event2]:
        correct_answer = "B"
    else:
        correct_answer = "C"

    # Construct the question and options
    question = f"Which event typically happens earlier: {event1} or {event2}?"
    option_a = event1.capitalize()
    option_b = event2.capitalize()
    option_c = "Around the same time"

    qa_data.append([question, option_a, option_b, option_c, correct_answer])

# Create dataframe
df_comparison1 = pd.DataFrame(qa_data, columns=["Question", "Option A", "Option B", "Option C", "Answer"])
df_comparison1['Category'] = 'Comparison'

In [None]:
# Load the statement pairs and give the columns fixed names; the next cell reads them
# as 'statement_1' and 'statement_2'. Assigning .columns requires the file to have
# exactly two columns.
df_typical_time_pair = pd.read_csv("typical_time_pairs.csv")
df_typical_time_pair.columns = ['statement_1', 'statement_2']

In [None]:
questions = []
options_a = []
options_b = []
answers = []

for i, row in df_typical_time_pair.iterrows():
    left_statement = row['statement_1']
    right_statement = row['statement_2']

    # For rows labelled 0-200 the left statement is the correct one; place it under
    # A or B at random. The stdlib `random` module (imported at the top of the
    # notebook) is used because `np` is never imported in this notebook.
    if i < 201:
        if random.random() > 0.5:
            options_a.append(left_statement)
            options_b.append(right_statement)
            answers.append('A')
            questions.append(f"Which statement is more typical in terms of time? A: '{left_statement}' or B: '{right_statement}'?")
        else:
            options_a.append(right_statement)
            options_b.append(left_statement)
            answers.append('B')
            questions.append(f"Which statement is more typical in terms of time? A: '{right_statement}' or B: '{left_statement}'?")
    else:
        # Remaining rows keep their original order and are answered with option C
        options_a.append(left_statement)
        options_b.append(right_statement)
        answers.append('C')
        questions.append(f"Which statement is more typical in terms of time? A: '{left_statement}' or B: '{right_statement}'?")

# Create a new DataFrame with the questions, options, and answers.
# Option C is sized from the generated rows rather than a fixed 300, so a CSV with a
# different number of rows no longer fails with a column-length mismatch.
mcq_df = pd.DataFrame({
    'Question': questions,
    'Option A': options_a,
    'Option B': options_b,
    'Option C': ['Neither statement is typical.'] * len(questions),
    'Answer': answers
})
# Shuffle the rows and renumber them
df_comparison2 = mcq_df.sample(frac=1).reset_index(drop=True)
df_comparison2['Category'] = 'Comparison'

In [None]:
# Get commonsense questions from the MC-TACO dataset
# Read both TSV files (they have no header row) and stack them
df_mctaco_dev = pd.read_csv('dev_3783.tsv', sep='\t', header=None)
df_mctaco_test = pd.read_csv('test_9442.tsv', sep='\t', header=None)
df_mctaco = pd.concat([df_mctaco_dev, df_mctaco_test])

# Assign column names manually
df_mctaco.columns = ['Sentence', 'Question', 'Answer', 'Correct', 'Type']

# Keep only the rows whose 'Type' is 'Typical Time'. .copy() makes the result an
# independent frame, so the column assignment in the next cell does not write to a
# slice of the concatenated frame (SettingWithCopyWarning).
df_mctaco = df_mctaco[df_mctaco['Type'] == 'Typical Time'].copy()

In [None]:
# Prefix each question with its context sentence.
# NOTE: this overwrites 'Question' in place, so re-running this cell without re-running
# the loading cell prepends the sentence a second time.
df_mctaco['Question'] = df_mctaco['Sentence'] + ' ' + df_mctaco['Question']

# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
mctaco_records = []

# sort=False keeps the questions in order of first appearance
for question, current_df in df_mctaco.groupby('Question', sort=False):
    correct_answers = current_df.loc[current_df['Correct'] == 'yes', 'Answer']
    if correct_answers.empty:
        # No correct answer for this question: skip it
        continue
    correct_answer = correct_answers.iloc[0]

    # Distinct incorrect answers, in file order, excluding anything identical to the
    # correct answer, so the three options can never contain duplicates
    incorrect_answers = [
        answer
        for answer in dict.fromkeys(current_df.loc[current_df['Correct'] == 'no', 'Answer'])
        if answer != correct_answer
    ]
    if len(incorrect_answers) < 2:
        # Not enough distractors for a three-option question: skip it
        continue

    # Pick two distractors, shuffle, and record which letter holds the correct answer
    distractors = random.sample(incorrect_answers, 2)
    options = [correct_answer, distractors[0], distractors[1]]
    random.shuffle(options)
    correct_option = 'ABC'[options.index(correct_answer)]
    mctaco_records.append({'Question': question,
                           'Option A': options[0],
                           'Option B': options[1],
                           'Option C': options[2],
                           'Answer': correct_option})

# Passing columns explicitly keeps the expected layout even when no question qualifies
new_df_mctaco = pd.DataFrame(mctaco_records, columns=['Question', 'Option A', 'Option B', 'Option C', 'Answer'])
new_df_mctaco['Category'] = 'Commonsense'

In [None]:
# Load SQuAD dataset
# NOTE(review): `load_dataset` is not imported in any cell visible in this notebook;
# presumably it comes from the Hugging Face `datasets` package — confirm and add the
# import to the import cell, otherwise this line raises NameError on a fresh kernel.
squad_dataset = load_dataset('squad')

# Create a dataframe to store the questions and options
squad_df = pd.DataFrame(columns=['Question', 'Option A', 'Option B', 'Option C', 'Answer'])

def is_temporal_frequency_question(question):
    """
    Return True if the question contains one of the time-related phrases below.

    The check is a case-insensitive substring match against the question text.
    """
    lowered = question.lower()
    time_phrases = (
        'in what era',
        'in what year',
        'at what time',
        'in which year',
        'what year',
        'when was',
        'when did',
        'when were',
        'when does',
        'what time',
        'at what time period',
    )
    for phrase in time_phrases:
        if phrase in lowered:
            return True
    return False

def generate_distractors(correct_answer):
    """
    Pick two random distractors from a fixed pool of dates, years and centuries.

    Any pool entry that is exactly equal to the correct answer is left out
    before sampling, so the returned pair never contains it verbatim.
    """
    # Well-known calendar dates (one of them without a year)
    full_dates = [
        "January 1, 2000",
        "February 29, 1904",
        "March 15",
        "April 4, 1968",
        "May 8, 1945",
        "June 28, 1914",
        "July 20, 1969",
        "August 6, 1945",
        "September 11, 2001",
        "October 31, 1517",
        "November 22, 1963",
        "December 7, 1941",
    ]
    # Every year from 1960 through 2020, as strings
    plain_years = [str(number) for number in range(1960, 2021)]
    # "1st century" through "17th century"
    ordinals = ["1st", "2nd", "3rd"] + [f"{number}th" for number in range(4, 18)]
    centuries = [f"{ordinal} century" for ordinal in ordinals]

    candidates = []
    for entry in full_dates + plain_years + centuries:
        if entry != correct_answer:
            candidates.append(entry)

    # Two distinct pool entries
    return random.sample(candidates, 2)

# NOTE(review): `concatenate_datasets` is not imported in any cell visible in this
# notebook; presumably it comes from the Hugging Face `datasets` package — confirm
# and add the import to the import cell.
combined_dataset = concatenate_datasets([squad_dataset['train'], squad_dataset['validation']])

# Rows are collected in a list and converted to a frame once at the end; concatenating
# a one-row frame per example copies the whole frame every time (quadratic).
squad_records = []

# Process the SQuAD dataset
for example in combined_dataset:
    question = example['question']
    if not is_temporal_frequency_question(question):
        continue

    # Skip examples that carry no answer text rather than failing on [0]
    answer_texts = example['answers']['text']
    if not answer_texts:
        continue
    correct_answer = answer_texts[0]

    # Full prompt = passage followed by the question
    full_question = example['context'] + ' ' + question

    # Two distractors plus the correct answer, in random A/B/C order
    distractor1, distractor2 = generate_distractors(correct_answer)
    options = [correct_answer, distractor1, distractor2]
    random.shuffle(options)
    correct_option = 'ABC'[options.index(correct_answer)]

    squad_records.append({'Question': full_question,
                          'Option A': options[0],
                          'Option B': options[1],
                          'Option C': options[2],
                          'Answer': correct_option})

# Passing columns explicitly keeps the expected layout even when nothing matched
squad_df = pd.DataFrame(squad_records, columns=['Question', 'Option A', 'Option B', 'Option C', 'Answer'])
squad_df['Category'] = 'Reading Comprehension'

In [None]:
# Stack the five question sets (facts, both comparison sets, MC-TACO commonsense and
# SQuAD reading comprehension) row-wise into one frame.
# NOTE(review): ignore_index is not passed, so each part keeps its own row labels and
# the combined index will generally contain repeated labels — confirm nothing
# downstream relies on unique labels.
df_typical_time = pd.concat([df_facts, df_comparison1, df_comparison2, new_df_mctaco, squad_df])