In [None]:
import pandas as pd
import random
import re
from datasets import load_dataset
from datasets import concatenate_datasets

In [None]:
# Real-world recurring events, written as one "<event name>: <frequency>," entry per line.
# The text is parsed into a dict in the next cell (split on ":"; trailing comma stripped).
# NOTE: "G7 Summit" is listed twice, both times with the same frequency.
data = """Christmas: Once a year,
Diwali: Once a year,
Hanukkah: Once a year,
Ramadan: Once a year,
Easter: Once a year,
Thanksgiving: Once a year,
Valentine's Day: Once a year,
Halloween: Once a year,
Labor Day: Once a year,
Pi Day: Once a year,
Olympic Games: Every 4 years,
FIFA World Cup: Every 4 years,
Super Bowl: Once a year,
Wimbledon: Once a year,
NBA Finals: Once a year,
Tour de France: Once a year,
Rugby World Cup: Every 4 years,
ICC Cricket World Cup: Every 4 years,
Monaco Grand Prix (F1): Once a year,
Solar eclipse: 2 to 5 times a year,
Lunar eclipse: 0 to 3 times a year,
Perseid meteor shower: Once a year,
Monarch butterfly migration: Twice a year,
U.S. Presidential elections: Every 4 years,
UN General Assembly meetings: Once a year,
G7 Summit: Once a year,
U.S. Congressional Election: Every 2 years,
British General Election: Every 5 years,
Indian General Election: Every 5 years,
Oscars: Once a year,
Grammy Awards: Once a year,
Cannes Film Festival: Once a year,
Venice Film Festival: Once a year,
Golden Globe Awards: Once a year,
Tony Awards: Once a year,
Sundance Film Festival: Once a year,
Comet Halley appearance: Every 76 years,
Brood X cicada emergence: Every 17 years,
Leonid meteor storm: Every 33 years,
Hajj pilgrimage: Once a year,
Yom Kippur: Once a year,
Good Friday: Once a year,
Leap Year: Every 4 years,
Amazon Prime Day: Once a year,
Black Friday: Once a year,
Cyber Monday: Once a year,
Mercury transit: About 13 to 14 times per century,
World's Fair: Every 5 years,
TED annual conference: Once a year,
Burning Man Festival: Once a year,
Consumer Electronics Show: Once a year,
Venus Orbital Completion: About every 225 Earth days,
Jupiter Orbital Completion: About every 12 Earth years,
Saturn Orbital Completion: About every 29.5 Earth years,
Mars Orbital Completion: About every 687 Earth days,
Uranus Orbital Completion: About every 84 Earth years,
Neptune Orbital Completion: About every 165 Earth years,
Pluto's Orbital Completion: About every 248 Earth years,
Bastille Day: Once a year,
Guy Fawkes Night: Once a year,
Juneteenth: Once a year,
Chinese New Year: Once a year,
Dragon Boat Festival: Once a year,
Mid-Autumn Festival: Once a year,
Saint Patrick's Day: Once a year,
Cinco de Mayo: Once a year,
Commonwealth Games: Every 4 years,
Asian Games: Every 4 years,
Africa Cup of Nations: Every 2 years,
Copa America: Every 4 years,
Boston Marathon: Once a year,
New York City Marathon: Once a year,
Isle of Man TT: Once a year,
Apple's Product Launch Events: 3 to 4 times a year,
Google I/O: Once a year,
Microsoft Build: Once a year,
Paris Air Show: Every 2 years,
International Motor Show Germany: Every 2 years,
World Economic Forum: Once a year,
Frankfurt Book Fair: Once a year,
National Novel Writing Month: Once a year,
Edinburgh Fringe Festival: Once a year,
Art Basel: Once a year,  
Great Wildebeest Migration: Once a year, 
Navaratri: Twice a year,
Pongal: Once a year,
Passover: Once a year,
Rosh Hashanah: Once a year,
Ashura: Once a year,
Fields Medal in Mathematics Announcement: Every 4 years,
Nobel Prize Announcement: Once a year,
Perihelion (Earth closest to the Sun): Once a year,
Aphelion (Earth farthest from the Sun): Once a year,
Coachella Music Festival: Once a year,
Glastonbury Festival: Once a year,
MTV Video Music Awards: Once a year,
Sanremo Music Festival: Once a year,
Eurovision Song Contest: Once a year,
International Day of Peace: Once a year,
Earth Hour: Once a year,
World Water Day: Once a year,
International Yoga Day: Once a year,
World Health Day: Once a year,
International Women's Day: Once a year,
Equinox: Twice a year,
Oktoberfest in Munich: Once a year,
World Elephant Day: Once a year,
National Puzzle Day: Once a year,
G20 Summit: Once a year,
WTO Ministerial Conference: Every 2 years,
International AIDS Conference: Every 2 years,
Chinese National People's Congress: Every 5 years,
Mexican Presidential Election: Every 6 years,
Australian Federal Election: Every 3 years,
Russian Presidential Election: Every 6 years,
Blue moon: Every 33 months,
UEFA European Football Championship: Every 4 years,
U.S. Census: Every 10 years,
Earth's Orbit completion around the Sun: Once a year,
G7 Summit: Once a year,
San Diego Comic-Con International: Once a year,
World Cancer Day: Once a year,
Beaujolais Nouveau Day: Once a year,
International Firefighters' Day: Once a year,
International Workers' Day: Once a year,
Met Gala: Once a year,
Iditarod Trail Sled Dog Race: Once a year,
World Health Assembly: Once a year,
Earth's rotation on its axis: Once a day,
U.S. Supreme Court term: Once a year,
MLB World Series: Once a year,
El Niño: Every 2 to 7 years,
Pulitzer Prizes Announcement: Once a year,
Montreux Jazz Festival: Once a year,
Geneva International Motor Show: Once a year,
Bloomsday: Once a year,
Mother’s Day: Once a year,
Father’s Day: Once a year,
Veterans Day: Once a year,
World Children's Day: Once a year,
Mardi Gras: Once a year,
Day of the Dead: Once a year,
Double Ninth Festival: Once a year,
Qixi Festival: Once a year,
Running of the Bulls: Once a year,
Africa Day: Once a year,
Pan American Games: Every 4 years,
Hungry Ghost Festival: Once a year,
World Environment Day: Once a year"""

In [None]:
# Parse the "<event>: <frequency>," lines of `data` into {event name: frequency phrase}.
# A name that occurs on several lines keeps the frequency of its last line.
event_frequency = {}
lines = data.split("\n")
for line in lines:
    if not line.strip():
        continue  # ignore blank lines
    # Split on the LAST colon only, so an event name that itself contains a
    # colon cannot break the two-way unpacking.
    event, freq = line.rsplit(":", 1)
    # Trim surrounding whitespace first, then the trailing list comma.
    event_frequency[event.strip()] = freq.strip().rstrip(',')

In [None]:
def generate_distractors(correct_frequency):
    """Return two wrong frequency phrases to offer next to the correct one.

    Args:
        correct_frequency: The correct frequency phrase, e.g. "Every 4 years".

    Returns:
        A list of two distinct strings, none equal to ``correct_frequency``.
        An empty list is returned when the phrase neither mentions "year"
        nor contains a number followed by a word (e.g. "Once a day").
    """
    if "year" in correct_frequency:
        unit = "year"
    else:
        # Take the first integer and the word that directly follows it.
        # NOTE(review): for "About every 225 Earth days" that word is
        # "Earth", not "days" -- confirm whether such phrases need
        # dedicated handling.
        match = re.search(r'(\d+)\s*(\w+)', correct_frequency)
        if not match:
            return []
        num = int(match.group(1))
        unit = match.group(2)

    distractors = set()
    if unit == "year":
        while len(distractors) < 2:
            num_years = random.randint(1, 10)
            distractor = f"Every {num_years} years" if num_years > 1 else "Once a year"
            # A distractor identical to the right answer would put the same
            # option on the question twice, so redraw instead.
            if distractor != correct_frequency:
                distractors.add(distractor)
    else:
        # The captured word may already be plural ("months"); do not add
        # a second "s".
        plural_unit = unit if unit.endswith("s") else f"{unit}s"
        # Shift the number upwards by 1-3; the set keeps drawing until two
        # different values have been produced.
        while len(distractors) < 2:
            shifted_num = num + random.randint(1, 3)
            distractors.add(f"Every {shifted_num} {plural_unit}")

    return list(distractors)

def generate_mcq(event, correct_frequency):
    """Assemble a "How often does ... occur?" multiple-choice item.

    Args:
        event: Event name inserted into the question text.
        correct_frequency: The correct frequency phrase for that event.

    Returns:
        A ``(question, options, answer)`` tuple: the question text, the
        shuffled option list, and the 0-based position of the correct
        frequency within that list.
    """
    # Correct answer first, followed by whatever distractors were produced.
    choices = [correct_frequency, *generate_distractors(correct_frequency)]
    random.shuffle(choices)

    prompt = f"How often does {event} occur?"
    # Position of the (first occurrence of the) correct answer after shuffling.
    correct_position = choices.index(correct_frequency)
    return prompt, choices, correct_position

# One (question, options, answer index) triple per real event.
mcq_data = [generate_mcq(name, how_often) for name, how_often in event_frequency.items()]

# Tabulate the triples, then spread each option list over three lettered columns.
df_facts_frequency = pd.DataFrame(mcq_data, columns=["Question", "Options", "Answer"])
for slot, column_name in enumerate(("Option A", "Option B", "Option C")):
    # `slot=slot` freezes the current position inside the lambda; a list that
    # is too short yields None for the missing option.
    df_facts_frequency[column_name] = df_facts_frequency["Options"].apply(
        lambda opts, slot=slot: opts[slot] if slot < len(opts) else None
    )

# Turn the 0-based answer index into its letter (0 -> A, 1 -> B, 2 -> C).
df_facts_frequency["Answer"] = df_facts_frequency["Answer"].apply(lambda idx: chr(ord("A") + idx))
df_facts_frequency = df_facts_frequency.drop(columns="Options")

In [None]:
# Made-up events mapped to a frequency word or phrase.
# Every value used here also appears as a key of `frequency_ranking`, which the
# comparison-question cell uses to decide which of two events is more frequent.
artificial_events = {
    "Blinking of an average person": "always",
    "Visiting the moon": "rarely",
    "Annual gathering of imaginary creatures": "yearly",
    "People dreaming": "always",
    "Discovering a unicorn": "never",
    "Polar bears sunbathing": "rarely",
    "Annual migration of flying pigs": "never",
    "Celebration of the underground gnome festival": "biannually",
    "Charging of magical wands": "sometimes",
    "Martian holidays": "once a decade",
    "Atlantis underwater music festival": "yearly",
    "Reading a book in the dream world": "often",
    "Elf's weekly dance-off": "once a week",
    "Yeti's mountain gathering": "quarterly",
    "Mermaids singing at dawn": "sometimes",
    "Annual invisibility cloak fashion show": "yearly",
    "Dragon flight training": "biweekly",
    "Magic potion tasting": "monthly",
    "Rainbows touching the ground": "rarely",
    "Giant's annual sneeze": "yearly",
    "Leprechaun gold counting": "daily",
    "Gremlin's tech convention": "quarterly",
    "Fairy's midnight ball": "once a week",
    "Time travel expeditions": "never",
    "Aliens watching Earth movies": "sometimes",
    "Centaur's forest marathon": "biennially",
    "Unicorn rainbow races": "monthly",
    "Time machine maintenance": "never",
    "Wizard's duel": "biweekly",
    "Galactic council meetings": "biannually",
    "Flying carpets cleaning": "once a week",
    "Narnia's winter fest": "yearly",
    "Genie's lamp polishing": "sometimes",
    "Quidditch summer league": "yearly",
    "Goblin's treasure hunt": "once a decade",
    "Werewolf transformation therapy": "monthly",
    "Vampire's sunblock application": "always",
    "Phoenix egg hatching": "once a century",
    "Troll's stone turning contest": "yearly",
    "Minotaur's labyrinth puzzle solving": "monthly",
    "Griffin's skydiving event": "biennially",
    "Zombie's peaceful walk in the park": "sometimes",
    "Gorgon's beauty parlor visits": "once a week",
    "Teleportation device trials": "rarely",
    "Chimera's talent show": "biennially",
    "Hydra's heads grooming": "daily",
    "Hippogriff's annual flight show": "yearly",
    "Owls delivering non-magical letters": "rarely",
    "Basilisk's summer shedding": "yearly",
    "Kraken's deep-sea disco": "quarterly",
    "Witches' hat knitting circle": "monthly",
    "Giants' mountaintop yoga": "biweekly",
    "Cyclops' photography workshop": "sometimes",
    "Mummy's annual unwrapping": "never",
    "Sphinx's riddle competition": "once a decade",
    "Oracles' forecast festival": "quarterly",
    "Djinn's dream weaving workshop": "biannually",
    "Banshee's vocal training": "biweekly",
    "Harpy's sky dance": "once a week",
    "Cherub's chocolate festival": "yearly",
    "Naga's underwater basket weaving": "biannually",
    "Shapeshifter's masquerade ball": "monthly",
    "Medusa's hair braiding session": "daily",
    "Brownie's home cleaning service": "always",
    "Satyr's flute festival": "biennially",
    "Kappa's river party": "quarterly",
    "Salamander's fire juggling contest": "once a week",
    "Pixie's starlight serenade": "biannually",
    "Ghoul's storytelling night": "monthly"
}

In [None]:
# Ordinal score for every frequency phrase used by the real and the artificial
# events: a HIGHER score means the event happens MORE often. Only the ordering
# (and equality) of two scores is used when comparing events; the absolute
# numbers carry no meaning.
frequency_ranking = {
    "always": 100,
    "often": 90,
    "daily": 89,
    "Once a day": 89,
    "sometimes": 80,
    "once a week": 75,
    "biweekly": 70,
    "monthly": 65,
    "quarterly": 63,
    "3 to 4 times a year": 62,
    "Twice a year": 61,
    # "biannually" means twice a year ("biennially" is the every-two-years
    # word), so it shares the score of "Twice a year".
    "biannually": 61,
    "2 to 5 times a year": 60,
    # 225 days is shorter than a year, so this recurs more often than a yearly
    # event but less often than a twice-a-year one (~182 days).
    "About every 225 Earth days": 59,
    "Once a year": 58,
    "yearly": 58,
    "0 to 3 times a year": 56,
    "About every 687 Earth days": 54,
    "biennially": 52,
    "Every 2 years": 52,
    "Every 33 months": 51,
    "Every 2 to 7 years": 50,
    "Every 3 years": 49,
    "Every 4 years": 48,
    "Every 5 years": 47,
    "Every 6 years": 46,
    "About 13 to 14 times per century": 45,
    "once a decade": 43,
    "Every 10 years": 43,
    "About every 12 Earth years": 42,
    "Every 17 years": 41,
    "About every 29.5 Earth years": 40,
    "Every 33 years": 39,
    "Every 76 years": 38,
    "About every 84 Earth years": 37,
    "once a century": 36,
    "About every 165 Earth years": 35,
    "About every 248 Earth years": 30,
    "rarely": 10,
    "never": 0,
}

In [None]:
# Event-comparison questions: shared state read by generate_question() below.
# `real_events` is a second name for the very same dict object as
# `event_frequency` (plain assignment, no copy is made).
real_events = event_frequency
# Event pairs (as sorted 2-tuples) that have already been turned into a question.
used_combinations = set()

def generate_question():
    """Create one "which event is more frequent?" comparison question.

    A coin flip decides whether two real or two artificial events are
    compared; for artificial events the frequency wording is included in
    the question text. Pairs already recorded in ``used_combinations`` are
    redrawn, and the accepted pair is added to that set.

    Returns:
        ``[question, option A, option B, option C, answer]`` where answer is
        "A" if the first event ranks higher in ``frequency_ranking``, "B" if
        the second does, and "C" when both ranks are equal.
    """
    def rank_of(name):
        # Real events take precedence when a name exists in both tables.
        if name in real_events:
            phrase = real_events[name]
        else:
            phrase = artificial_events[name]
        return frequency_ranking[phrase]

    # Keep drawing until the (order-independent) pair is a new one.
    while True:
        use_real_events = random.choice([True, False])
        if use_real_events:
            event_a, event_b = random.sample(list(real_events.keys()), 2)
            question = f"Compare the frequency of '{event_a}' and '{event_b}'."
        else:
            event_a, event_b = random.sample(list(artificial_events.keys()), 2)
            desc_a = f"which happens {artificial_events[event_a]}"
            desc_b = f"which happens {artificial_events[event_b]}"
            question = f"Compare the frequency of '{event_a}' ({desc_a}) and '{event_b}' ({desc_b})."

        pair_key = tuple(sorted([event_a, event_b]))
        if pair_key not in used_combinations:
            break
    used_combinations.add(pair_key)

    rank_a = rank_of(event_a)
    rank_b = rank_of(event_b)
    if rank_a == rank_b:
        answer = "C"
    else:
        answer = "A" if rank_a > rank_b else "B"

    return [
        question,
        f"{event_a} is more frequent",
        f"{event_b} is more frequent",
        "Both events are equally frequent",
        answer,
    ]

# Draw 800 comparison questions, tabulate them, drop repeated question texts
# and tag every remaining row with its category.
data = [generate_question() for _ in range(800)]
df_events_frequency_comparison = (
    pd.DataFrame(data, columns=["Question", "Option A", "Option B", "Option C", "Answer"])
    .drop_duplicates(subset=["Question"])
    .assign(Category="Comparison")
)

In [None]:
# Interval between two occurrences, expressed in years, for every frequency
# phrase that has a numeric meaning. Keys start with a lower-case letter.
# "N to M times" ranges use the midpoint of the range; "every 2 to 7 years"
# uses its lower bound.
frequency_to_years = {
    "daily": 1/365,
    "once a day": 1/365,
    "once a week": 1/52,
    "biweekly": 1/26,
    "monthly": 1/12,
    "quarterly": 1/4,
    "3 to 4 times a year": 1/3.5,
    "twice a year": 1/2,
    "2 to 5 times a year": 1/3.5,
    "once a year": 1,
    "yearly": 1,
    # "biannually" means twice a year (every two years is "biennially",
    # listed separately below), so the interval is half a year.
    "biannually": 1/2,
    "0 to 3 times a year": 1/1.5,
    "about every 225 Earth days": 1/365 * 225,
    "about every 687 Earth days": 1/365 * 687,
    "biennially": 2,
    "every 2 years": 2,
    "every 33 months": 33/12,
    "every 2 to 7 years": 2,
    "every 3 years": 3,
    "every 4 years": 4,
    "every 5 years": 5,
    "every 6 years": 6,
    "about 13 to 14 times per century": 100/13.5,
    "once a decade": 10,
    "every 10 years": 10,
    "about every 12 Earth years": 12,
    "every 17 years": 17,
    "about every 29.5 Earth years": 29.5,
    "every 33 years": 33,
    "every 76 years": 76,
    "about every 84 Earth years": 84,
    "once a century": 100,
    "about every 165 Earth years": 165,
    "about every 248 Earth years": 248,
}

# Question texts already emitted by the computation-question generator.
used_combinations = set()

def generate_basic_computation_question():
    """Create one arithmetic question about how often an event recurs.

    A real or an artificial event is picked at random and one of three
    question types is built from its interval in ``frequency_to_years``:
    how many times it occurs in a span of years, the year of its next
    appearance, or the year of its previous appearance. Draws that cannot
    be used (no numeric interval, unsuitable frequency for the question
    type, question text already in ``used_combinations``) are retried.

    Returns:
        A dict with the keys 'Question', 'Option A', 'Option B',
        'Option C' and 'Answer' (the letter of the correct option).
    """
    # Retry in a loop rather than by recursion so that a long streak of
    # rejected draws cannot exhaust the call stack.
    while True:
        event_type = random.choice(["actual", "artificial"])
        if event_type == "actual":
            event, frequency = random.choice(list(real_events.items()))
        else:
            event, frequency = random.choice(list(artificial_events.items()))

        # The table keys start with a lower-case letter ("once a year"),
        # whereas real-event phrases are capitalised ("Once a year").
        # Lower-casing only the first character lines the two up while
        # leaving interior capitals such as "Earth" alone; it also reads
        # correctly in the middle of the question sentence.
        frequency = frequency[:1].lower() + frequency[1:]
        freq_value_in_years = frequency_to_years.get(frequency)
        if not freq_value_in_years:
            continue  # e.g. "always" or "rarely": no numeric interval

        q_type = random.choice(["how_many_times", "next_appearance", "past_appearance"])

        if q_type == "how_many_times":
            if "to" in frequency:  # Avoid frequencies with a range for this type
                continue

            duration = random.choice([5, 10, 20, 25, 50, 75, 100])
            times = int(duration / freq_value_in_years)
            question = f"If '{event}' happens {frequency}, how many times will it occur in {duration} years?"

            # One distractor above and one below the correct count. When the
            # lower one would be negative, use a second value above instead
            # so that all three options stay distinct and non-negative.
            higher = times + random.randint(1, 3)
            lower = times - random.randint(1, 3)
            if lower < 0:
                lower = higher + random.randint(1, 3)
            correct_value = times
            options = [correct_value, higher, lower]
            wording = "It will occur {} times"

        elif q_type == "next_appearance":
            last_seen_year = random.randint(1800, 2023)
            # int() truncates, so intervals shorter than a year stay in the same year.
            correct_value = int(last_seen_year + freq_value_in_years)
            question = f"'{event}' appears {frequency}. If it was last seen in {last_seen_year}, when will it next appear?"
            options = [correct_value, correct_value + random.randint(1, 3), correct_value - random.randint(1, 3)]
            wording = "It will appear in {}"

        else:  # past_appearance
            if "times a year" in frequency or "days" in frequency:  # Avoid non-serious frequencies
                continue

            current_year = random.randint(1800, 2023)
            correct_value = int(current_year - freq_value_in_years)
            question = f"'{event}' appears {frequency}. If it took place in {current_year}, when did it previously occur?"
            options = [correct_value, correct_value + random.randint(1, 3), correct_value - random.randint(1, 3)]
            wording = "It previously occurred in {}"

        if question in used_combinations:
            continue
        used_combinations.add(question)

        random.shuffle(options)
        correct_option = options.index(correct_value)
        choices = [wording.format(opt) for opt in options]

        return {
            'Question': question,
            'Option A': choices[0],
            'Option B': choices[1],
            'Option C': choices[2],
            'Answer': chr(65 + correct_option)  # Convert the index to A, B, or C
        }

# Draw 1200 computation questions (one dict per row), drop repeated question
# texts and tag every remaining row with its category.
data = [generate_basic_computation_question() for _ in range(1200)]
df_event_frequency_computation = (
    pd.DataFrame(data)
    .drop_duplicates(subset=["Question"])
    .assign(Category="Computation")
)

In [None]:
# Question texts already emitted by generate_integrated_reasoning_question().
used_combinations = set()

def generate_integrated_reasoning_question():
    """Create one previously unseen applied-frequency question.

    A template is chosen at random, filled with random values, and the
    correct answer plus two distractors are computed for it. Question
    texts already present in ``used_combinations`` are redrawn; the
    accepted text is added to that set.

    Returns:
        ``(question, option A, option B, option C, answer letter)``.
        Options are ints for the job-contract and comet templates and
        strings for all other templates.
    """
    # Local import: only the plant-bloom template needs calendar arithmetic.
    from datetime import date, timedelta

    question_templates = [
        "If a person's job contract has a renewal every {duration} years, and they started working in {start_year} and renewed it {renewal_times} times without gaps, until what year is their current contract valid?",
        "A solar eclipse happens at least {frequency} times a year. If the first one in {year} is in {month}, in which month can we expect the next one?",
        "If a plant blooms every {days} days and it last bloomed on January 1, on what date will it next bloom?",
        "A comet passes Earth every {years} years. If its last appearance was in {last_seen}, when will it next appear?",
        "If a magazine publishes a special edition every {months} months and the last one was in January, in which month will the next special edition be?",
        "A company holds a general meeting every {quarter} quarters. If the last one was in Q1 of a year, which quarter will the next meeting be?",
        "A species of cicada emerges every {years} years. If they last emerged in {year}, when will they next emerge?",
        "If a leap year occurs every 4 years and the last one was in {year}, when is the next leap year?",
        "A festival is celebrated every {years} years. If it was last celebrated in {year}, when will it next be celebrated?",
        "If a building undergoes maintenance every {months} months and the last maintenance was in January, which month will the next maintenance be?"
    ]

    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

    # Redraw (in a loop, not by recursion) until the question text is new.
    while True:
        template = random.choice(question_templates)

        if "job contract" in template:
            duration = random.choice([2, 3, 4, 5])
            start_year = random.choice(range(1800, 2023))
            renewal_times = random.choice(range(1, 5))
            # The initial term and every renewal each last `duration` years.
            correct_answer = start_year + duration * (renewal_times + 1)
            question = template.format(duration=duration, start_year=start_year, renewal_times=renewal_times)
            options = [correct_answer, correct_answer + random.choice([1, 2, 3]), correct_answer - random.choice([1, 2, 3])]

        elif "solar eclipse" in template:
            frequency = random.choice([2, 3, 4])
            year = random.choice(range(1800, 2030))
            month = random.choice(months)
            # Evenly spaced occurrences: 12 / frequency months apart, wrapping past December.
            correct_answer = months[(months.index(month) + int(12 / frequency)) % 12]
            question = template.format(frequency=frequency, year=year, month=month)
            options = [correct_answer] + random.sample([m for m in months if m != correct_answer], 2)

        elif "plant blooms" in template:
            days = random.choice([15, 30, 45, 60])
            question = template.format(days=days)
            # Real calendar arithmetic, so intervals longer than January roll
            # over into later months instead of giving dates like
            # "January 46". The question names no year; a non-leap year is
            # used as the reference.
            last_bloom = date(2023, 1, 1)

            def as_text(offset_days):
                day = last_bloom + timedelta(days=offset_days)
                return f"{months[day.month - 1]} {day.day}"

            correct_answer = as_text(days)
            # Distractors: two days before and four days after the correct date.
            options = [correct_answer, as_text(days - 2), as_text(days + 4)]

        elif "comet passes Earth" in template:
            years = random.choice([76, 50, 120])
            last_seen = random.choice(range(1800, 2023))
            correct_answer = last_seen + years
            question = template.format(years=years, last_seen=last_seen)
            options = [correct_answer, correct_answer + random.choice([1, 2, 3]), correct_answer - random.choice([1, 2, 3])]

        elif "magazine publishes a special edition" in template or "building undergoes maintenance" in template:
            # Both templates count whole months forward from January.
            frequency_in_months = random.randint(1, 11)
            correct_answer = months[(months.index("January") + frequency_in_months) % 12]
            question = template.format(months=frequency_in_months)
            options = [correct_answer] + random.sample([m for m in months if m != correct_answer], 2)

        elif "company holds a general meeting" in template:
            frequency_in_quarters = random.randint(1, 3)
            # `% 4 or 4` maps a remainder of 0 back to the fourth quarter.
            correct_answer = "Q" + str((1 + frequency_in_quarters) % 4 or 4)
            question = template.format(quarter=frequency_in_quarters)
            options = [correct_answer] + random.sample([q for q in ["Q1", "Q2", "Q3", "Q4"] if q != correct_answer], 2)

        elif "species of cicada emerges" in template:
            frequency_in_years = random.randint(2, 30)
            rand_year = random.randint(1800, 2023)
            next_emergence_year = rand_year + frequency_in_years
            correct_answer = str(next_emergence_year)
            question = template.format(years=frequency_in_years, year=rand_year)
            options = [correct_answer, str(next_emergence_year + random.randint(1, 3)), str(next_emergence_year - random.randint(1, 3))]

        elif "leap year occurs every 4 years" in template:
            rand_year = random.randint(1800, 2023)
            next_leap_year = rand_year + 4
            correct_answer = str(next_leap_year)
            question = template.format(year=rand_year)
            options = [correct_answer, str(next_leap_year + 4), str(next_leap_year - 4)]

        else:  # "festival is celebrated"
            rand_year = random.randint(1800, 2023)
            frequency_in_years = random.randint(1, 25)
            next_festival_year = rand_year + frequency_in_years
            correct_answer = str(next_festival_year)
            question = template.format(years=frequency_in_years, year=rand_year)
            options = [correct_answer, str(next_festival_year + random.randint(1, 3)), str(next_festival_year - random.randint(1, 3))]

        if question not in used_combinations:
            break

    used_combinations.add(question)

    random.shuffle(options)
    # The correct value is tracked explicitly in every branch, so no
    # per-template lookup is needed to find it after shuffling.
    correct_option = options.index(correct_answer)

    return question, options[0], options[1], options[2], chr(65 + correct_option)

# Draw 1000 applied-frequency questions; each 5-tuple becomes one table row.
questions = [list(generate_integrated_reasoning_question()) for _ in range(1000)]

# Tabulate, drop repeated question texts and tag the rows with their category.
df_integrated_frequency = (
    pd.DataFrame(questions, columns=["Question", "Option A", "Option B", "Option C", "Answer"])
    .drop_duplicates(subset=["Question"])
    .assign(Category="Application")
)

In [None]:
def ordinal(n):
    """Return ``n`` followed by its English ordinal suffix, e.g. 2 -> "2nd".

    Numbers whose last two digits lie between 10 and 20 always take "th"
    (11th, 12th, 13th); otherwise the last digit selects "st", "nd", "rd"
    or "th".
    """
    last_two_digits = n % 100
    if last_two_digits < 10 or last_two_digits > 20:
        by_last_digit = {1: 'st', 2: 'nd', 3: 'rd'}
        suffix = by_last_digit.get(n % 10, 'th')
    else:
        suffix = 'th'
    return f"{n}{suffix}"

In [None]:
# Question texts already emitted by generate_abstract_temporal_frequency_questions().
used_combinations = set()

def generate_abstract_temporal_frequency_questions():
    """Create one previously unseen abstract or hypothetical frequency question.

    A template is chosen at random and filled with random values; the
    correct answer and two distractors are computed for it. Question texts
    already present in ``used_combinations`` are redrawn; the accepted text
    is added to that set.

    Returns:
        ``(question, option A, option B, option C, answer letter)``, with
        all three options being distinct strings.
    """
    templates = [
        "On Planet Alpha, 1 day is equivalent to {days} Earth days. How many Earth days elapse between daily events on Alpha?",
        "In a spaceship experiencing time dilation, 1 year inside equates to {years} Earth years. If an event occurs inside every year, how frequently is it observed from Earth?",
        "During a time loop, an Earth day repeats {loops} times. How often would a daily event be observed?",
        "If an athlete trains every {days} days, starting on {days_of_week}, on which day will they train next?",
        "A particular star becomes visible from Earth every {years} years. If it was last observed in {year}, in which year will it next be visible?",
        "A TV series releases a new episode every {days} days. If the inaugural episode was released on a {start_day}, on which day will the {episode_number} episode be released?",
        "A town hosts a carnival every {years} years in {month}. If the preceding carnival was in {year}, when will the subsequent one occur?",
        "If a tree sheds its leaves every {months} months, starting in January, which month will witness the next shedding?"
    ]

    days_of_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

    # Redraw (in a loop, not by recursion) until the question text is new.
    while True:
        template = random.choice(templates)

        if "Planet Alpha" in template:
            days = random.randint(2, 100)
            question = template.format(days=days)
            correct_answer = str(days)
            # Clamp the lower distractor to 1 so that a zero or negative
            # number of days is never offered; days >= 2 keeps it distinct.
            options = [str(days + random.randint(1, 10)), str(max(1, days - random.randint(1, 10))), correct_answer]

        elif "spaceship experiencing time dilation" in template:
            years = random.randint(2, 100)
            question = template.format(years=years)
            correct_answer = str(years)
            # Same clamping as for the Planet Alpha template.
            options = [str(years + random.randint(1, 10)), str(max(1, years - random.randint(1, 10))), correct_answer]

        elif "time loop" in template:
            loops = random.randint(2, 10)
            question = template.format(loops=loops)
            correct_answer = str(loops)
            options = [str(loops + 1), str(loops - 1), correct_answer]

        elif "athlete trains" in template:
            start_day = random.choice(days_of_week)
            day_gap = random.choice(range(1, 8))
            # Weekdays wrap around after Sunday.
            correct_answer = days_of_week[(days_of_week.index(start_day) + day_gap) % 7]
            question = template.format(days=day_gap, days_of_week=start_day)
            options = [correct_answer] + random.sample([day for day in days_of_week if day != correct_answer], 2)

        elif "particular star becomes visible" in template:
            year_last_seen = random.choice(range(1800, 2023))
            visibility_gap = random.choice([20, 50, 75, 100, 150, 200])
            next_visibility_year = year_last_seen + visibility_gap
            question = template.format(years=visibility_gap, year=year_last_seen)
            correct_answer = str(next_visibility_year)
            # Sample two DIFFERENT offsets so the distractors cannot coincide,
            # and keep every option a string like the correct answer.
            offsets = random.sample([-2, -1, 1, 2], 2)
            options = [correct_answer] + [str(next_visibility_year + offset) for offset in offsets]

        elif "TV series releases" in template:
            day_gap = random.choice(range(1, 8))
            start_day = random.choice(days_of_week)
            episode_number = random.randint(2, 10)
            # Episode k is released (k - 1) gaps after the first episode.
            correct_answer = days_of_week[(days_of_week.index(start_day) + (episode_number - 1) * day_gap) % 7]
            question = template.format(days=day_gap, start_day=start_day, episode_number=ordinal(episode_number))
            options = [correct_answer] + random.sample([day for day in days_of_week if day != correct_answer], 2)

        elif "town hosts a carnival" in template:
            year_last_carnival = random.choice(range(1800, 2023))
            carnival_gap = random.choice([2, 3, 4, 5])
            next_carnival_year = year_last_carnival + carnival_gap
            month = random.choice(months)
            question = template.format(years=carnival_gap, month=month, year=year_last_carnival)
            correct_answer = f"{month} {next_carnival_year}"
            # Two different offsets, so the wrong years never coincide.
            offsets = random.sample([-2, -1, 1, 2], 2)
            options = [correct_answer] + [f"{month} {next_carnival_year + offset}" for offset in offsets]

        else:  # "tree sheds its leaves"
            month_gap = random.choice([2, 3, 4, 6])
            # January is index 0, so the gap itself is the index of the next month.
            correct_answer = months[month_gap % 12]
            question = template.format(months=month_gap)
            options = [correct_answer] + random.sample([m for m in months if m != correct_answer], 2)

        if question not in used_combinations:
            break

    used_combinations.add(question)

    random.shuffle(options)
    correct_option = options.index(correct_answer)

    return question, options[0], options[1], options[2], chr(65 + correct_option)

# Draw 1000 abstract frequency questions; each 5-tuple becomes one table row.
questions = [list(generate_abstract_temporal_frequency_questions()) for _ in range(1000)]

# Tabulate, drop repeated question texts and tag the rows with their category.
df_abstract_frequency = (
    pd.DataFrame(questions, columns=["Question", "Option A", "Option B", "Option C", "Answer"])
    .drop_duplicates(subset=["Question"])
    .assign(Category="Application")
)

In [None]:
# Get commonsense questions from the MC-TACO dataset.
# Both TSV files are read without a header row and stacked.
df_mctaco_dev = pd.read_csv('dev_3783.tsv', sep='\t', header=None)
df_mctaco_test = pd.read_csv('test_9442.tsv', sep='\t', header=None)
df_mctaco = pd.concat([df_mctaco_dev, df_mctaco_test])

# Assign column names manually
df_mctaco.columns = ['Sentence', 'Question', 'Answer', 'Correct', 'Type']

# Keep only the rows whose type is 'Frequency'. The explicit copy makes the
# column assignment below operate on an independent frame, not on a slice.
df_mctaco = df_mctaco[df_mctaco['Type'] == 'Frequency'].copy()
df_mctaco['Question'] = df_mctaco['Sentence'] + ' ' + df_mctaco['Question']

questions = df_mctaco['Question'].unique()

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every call.
mctaco_rows = []

# sort=False keeps the questions in order of first appearance.
for question, current_df in df_mctaco.groupby('Question', sort=False):
    correct_rows = current_df[current_df['Correct'] == 'yes']
    incorrect_answers = current_df.loc[current_df['Correct'] == 'no', 'Answer'].tolist()

    # A usable item needs a correct answer and at least two wrong ones.
    if correct_rows.empty or len(incorrect_answers) < 2:
        continue

    correct_answer = correct_rows['Answer'].iloc[0]
    # Randomly select two incorrect answers
    distractors = random.sample(incorrect_answers, 2)
    options = [correct_answer, distractors[0], distractors[1]]
    random.shuffle(options)
    correct_option = 'ABC'[options.index(correct_answer)]
    mctaco_rows.append({'Question': question,
                        'Option A': options[0],
                        'Option B': options[1],
                        'Option C': options[2],
                        'Answer': correct_option})

# Passing `columns` keeps the expected layout even when no row qualified.
new_df_mctaco = pd.DataFrame(mctaco_rows, columns=['Question', 'Option A', 'Option B', 'Option C', 'Answer'])
new_df_mctaco['Category'] = 'Commonsense'

In [None]:
# Load the SQuAD dataset through `load_dataset` from the `datasets` package;
# its 'train' and 'validation' splits are combined further down in this cell.
squad_dataset = load_dataset('squad')

# Empty frame with the same five columns as the other question tables,
# intended to hold the SQuAD-derived questions and options.
squad_df = pd.DataFrame(columns=['Question', 'Option A', 'Option B', 'Option C', 'Answer'])

def is_temporal_frequency_question(question):
    """
    Tell whether a question asks about how often something happens.

    The check is a case-insensitive substring search for a fixed set of
    frequency cue phrases; returns True on the first phrase found and
    False when none is present.
    """
    lowered = question.lower()
    # Cue phrases that mark a frequency question
    frequency_cues = (
        'how often',
        'how many times',
        'how frequently',
        'how frequent',
        'how recurrently',
        'what is the frequency of',
        'how regular',
    )
    for cue in frequency_cues:
        if cue in lowered:
            return True
    return False

def generate_distractors(correct_answer):
    """
    Function to generate distractors for a given correct answer.

    Two distinct phrases are drawn at random from a fixed pool of frequency
    expressions. The correct answer is excluded from the pool first; the
    comparison ignores letter case and surrounding whitespace, so an answer
    such as 'Daily' or ' once a week ' can never come back as a distractor.

    Args:
        correct_answer: The gold answer text for the question.

    Returns:
        A list of two different distractor strings.
    """
    # Define possible distractors (all lowercase, no surrounding whitespace)
    possible_distractors = ['once a day', 'twice a week', 'three times a month',
                            'four times a year', 'every other day', 'once a week',
                            'twice a month', 'three times a year', 'once a month',
                            'every day', 'every week', 'every month', 'every year',
                            'never', 'rarely', 'occasionally', 'frequently', 'always',
                            'daily', 'weekly', 'monthly', 'yearly']
    # Normalise the answer to the pool's form before comparing; an exact match
    # would miss answers that differ only in case or padding.
    normalized_answer = str(correct_answer).strip().lower()
    # Remove the correct answer from possible distractors
    candidates = [d for d in possible_distractors if d != normalized_answer]
    # Randomly select two distractors
    return random.sample(candidates, 2)

# Treat the train and validation splits as one pool of examples.
combined_dataset = concatenate_datasets([squad_dataset['train'], squad_dataset['validation']])

# Process the SQuAD dataset
# Rows are gathered in a list and converted to a DataFrame once at the end;
# calling pd.concat for every row re-copies the whole frame each time and
# starts from an empty frame, which recent pandas versions warn about.
squad_rows = []
for example in combined_dataset:
    question = example['question']
    if not is_temporal_frequency_question(question):
        continue

    answer_texts = example['answers']['text']
    if not answer_texts:
        # No gold answer text: a multiple-choice item cannot be built.
        continue
    # Get the correct answer (the first listed answer text)
    correct_answer = answer_texts[0]

    # Create the full question with context and question
    full_question = example['context'] + ' ' + question
    # Generate distractors
    distractor1, distractor2 = generate_distractors(correct_answer)
    # Randomly assign the options to A, B, C
    options = [correct_answer, distractor1, distractor2]
    random.shuffle(options)
    correct_option = 'ABC'[options.index(correct_answer)]

    squad_rows.append({'Question': full_question,
                       'Option A': options[0],
                       'Option B': options[1],
                       'Option C': options[2],
                       'Answer': correct_option})

# Passing `columns` keeps the expected schema even when no question matched.
squad_df = pd.DataFrame(squad_rows, columns=['Question', 'Option A', 'Option B', 'Option C', 'Answer'])
squad_df['Category'] = 'Reading Comprehension'

In [None]:
# Stack every frequency sub-dataset into one frame with a fresh 0..n-1 index.
df_temporal_frequency = pd.concat(
    [
        df_facts_frequency,
        df_events_frequency_comparison,
        df_event_frequency_computation,
        df_integrated_frequency,
        df_abstract_frequency,
        new_df_mctaco,
        squad_df,
    ],
    ignore_index=True,
)