In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import re
import json
from datetime import datetime

In [21]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/athletes_processed.csv"
athletes_df = pd.read_csv(file_path)
athletes_df.head(), athletes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11110 entries, 0 to 11109
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11110 non-null  object
dtypes: object(1)
memory usage: 86.9+ KB


(                                                text
 0  code: 1532872 | name: aleksanyan artur | name_...
 1  code: 1532873 | name: amoyan malkhas | name_tv...
 2  code: 1532874 | name: galstyan slavik | name_t...
 3  code: 1532944 | name: harutyunyan arsen | name...
 4  code: 1532945 | name: tevanyan vazgen | name_t...,
 None)

Parsing the information from my file :)

In [61]:
def parse_info(text):
    pattern = r"(\w+):\s(.*?)\s(?=\w+:|$)"
    matches = re.findall(pattern, text)
    return {key.strip().lower(): value.strip() for key, value in matches}

In [23]:
athletes_df['parsed'] = athletes_df['text'].apply(parse_info)

athletes_df['parsed'].head()

Unnamed: 0,parsed
0,"{'code': '1532872 |', 'name': 'aleksanyan artu..."
1,"{'code': '1532873 |', 'name': 'amoyan malkhas ..."
2,"{'code': '1532874 |', 'name': 'galstyan slavik..."
3,"{'code': '1532944 |', 'name': 'harutyunyan ars..."
4,"{'code': '1532945 |', 'name': 'tevanyan vazgen..."


Removing the | bars I'd previously added. These helped for my readability, but didn't turn out to be as helpful as I'd thought they'd be overall.

In [25]:
def clean_parsed_info(parsed_info):
  cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}
  return cleaned_info

In [26]:
athletes_df['cleaned_parsed'] = athletes_df['parsed'].apply(clean_parsed_info)

athletes_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'code': '1532872', 'name': 'aleksanyan artur'..."
1,"{'code': '1532873', 'name': 'amoyan malkhas', ..."
2,"{'code': '1532874', 'name': 'galstyan slavik',..."
3,"{'code': '1532944', 'name': 'harutyunyan arsen..."
4,"{'code': '1532945', 'name': 'tevanyan vazgen',..."


Function to generate QA dataset - it will contain triplets with "context", "question", and "answer".

In [27]:
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    name = cleaned_data.get("name", "the athlete")

    if "nickname" in cleaned_data and cleaned_data["nickname"]:
        qa_entries.append({
            "context": context,
            "question": f"What is the nickname of {name}?",
            "answer": cleaned_data["nickname"]
        })
    if "disciplines" in cleaned_data and cleaned_data["disciplines"]:
        qa_entries.append({
            "context": context,
            "question": f"What are the disciplines of {name}?",
            "answer": cleaned_data["disciplines"]
        })
    if "events" in cleaned_data and cleaned_data["events"]:
        qa_entries.append({
            "context": context,
            "question": f"What events does {name} compete in?",
            "answer": cleaned_data["events"]
        })
    if "birth_place" in cleaned_data and cleaned_data["birth_place"]:
        qa_entries.append({
            "context": context,
            "question": f"Where was {name} born?",
            "answer": cleaned_data["birth_place"]
        })
    if "birth_date" in cleaned_data and cleaned_data["birth_date"]:
        qa_entries.append({
            "context": context,
            "question": f"When was {name} born?",
            "answer": cleaned_data["birth_date"]
        })
    if "reason" in cleaned_data and cleaned_data["reason"]:
        qa_entries.append({
            "context": context,
            "question": f"Why did {name} choose their sport?",
            "answer": cleaned_data["reason"]
        })
    if "hero" in cleaned_data and cleaned_data["hero"]:
        qa_entries.append({
            "context": context,
            "question": f"Who is {name}'s hero?",
            "answer": cleaned_data["hero"]
        })
    if "philosophy" in cleaned_data and cleaned_data["philosophy"]:
        qa_entries.append({
            "context": context,
            "question": f"What is {name}'s philosophy?",
            "answer": cleaned_data["philosophy"]
        })

    return qa_entries

In [28]:
qa_data = []
for _, row in athletes_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_df = pd.DataFrame(qa_data)
qa_df.head()

Unnamed: 0,context,question,answer
0,code: 1532872 | name: aleksanyan artur | name_...,What is the nickname of aleksanyan artur?,white bear
1,code: 1532872 | name: aleksanyan artur | name_...,What are the disciplines of aleksanyan artur?,['wrestling']
2,code: 1532872 | name: aleksanyan artur | name_...,What events does aleksanyan artur compete in?,"[""men's greco-roman 97kg""]"
3,code: 1532872 | name: aleksanyan artur | name_...,Where was aleksanyan artur born?,gyumri
4,code: 1532872 | name: aleksanyan artur | name_...,When was aleksanyan artur born?,1991-10-21


Cleaning the result :)

In [34]:
def clean_answer(answer):
    if isinstance(answer, str):
        # First I'm removing brackets and quotes
        answer = re.sub(r"[\[\]\"']", "", answer).strip()
        # Then I'm going through and seeing if there are references at the end of the answer, and removing them if there are.
        answer = re.sub(r"\s*\(.*?\)$", "", answer).strip()
    return answer

In [35]:
qa_df['answer'] = qa_df['answer'].apply(clean_answer)

qa_df.head()

Unnamed: 0,context,question,answer
0,code: 1532872 | name: aleksanyan artur | name_...,What is the nickname of aleksanyan artur?,white bear
1,code: 1532872 | name: aleksanyan artur | name_...,What are the disciplines of aleksanyan artur?,wrestling
2,code: 1532872 | name: aleksanyan artur | name_...,What events does aleksanyan artur compete in?,mens greco-roman 97kg
3,code: 1532872 | name: aleksanyan artur | name_...,Where was aleksanyan artur born?,gyumri
4,code: 1532872 | name: aleksanyan artur | name_...,When was aleksanyan artur born?,1991-10-21


In [36]:
output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/athletes_qa.csv"
qa_df.to_csv(output_csv_path, index=False)
print(f"QA data saved to {output_csv_path}")

QA data saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/athletes_qa.csv


#Everything before this point was for cleaning and formatting my data for the athletes dataset. I'll do a very similar process for each other file, which will be done below.

##Coaches

In [37]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/coaches_processed.csv"
coaches_df = pd.read_csv(file_path)
coaches_df.head(), coaches_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11110 entries, 0 to 11109
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            11110 non-null  object
 1   parsed          11110 non-null  object
 2   cleaned_parsed  11110 non-null  object
dtypes: object(3)
memory usage: 260.5+ KB


(                                                text
 0  code: 1533246 | name: pedrero ofelia | gender:...
 1  code: 1535775 | name: radhi shenaishil | gende...
 2  code: 1536055 | name: aflakikhamseh majid | ge...
 3  code: 1536059 | name: yousefy mehrdad | gender...
 4  code: 1536060 | name: maddah minoo | gender: f...,
 None)

Can use the same parse_info and clean_parsed_info functions :) The benefits of having made the processed files very similar in format initially!

In [38]:
coaches_df['parsed'] = coaches_df['text'].apply(parse_info)

coaches_df['parsed'].head()

Unnamed: 0,parsed
0,"{'code': '1533246 |', 'name': 'pedrero ofelia ..."
1,"{'code': '1535775 |', 'name': 'radhi shenaishi..."
2,"{'code': '1536055 |', 'name': 'aflakikhamseh m..."
3,"{'code': '1536059 |', 'name': 'yousefy mehrdad..."
4,"{'code': '1536060 |', 'name': 'maddah minoo |'..."


In [39]:
coaches_df['cleaned_parsed'] = coaches_df['parsed'].apply(clean_parsed_info)

coaches_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'code': '1533246', 'name': 'pedrero ofelia', ..."
1,"{'code': '1535775', 'name': 'radhi shenaishil'..."
2,"{'code': '1536055', 'name': 'aflakikhamseh maj..."
3,"{'code': '1536059', 'name': 'yousefy mehrdad',..."
4,"{'code': '1536060', 'name': 'maddah minoo', 'g..."


Main difference is the generating questions function, simply because each file contains different information which necessitates different questions!

In [40]:
def generate_coach_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    name = cleaned_data.get("name", "the coach")

    if "function" in cleaned_data and cleaned_data["function"]:
        qa_entries.append({
            "context": context,
            "question": f"What is the position of {name}?",
            "answer": cleaned_data["function"]
        })
    if "gender" in cleaned_data and cleaned_data["gender"]:
        qa_entries.append({
            "context": context,
            "question": f"What is the gender of {name}?",
            "answer": cleaned_data["gender"]
        })
    if "disciplines" in cleaned_data and cleaned_data["disciplines"]:
        qa_entries.append({
            "context": context,
            "question": f"What disciplines does {name} coach?",
            "answer": cleaned_data["disciplines"]
        })
    if "events" in cleaned_data and cleaned_data["events"]:
        qa_entries.append({
            "context": context,
            "question": f"What events does {name} coach?",
            "answer": cleaned_data["events"]
        })
    if "birth_date" in cleaned_data and cleaned_data["birth_date"]:
        qa_entries.append({
            "context": context,
            "question": f"When was {name} born?",
            "answer": cleaned_data["birth_date"]
        })

    return qa_entries

In [41]:
qa_coach_data = []
for _, row in coaches_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_coach_qa_entries(cleaned_data)
    qa_coach_data.extend(qa_entries)

qa_coach_df = pd.DataFrame(qa_coach_data)

qa_coach_df.head()

Unnamed: 0,context,question,answer
0,code: 1533246 | name: pedrero ofelia | gender:...,What is the position of pedrero ofelia?,coach
1,code: 1533246 | name: pedrero ofelia | gender:...,What is the gender of pedrero ofelia?,female
2,code: 1533246 | name: pedrero ofelia | gender:...,What disciplines does pedrero ofelia coach?,artistic swimming
3,code: 1533246 | name: pedrero ofelia | gender:...,What events does pedrero ofelia coach?,team
4,code: 1535775 | name: radhi shenaishil | gende...,What is the position of radhi shenaishil?,head coach


Using same clean_answer function for now.. Not sure if it's needed but shouldn't hurt.

In [42]:
qa_coach_df['answer'] = qa_coach_df['answer'].apply(clean_answer)

qa_coach_df.head()

Unnamed: 0,context,question,answer
0,code: 1533246 | name: pedrero ofelia | gender:...,What is the position of pedrero ofelia?,coach
1,code: 1533246 | name: pedrero ofelia | gender:...,What is the gender of pedrero ofelia?,female
2,code: 1533246 | name: pedrero ofelia | gender:...,What disciplines does pedrero ofelia coach?,artistic swimming
3,code: 1533246 | name: pedrero ofelia | gender:...,What events does pedrero ofelia coach?,team
4,code: 1535775 | name: radhi shenaishil | gende...,What is the position of radhi shenaishil?,head coach


In [43]:
output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/coaches_qa.csv"

qa_coach_df.to_csv(output_csv_path, index=False)

print(f"QA data for coaches saved to {output_csv_path}")

QA data for coaches saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/coaches_qa.csv


#Events

Going to try to run it all at once this time instead of in modules. Fingers crossed!

In [44]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/events_processed.csv"
events_df = pd.read_csv(file_path)
events_df.head(), events_df.info()

# Parsing
events_df['parsed'] = events_df['text'].apply(parse_info)

events_df['parsed'].head()


# Cleaning
events_df['cleaned_parsed'] = events_df['parsed'].apply(clean_parsed_info)

events_df['cleaned_parsed'].head()

# Generating questions (honestly not much to it for the events)
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    event = cleaned_data.get("event", "this event")

    if "sport" in cleaned_data and cleaned_data["sport"]:
        qa_entries.append({
            "context": context,
            "question": f"What sport is associated with {event}?",
            "answer": cleaned_data["sport"]
        })

    return qa_entries


qa_data = []
for _, row in events_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_events_df = pd.DataFrame(qa_data)

qa_events_df.head()


qa_events_df['answer'] = qa_events_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/events_qa.csv"
qa_events_df.to_csv(output_csv_path, index=False)
print(f"QA data for events saved to {output_csv_path}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329 entries, 0 to 328
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    329 non-null    object
dtypes: object(1)
memory usage: 2.7+ KB
QA data for events saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/events_qa.csv


#Medallists!

In [51]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/medallists_processed.csv"
medalists_df = pd.read_csv(file_path)  # lol I started off spelling medallists differently than I previously had... We'll just roll with the inconsistency at this point :)
medalists_df.head(), medalists_df.info()

# Parsing
medalists_df['parsed'] = medalists_df['text'].apply(parse_info)

medalists_df['parsed'].head()


# Cleaning
medalists_df['cleaned_parsed'] = medalists_df['parsed'].apply(clean_parsed_info)

medalists_df['cleaned_parsed'].head()

# Generating questions (medallists has a solid amount of information!!)
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    name = cleaned_data.get("name", "this athlete")

    if "medal_type" in cleaned_data and cleaned_data["medal_type"]:
        qa_entries.append({
            "context": context,
            "question": f"What medal did {name} win?",
            "answer": cleaned_data["medal_type"]
        })
        medal_type = cleaned_data["medal_type"]
    if "medal_date" in cleaned_data and cleaned_data["medal_date"]:
        qa_entries.append({
            "context": context,
            "question": f"When did {name} win their {medal_type}?",
            "answer": cleaned_data["medal_date"]
        })
    if "event" in cleaned_data and cleaned_data["event"]:
        qa_entries.append({
            "context": context,
            "question": f"In what event did {name} win a {medal_type}?",
            "answer": cleaned_data["event"]
        })
    if "team" in cleaned_data and cleaned_data["team"]:
        qa_entries.append({
            "context": context,
            "question": f"For which team did {name} win their {medal_type}?",
            "answer": cleaned_data["team"]
        })
    if "discipline" in cleaned_data and cleaned_data["discipline"]:
        qa_entries.append({
            "context": context,
            "question": f"In which discipline did {name} compete when they won a {medal_type}?",
            "answer": cleaned_data["discipline"]
        })

    return qa_entries


qa_data = []
for _, row in medalists_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_medallists_df = pd.DataFrame(qa_data)

qa_medallists_df.head()


qa_medallists_df['answer'] = qa_medallists_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/medallists_qa.csv"
qa_medallists_df.to_csv(output_csv_path, index=False)
print(f"QA data for medallists saved to {output_csv_path}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2315 entries, 0 to 2314
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2315 non-null   object
dtypes: object(1)
memory usage: 18.2+ KB
QA data for medallists saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/medallists_qa.csv


#Medals!

In [52]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/medals_processed.csv"
medals_df = pd.read_csv(file_path)
medals_df.head(), medals_df.info()

# Parsing
medals_df['parsed'] = medals_df['text'].apply(parse_info)

medals_df['parsed'].head()


# Cleaning
medals_df['cleaned_parsed'] = medals_df['parsed'].apply(clean_parsed_info)

medals_df['cleaned_parsed'].head()

# Generating questions (medals kind of has the exact same information as medallists.. which I suppose makes sense. but it makes me wonder if it's even worth it to make a separate file.)
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    name = cleaned_data.get("name", "this athlete")

    if "medal_type" in cleaned_data and cleaned_data["medal_type"]:
        qa_entries.append({
            "context": context,
            "question": f"What medal did {name} win?",
            "answer": cleaned_data["medal_type"]
        })
        medal_type = cleaned_data["medal_type"]
    if "medal_date" in cleaned_data and cleaned_data["medal_date"]:
        qa_entries.append({
            "context": context,
            "question": f"When did {name} win their {medal_type}?",
            "answer": cleaned_data["medal_date"]
        })
    if "event" in cleaned_data and cleaned_data["event"]:
        qa_entries.append({
            "context": context,
            "question": f"In what event did {name} win their {medal_type}?",
            "answer": cleaned_data["event"]
        })
    if "country_long" in cleaned_data and cleaned_data["country_long"]:
        qa_entries.append({
            "context": context,
            "question": f"For which country did {name} win their {medal_type}?",
            "answer": cleaned_data["country_long"]
        })
    if "discipline" in cleaned_data and cleaned_data["discipline"]:
        qa_entries.append({
            "context": context,
            "question": f"In which discipline did {name} win their {medal_type}?",
            "answer": cleaned_data["discipline"]
        })

    return qa_entries


qa_data = []
for _, row in medals_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_medals_df = pd.DataFrame(qa_data)

qa_medals_df.head()


qa_medals_df['answer'] = qa_medals_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/medals_qa.csv"
qa_medals_df.to_csv(output_csv_path, index=False)
print(f"QA data for medals saved to {output_csv_path}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044 entries, 0 to 1043
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1044 non-null   object
dtypes: object(1)
memory usage: 8.3+ KB
QA data for medals saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/medals_qa.csv


#Medals_total_processed! Harder than I thought it would be.

In [80]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/medals_total_processed.csv"
medals_total_df = pd.read_csv(file_path)
medals_total_df.head(), medals_total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    92 non-null     object
dtypes: object(1)
memory usage: 864.0+ bytes


(                                                                                                                              text
 0   country_code: usa | country_long: united states of america | Gold Medal: 40 | Silver Medal: 44 | Bronze Medal: 42 | Total: 126
 1  country_code: chn | country_long: people's republic of china | Gold Medal: 40 | Silver Medal: 27 | Bronze Medal: 24 | Total: 91
 2                       country_code: jpn | country_long: japan | Gold Medal: 20 | Silver Medal: 12 | Bronze Medal: 13 | Total: 45
 3                   country_code: aus | country_long: australia | Gold Medal: 18 | Silver Medal: 19 | Bronze Medal: 16 | Total: 53
 4                      country_code: fra | country_long: france | Gold Medal: 16 | Silver Medal: 26 | Bronze Medal: 22 | Total: 64,
 None)

In [81]:
# Had to make a special parsing function to handle this dataset because it would only output gold medal results... then it would only output bronze medal results... ooof this was work haha
def parse_medals_total_info(text):
    pattern = r"([\w\s]+):\s([^|]+)"
    matches = re.findall(pattern, text)

    parsed_info = {key.strip().lower().replace(" ", "_"): value.strip() for key, value in matches}

    # making sure the keys include the medal type lol
    if "medal" in parsed_info and "country_long" in parsed_info:
        country_info = parsed_info["country_long"].lower()
        if "gold" in country_info:
            parsed_info["gold_medal"] = parsed_info.pop("gold_medal")
        elif "silver" in country_info:
            parsed_info["silver_medal"] = parsed_info.pop("silver_medal")
        elif "bronze" in country_info:
            parsed_info["bronze_medal"] = parsed_info.pop("bronze_medal")

    return parsed_info

medals_total_df['parsed'] = medals_total_df['text'].apply(parse_medals_total_info)
pd.set_option('display.max_colwidth', None)
medals_total_df['parsed'].head()

Unnamed: 0,parsed
0,"{'country_code': 'usa', 'country_long': 'united states of america', 'gold_medal': '40', 'silver_medal': '44', 'bronze_medal': '42', 'total': '126'}"
1,"{'country_code': 'chn', 'country_long': 'people's republic of china', 'gold_medal': '40', 'silver_medal': '27', 'bronze_medal': '24', 'total': '91'}"
2,"{'country_code': 'jpn', 'country_long': 'japan', 'gold_medal': '20', 'silver_medal': '12', 'bronze_medal': '13', 'total': '45'}"
3,"{'country_code': 'aus', 'country_long': 'australia', 'gold_medal': '18', 'silver_medal': '19', 'bronze_medal': '16', 'total': '53'}"
4,"{'country_code': 'fra', 'country_long': 'france', 'gold_medal': '16', 'silver_medal': '26', 'bronze_medal': '22', 'total': '64'}"


In [82]:
##### Had to make a new cleaning function for medals_total.. my info was getting all messed up with the other function. This was rough to figure out too lol.
def clean_medals_total_info(parsed_info):
    cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}

    #medal_types = ["gold", "silver", "bronze"]
    #for medal in medal_types:
    #    cleaned_info[f"{medal}_medal"] = "0"

    if "medal" in cleaned_info and "country_long" in cleaned_info:
        country_info = cleaned_info["country_long"].lower()
        if "gold" in country_info:
            cleaned_info["gold_medal"] = cleaned_info.get("gold_medal", "")
        elif "silver" in country_info:
            cleaned_info["silver_medal"] = cleaned_info.get("silver_medal", "")
        elif "bronze" in country_info:
            cleaned_info["bronze_medal"] = cleaned_info.get("bronze_medal", "")


    if "total" in parsed_info:
        cleaned_info["total"] = parsed_info["total"]


    #cleaned_info["country_long"] = re.sub(r"\b(gold|silver|bronze)\b", "", cleaned_info.get("country_long", ""), flags=re.IGNORECASE).strip()


    return cleaned_info


# Cleaning
medals_total_df['cleaned_parsed'] = medals_total_df['parsed'].apply(clean_parsed_info)

medals_total_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'country_code': 'usa', 'country_long': 'united states of america', 'gold_medal': '40', 'silver_medal': '44', 'bronze_medal': '42', 'total': '126'}"
1,"{'country_code': 'chn', 'country_long': 'people's republic of china', 'gold_medal': '40', 'silver_medal': '27', 'bronze_medal': '24', 'total': '91'}"
2,"{'country_code': 'jpn', 'country_long': 'japan', 'gold_medal': '20', 'silver_medal': '12', 'bronze_medal': '13', 'total': '45'}"
3,"{'country_code': 'aus', 'country_long': 'australia', 'gold_medal': '18', 'silver_medal': '19', 'bronze_medal': '16', 'total': '53'}"
4,"{'country_code': 'fra', 'country_long': 'france', 'gold_medal': '16', 'silver_medal': '26', 'bronze_medal': '22', 'total': '64'}"


In [83]:
# Generating questions (luckily didn't have tons of complications here)
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    country = cleaned_data.get("country_long", "this country")

    if "gold_medal" in cleaned_data and cleaned_data["gold_medal"]:
        qa_entries.append({
            "context": context,
            "question": f"How many gold medals did {country} win?",
            "answer": cleaned_data["gold_medal"]
        })
    if "silver_medal" in cleaned_data and cleaned_data["silver_medal"]:
        qa_entries.append({
            "context": context,
            "question": f"How many silver medals did {country} win?",
            "answer": cleaned_data["silver_medal"]
        })
    if "bronze_medal" in cleaned_data and cleaned_data["bronze_medal"]:
        qa_entries.append({
            "context": context,
            "question": f"How many bronze medals did {country} win?",
            "answer": cleaned_data["bronze_medal"]
        })
    if "total" in cleaned_data and cleaned_data["total"]:
        qa_entries.append({
            "context": context,
            "question": f"What is the total number of medals won by {country}?",
            "answer": cleaned_data["total"]
        })

    return qa_entries


qa_data = []
for _, row in medals_total_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_medals_total_df = pd.DataFrame(qa_data)

qa_medals_total_df.head()


qa_medals_total_df['answer'] = qa_medals_total_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/medals_total_qa.csv"
qa_medals_total_df.to_csv(output_csv_path, index=False)
print(f"QA data for medals_total saved to {output_csv_path}")

QA data for medals_total saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/medals_total_qa.csv


#NOCs! (names of countries)

Here I also had issues with parsing and cleaning. Because this has happened multiple times now, I'm going to add parse_info and clean_parsed_info (or variations) in each block of code instead of continually using the same one from much earlier.

In [86]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/nocs_processed.csv"
nocs_df = pd.read_csv(file_path)
nocs_df.head(), nocs_df.info()

# Parsing
def parse_info(text):
    pattern = r"([\w\s]+):\s([^|]+)"
    matches = re.findall(pattern, text)
    return {key.strip().lower().replace(" ", "_"): value.strip() for key, value in matches}

nocs_df['parsed'] = nocs_df['text'].apply(parse_info)

nocs_df['parsed'].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    224 non-null    object
dtypes: object(1)
memory usage: 1.9+ KB


Unnamed: 0,parsed
0,"{'code': 'afg', 'country_long': 'afghanistan'}"
1,"{'code': 'aho', 'country_long': 'netherlands antilles'}"
2,"{'code': 'ain', 'country_long': 'ain'}"
3,"{'code': 'alb', 'country_long': 'albania'}"
4,"{'code': 'alg', 'country_long': 'algeria'}"


In [87]:
# Cleaning
def clean_parsed_info(parsed_info):
  cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}
  return cleaned_info
nocs_df['cleaned_parsed'] = nocs_df['parsed'].apply(clean_parsed_info)

nocs_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'code': 'afg', 'country_long': 'afghanistan'}"
1,"{'code': 'aho', 'country_long': 'netherlands antilles'}"
2,"{'code': 'ain', 'country_long': 'ain'}"
3,"{'code': 'alb', 'country_long': 'albania'}"
4,"{'code': 'alg', 'country_long': 'algeria'}"


In [88]:
# Generating questions
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    noc_code = cleaned_data.get("code", "this NOC")
    country_name = cleaned_data.get("country_long", "this country")

    if "code" in cleaned_data and cleaned_data["code"]:
        qa_entries.append({
            "context": context,
            "question": f"What is the NOC code for {country_name}?",
            "answer": cleaned_data["code"]
        })
    if "country_long" in cleaned_data and cleaned_data["country_long"]:
        qa_entries.append({
            "context": context,
            "question": f"What is the full name of the country with NOC code {noc_code}?",
            "answer": cleaned_data["country_long"]
        })

    return qa_entries


qa_data = []
for _, row in nocs_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_nocs_df = pd.DataFrame(qa_data)

qa_nocs_df.head()


qa_nocs_df['answer'] = qa_nocs_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/nocs_qa.csv"
qa_nocs_df.to_csv(output_csv_path, index=False)
print(f"QA data for nocs saved to {output_csv_path}")

QA data for nocs saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/nocs_qa.csv


#Schedules

In [95]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/schedules_processed.csv"
schedules_df = pd.read_csv(file_path)
schedules_df.head(), schedules_df.info()

# Parsing
def parse_info(text):
    pattern = r"([\w\s]+):\s([^|]+)"
    matches = re.findall(pattern, text)
    return {key.strip().lower().replace(" ", "_"): value.strip() for key, value in matches}

schedules_df['parsed'] = schedules_df['text'].apply(parse_info)

schedules_df['parsed'].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3895 entries, 0 to 3894
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3895 non-null   object
dtypes: object(1)
memory usage: 30.6+ KB


Unnamed: 0,parsed
0,"{'start_date': '2024-07-24t15:00:00+02:00', 'end_date': '2024-07-24t16:45:00+02:00', 'day': '2024-07-24', 'event': 'football men's group b', 'event_medal': '0', 'gender': 'm', 'event_type': 'hteam', 'venue': 'geoffroy-guichard stadium', 'venue_code': 'ste', 'location_description': 'geoffroy-guichard stadium, saint-etienne', 'location_code': 'ste'}"
1,"{'start_date': '2024-07-24t15:00:00+02:00', 'end_date': '2024-07-24t16:45:00+02:00', 'day': '2024-07-24', 'event': 'football men's group c', 'event_medal': '0', 'gender': 'm', 'event_type': 'hteam', 'venue': 'parc des princes', 'venue_code': 'pdp', 'location_description': 'parc des princes, paris', 'location_code': 'pdp'}"
2,"{'start_date': '2024-07-24t15:30:00+02:00', 'end_date': '2024-07-24t15:46:00+02:00', 'day': '2024-07-24', 'event': 'rugby sevens men's pool b', 'event_medal': '0', 'gender': 'm', 'event_type': 'hteam', 'venue': 'stade de france', 'venue_code': 'sta', 'location_description': 'stade de france', 'location_code': 'sta'}"
3,"{'start_date': '2024-07-24t16:00:00+02:00', 'end_date': '2024-07-24t16:16:00+02:00', 'day': '2024-07-24', 'event': 'rugby sevens men's pool b', 'event_medal': '0', 'gender': 'm', 'event_type': 'hteam', 'venue': 'stade de france', 'venue_code': 'sta', 'location_description': 'stade de france', 'location_code': 'sta'}"
4,"{'start_date': '2024-07-24t16:30:00+02:00', 'end_date': '2024-07-24t16:46:00+02:00', 'day': '2024-07-24', 'event': 'rugby sevens men's pool c', 'event_medal': '0', 'gender': 'm', 'event_type': 'hteam', 'venue': 'stade de france', 'venue_code': 'sta', 'location_description': 'stade de france', 'location_code': 'sta'}"


In [96]:
from datetime import datetime

def clean_parsed_info(parsed_info):
  cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}
  for date_field in ["start_date", "end_date"]:
        if date_field in cleaned_info and cleaned_info[date_field]:
            date_obj = datetime.strptime(cleaned_info[date_field], "%Y-%m-%dT%H:%M:%S%z")
            cleaned_info[date_field] = date_obj.strftime("%Y-%m-%d at %H:%M:%S%z")
  return cleaned_info

schedules_df['cleaned_parsed'] = schedules_df['parsed'].apply(clean_parsed_info)

schedules_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'start_date': '2024-07-24 at 15:00:00+0200', 'end_date': '2024-07-24 at 16:45:00+0200', 'day': '2024-07-24', 'event': 'football men's group b', 'event_medal': '0', 'gender': 'm', 'event_type': 'hteam', 'venue': 'geoffroy-guichard stadium', 'venue_code': 'ste', 'location_description': 'geoffroy-guichard stadium, saint-etienne', 'location_code': 'ste'}"
1,"{'start_date': '2024-07-24 at 15:00:00+0200', 'end_date': '2024-07-24 at 16:45:00+0200', 'day': '2024-07-24', 'event': 'football men's group c', 'event_medal': '0', 'gender': 'm', 'event_type': 'hteam', 'venue': 'parc des princes', 'venue_code': 'pdp', 'location_description': 'parc des princes, paris', 'location_code': 'pdp'}"
2,"{'start_date': '2024-07-24 at 15:30:00+0200', 'end_date': '2024-07-24 at 15:46:00+0200', 'day': '2024-07-24', 'event': 'rugby sevens men's pool b', 'event_medal': '0', 'gender': 'm', 'event_type': 'hteam', 'venue': 'stade de france', 'venue_code': 'sta', 'location_description': 'stade de france', 'location_code': 'sta'}"
3,"{'start_date': '2024-07-24 at 16:00:00+0200', 'end_date': '2024-07-24 at 16:16:00+0200', 'day': '2024-07-24', 'event': 'rugby sevens men's pool b', 'event_medal': '0', 'gender': 'm', 'event_type': 'hteam', 'venue': 'stade de france', 'venue_code': 'sta', 'location_description': 'stade de france', 'location_code': 'sta'}"
4,"{'start_date': '2024-07-24 at 16:30:00+0200', 'end_date': '2024-07-24 at 16:46:00+0200', 'day': '2024-07-24', 'event': 'rugby sevens men's pool c', 'event_medal': '0', 'gender': 'm', 'event_type': 'hteam', 'venue': 'stade de france', 'venue_code': 'sta', 'location_description': 'stade de france', 'location_code': 'sta'}"


In [98]:
# Generating questions
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    event = cleaned_data.get("event", "this event")
    discipline = cleaned_data.get("discipline", "this discipline")

    if "start_date" in cleaned_data and cleaned_data["start_date"]:
        qa_entries.append({
            "context": context,
            "question": f"When did {event} start?",
            "answer": cleaned_data["start_date"]
        })
    if "end_date" in cleaned_data and cleaned_data["end_date"]:
        qa_entries.append({
            "context": context,
            "question": f"When did {event} end?",
            "answer": cleaned_data["end_date"]
        })
    if "venue" in cleaned_data and cleaned_data["venue"]:
        qa_entries.append({
            "context": context,
            "question": f"Where was {event} held?",
            "answer": cleaned_data["venue"]
        })
    if "discipline" in cleaned_data and cleaned_data["discipline"]:
        qa_entries.append({
            "context": context,
            "question": f"Which discipline does {event} belong to?",
            "answer": cleaned_data["discipline"]
        })

    return qa_entries


qa_data = []
for _, row in schedules_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_schedules_df = pd.DataFrame(qa_data)

qa_schedules_df.head()


qa_schedules_df['answer'] = qa_schedules_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/schedules_qa.csv"
qa_schedules_df.to_csv(output_csv_path, index=False)
print(f"QA data for schedules saved to {output_csv_path}")

QA data for schedules saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/schedules_qa.csv


#Teams

In [11]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/teams_processed.csv"
teams_df = pd.read_csv(file_path)
teams_df.head(), teams_df.info()

# Parsing
def parse_info(text):
    pattern = r"([\w\s]+):\s([^|]+)"
    matches = re.findall(pattern, text)
    return {key.strip().lower().replace(" ", "_"): value.strip() for key, value in matches}

teams_df['parsed'] = teams_df['text'].apply(parse_info)

teams_df['parsed'].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1698 entries, 0 to 1697
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1698 non-null   object
dtypes: object(1)
memory usage: 13.4+ KB


Unnamed: 0,parsed
0,"{'code': 'arcmteam3---chn01', 'team': 'people's republic of china', 'team_gender': 'm', 'country_code': 'chn', 'country_long': 'people's republic of china', 'discipline': 'archery', 'disciplines_code': 'arc', 'events': 'men's team', 'athletes': '['kao wenchao', 'li zhongyuan', 'wang yan']', 'coaches': '', 'athletes_codes': '['1913366', '1913367', '1913369']', 'num_athletes': '3.0', 'coaches_codes': ''}"
1,"{'code': 'arcmteam3---col01', 'team': 'colombia', 'team_gender': 'm', 'country_code': 'col', 'country_long': 'colombia', 'discipline': 'archery', 'disciplines_code': 'arc', 'events': 'men's team', 'athletes': '['arcila santiago', 'enriquez jorge', 'hernandez vera andres']', 'coaches': '', 'athletes_codes': '['1935642', '1543412', '1935644']', 'num_athletes': '3.0', 'coaches_codes': ''}"
2,"{'code': 'arcmteam3---fra01', 'team': 'france', 'team_gender': 'm', 'country_code': 'fra', 'country_long': 'france', 'discipline': 'archery', 'disciplines_code': 'arc', 'events': 'men's team', 'athletes': '['addis baptiste', 'chirault thomas', 'valladont jean-charles']', 'coaches': '', 'athletes_codes': '['1541270', '1541272', '1541275']', 'num_athletes': '3.0', 'coaches_codes': ''}"
3,"{'code': 'arcmteam3---gbr01', 'team': 'great britain', 'team_gender': 'm', 'country_code': 'gbr', 'country_long': 'great britain', 'discipline': 'archery', 'disciplines_code': 'arc', 'events': 'men's team', 'athletes': '['hall conor', 'hall tom', 'wise alex']', 'coaches': '', 'athletes_codes': '['1560988', '1560989', '1561003']', 'num_athletes': '3.0', 'coaches_codes': ''}"
4,"{'code': 'arcmteam3---ind01', 'team': 'india', 'team_gender': 'm', 'country_code': 'ind', 'country_long': 'india', 'discipline': 'archery', 'disciplines_code': 'arc', 'events': 'men's team', 'athletes': '['bommadevara dhiraj', 'jadhav pravin ramesh', 'rai tarundeep']', 'coaches': '', 'athletes_codes': '['1546108', '1546112', '1546110']', 'num_athletes': '3.0', 'coaches_codes': ''}"


In [14]:
# Cleaning
def clean_parsed_info(parsed_info):
  cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}

  # making a better team name - more descriptive.
  team_name_parts = [
        cleaned_info.get("team", " "),
        cleaned_info.get("discipline", " "),
        cleaned_info.get("events", " "),
        f"code: {cleaned_info.get('code', '')}"
    ]

  cleaned_info["team"] = ", ".join(filter(None, team_name_parts))

  cleaned_info.pop('discipline', None)
  cleaned_info.pop('events', None)


  return cleaned_info


teams_df['cleaned_parsed'] = teams_df['parsed'].apply(clean_parsed_info)
pd.set_option('display.max_colwidth', None)
teams_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'code': 'arcmteam3---chn01', 'team': 'people's republic of china, archery, men's team, code: arcmteam3---chn01', 'team_gender': 'm', 'country_code': 'chn', 'country_long': 'people's republic of china', 'disciplines_code': 'arc', 'athletes': '['kao wenchao', 'li zhongyuan', 'wang yan']', 'coaches': '', 'athletes_codes': '['1913366', '1913367', '1913369']', 'num_athletes': '3.0', 'coaches_codes': ''}"
1,"{'code': 'arcmteam3---col01', 'team': 'colombia, archery, men's team, code: arcmteam3---col01', 'team_gender': 'm', 'country_code': 'col', 'country_long': 'colombia', 'disciplines_code': 'arc', 'athletes': '['arcila santiago', 'enriquez jorge', 'hernandez vera andres']', 'coaches': '', 'athletes_codes': '['1935642', '1543412', '1935644']', 'num_athletes': '3.0', 'coaches_codes': ''}"
2,"{'code': 'arcmteam3---fra01', 'team': 'france, archery, men's team, code: arcmteam3---fra01', 'team_gender': 'm', 'country_code': 'fra', 'country_long': 'france', 'disciplines_code': 'arc', 'athletes': '['addis baptiste', 'chirault thomas', 'valladont jean-charles']', 'coaches': '', 'athletes_codes': '['1541270', '1541272', '1541275']', 'num_athletes': '3.0', 'coaches_codes': ''}"
3,"{'code': 'arcmteam3---gbr01', 'team': 'great britain, archery, men's team, code: arcmteam3---gbr01', 'team_gender': 'm', 'country_code': 'gbr', 'country_long': 'great britain', 'disciplines_code': 'arc', 'athletes': '['hall conor', 'hall tom', 'wise alex']', 'coaches': '', 'athletes_codes': '['1560988', '1560989', '1561003']', 'num_athletes': '3.0', 'coaches_codes': ''}"
4,"{'code': 'arcmteam3---ind01', 'team': 'india, archery, men's team, code: arcmteam3---ind01', 'team_gender': 'm', 'country_code': 'ind', 'country_long': 'india', 'disciplines_code': 'arc', 'athletes': '['bommadevara dhiraj', 'jadhav pravin ramesh', 'rai tarundeep']', 'coaches': '', 'athletes_codes': '['1546108', '1546112', '1546110']', 'num_athletes': '3.0', 'coaches_codes': ''}"


In [20]:
# Generating questions
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    team_name = cleaned_data.get("team", "this team")

    if "country_long" in cleaned_data and cleaned_data["country_long"]:
        qa_entries.append({
            "context": context,
            "question": f"Which country is represented by {team_name}?",
            "answer": cleaned_data["country_long"]
        })
    if "discipline" in cleaned_data and cleaned_data["discipline"]:
        qa_entries.append({
            "context": context,
            "question": f"In which discipline does {team_name} participate?",
            "answer": cleaned_data["discipline"]
        })
    if "num_athletes" in cleaned_data and cleaned_data["num_athletes"]:
        qa_entries.append({
            "context": context,
            "question": f"How many athletes are in {team_name}?",
            "answer": cleaned_data["num_athletes"]
        })
        num_ath = int(float(cleaned_data["num_athletes"]))
    if "athletes" in cleaned_data and cleaned_data["athletes"]:
        qa_entries.append({
            "context": context,
            "question": f"What are the names of the {num_ath} athletes in {team_name}?",
            "answer": cleaned_data["athletes"]
        })
    if "num_coaches" in cleaned_data and cleaned_data["num_coaches"]:
        qa_entries.append({
            "context": context,
            "question": f"How many coaches are in {team_name}?",
            "answer": cleaned_data["num_coaches"]
        })

    return qa_entries


qa_data = []
for _, row in teams_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_teams_df = pd.DataFrame(qa_data)

qa_teams_df.head()


def clean_answer(answer):
    if isinstance(answer, str):
        # First I'm removing brackets and quotes
        answer = re.sub(r"[\[\]\"']", "", answer).strip()
        # Then I'm going through and seeing if there are references at the end of the answer, and removing them if there are.
        answer = re.sub(r"\s*\(.*?\)$", "", answer).strip()
    return answer

qa_teams_df['answer'] = qa_teams_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/teams_qa.csv"
qa_teams_df.to_csv(output_csv_path, index=False)
print(f"QA data for teams saved to {output_csv_path}")

QA data for teams saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/teams_qa.csv


#Technical Officials

In [21]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/technical_officials_processed.csv"
tech_off_df = pd.read_csv(file_path)
tech_off_df.head(), tech_off_df.info()

# Parsing
def parse_info(text):
    pattern = r"([\w\s]+):\s([^|]+)"
    matches = re.findall(pattern, text)
    return {key.strip().lower().replace(" ", "_"): value.strip() for key, value in matches}

tech_off_df['parsed'] = tech_off_df['text'].apply(parse_info)

tech_off_df['parsed'].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1021 entries, 0 to 1020
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1021 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB


Unnamed: 0,parsed
0,"{'code': '1536406', 'name': 'fratini elisabetta', 'gender': 'female', 'function': 'referee', 'category': 'j', 'organisation_code': 'ita', 'organisation_long': 'italy', 'disciplines': '['judo']'}"
1,"{'code': '1550583', 'name': 'ilic corina', 'gender': 'female', 'function': 'referee', 'category': 'j', 'organisation_code': 'cro', 'organisation_long': 'croatia', 'disciplines': '['judo']'}"
2,"{'code': '1550593', 'name': 'maafi ziad', 'gender': 'male', 'function': 'referee', 'category': 'j', 'organisation_code': 'alg', 'organisation_long': 'algeria', 'disciplines': '['judo']'}"
3,"{'code': '1550642', 'name': 'fumea maria', 'gender': 'female', 'function': 'judge', 'category': 'j', 'organisation_code': 'fig', 'organisation_long': 'international gymnastics federation', 'disciplines': '['artistic gymnastics', 'rhythmic gymnastics', 'trampoline gymnastics']'}"
4,"{'code': '1550643', 'name': 'leung li li', 'gender': 'female', 'function': 'judge', 'category': 'j', 'organisation_code': 'fig', 'organisation_long': 'international gymnastics federation', 'disciplines': '['artistic gymnastics', 'rhythmic gymnastics', 'trampoline gymnastics']'}"


In [22]:
# Cleaning
def clean_parsed_info(parsed_info):
  cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}

  return cleaned_info


tech_off_df['cleaned_parsed'] = tech_off_df['parsed'].apply(clean_parsed_info)
pd.set_option('display.max_colwidth', None)
tech_off_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'code': '1536406', 'name': 'fratini elisabetta', 'gender': 'female', 'function': 'referee', 'category': 'j', 'organisation_code': 'ita', 'organisation_long': 'italy', 'disciplines': '['judo']'}"
1,"{'code': '1550583', 'name': 'ilic corina', 'gender': 'female', 'function': 'referee', 'category': 'j', 'organisation_code': 'cro', 'organisation_long': 'croatia', 'disciplines': '['judo']'}"
2,"{'code': '1550593', 'name': 'maafi ziad', 'gender': 'male', 'function': 'referee', 'category': 'j', 'organisation_code': 'alg', 'organisation_long': 'algeria', 'disciplines': '['judo']'}"
3,"{'code': '1550642', 'name': 'fumea maria', 'gender': 'female', 'function': 'judge', 'category': 'j', 'organisation_code': 'fig', 'organisation_long': 'international gymnastics federation', 'disciplines': '['artistic gymnastics', 'rhythmic gymnastics', 'trampoline gymnastics']'}"
4,"{'code': '1550643', 'name': 'leung li li', 'gender': 'female', 'function': 'judge', 'category': 'j', 'organisation_code': 'fig', 'organisation_long': 'international gymnastics federation', 'disciplines': '['artistic gymnastics', 'rhythmic gymnastics', 'trampoline gymnastics']'}"


In [23]:
# Generating questions
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    name = cleaned_data.get("name", "this official")

    if "country_long" in cleaned_data and cleaned_data["country_long"]:
        qa_entries.append({
            "context": context,
            "question": f"Which country does {name} represent?",
            "answer": cleaned_data["country_long"]
        })
    if "gender" in cleaned_data and cleaned_data["gender"]:
        qa_entries.append({
            "context": context,
            "question": f"What is the gender of {name}?",
            "answer": cleaned_data["gender"]
        })
    if "function" in cleaned_data and cleaned_data["function"]:
        qa_entries.append({
            "context": context,
            "question": f"What is the role of {name}?",
            "answer": cleaned_data["function"]
        })
    if "disciplines" in cleaned_data and cleaned_data["disciplines"]:
        qa_entries.append({
            "context": context,
            "question": f"In which disciplines does {name} officiate?",
            "answer": cleaned_data["disciplines"]
        })
    if "organisation_long" in cleaned_data and cleaned_data["organisation_long"]:
        qa_entries.append({
            "context": context,
            "question": f"Which organization does {name} belong to?",
            "answer": cleaned_data["organisation_long"]
        })

    return qa_entries


qa_data = []
for _, row in tech_off_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_tech_off_df = pd.DataFrame(qa_data)

qa_tech_off_df.head()


def clean_answer(answer):
    if isinstance(answer, str):
        # First I'm removing brackets and quotes
        answer = re.sub(r"[\[\]\"']", "", answer).strip()
        # Then I'm going through and seeing if there are references at the end of the answer, and removing them if there are.
        answer = re.sub(r"\s*\(.*?\)$", "", answer).strip()
    return answer

qa_tech_off_df['answer'] = qa_tech_off_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/technical_officials_qa.csv"
qa_tech_off_df.to_csv(output_csv_path, index=False)
print(f"QA data for technical officials saved to {output_csv_path}")

QA data for technical officials saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/technical_officials_qa.csv


#Venues

In [24]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/venues_processed.csv"
venues_df = pd.read_csv(file_path)
venues_df.head(), venues_df.info()

# Parsing
def parse_info(text):
    pattern = r"([\w\s]+):\s([^|]+)"
    matches = re.findall(pattern, text)
    return {key.strip().lower().replace(" ", "_"): value.strip() for key, value in matches}

venues_df['parsed'] = venues_df['text'].apply(parse_info)

venues_df['parsed'].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    35 non-null     object
dtypes: object(1)
memory usage: 408.0+ bytes


Unnamed: 0,parsed
0,"{'venue': 'aquatics centre', 'sports': '['artistic swimming', 'diving', 'water polo']', 'date_start': '2024-07-27t09:00:00z', 'date_end': '2024-08-10t20:00:00z', 'tag': 'aquatics-centre'}"
1,"{'venue': 'bercy arena', 'sports': '['artistic gymnastics', 'basketball', 'trampoline']', 'date_start': '2024-07-27t09:00:00z', 'date_end': '2024-08-11t16:00:00z', 'tag': 'bercy-arena'}"
2,"{'venue': 'bordeaux stadium', 'sports': '['football']', 'date_start': '2024-07-25t17:00:00z', 'date_end': '2024-08-02t21:59:00z', 'tag': 'bordeaux-stadium'}"
3,"{'venue': 'champ de mars arena', 'sports': '['judo', 'wrestling']', 'date_start': '2024-07-27t08:00:00z', 'date_end': '2024-08-11t12:00:00z', 'tag': 'champ-de-mars-arena'}"
4,"{'venue': 'château de versailles', 'sports': '['equestrian', 'modern pentathlon']', 'date_start': '2024-07-27t07:30:00z', 'date_end': '2024-08-11t11:30:00z', 'tag': 'chateau-de-versailles'}"


In [26]:
from datetime import datetime

def clean_parsed_info(parsed_info):
    cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}
    for date_field in ["date_start", "date_end"]:
        if date_field in cleaned_info and cleaned_info[date_field]:
            date_str = cleaned_info[date_field].replace("t", "T").replace("z", "+00:00")
            try:
                date_obj = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S%z")
                cleaned_info[date_field] = date_obj.strftime("%Y-%m-%d at %H:%M:%S%z")
            except ValueError as e:
                print(f"Error parsing date {cleaned_info[date_field]}: {e}")
                cleaned_info[date_field] = cleaned_info[date_field]
    return cleaned_info

venues_df['cleaned_parsed'] = venues_df['parsed'].apply(clean_parsed_info)

venues_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'venue': 'aquatics centre', 'sports': '['artistic swimming', 'diving', 'water polo']', 'date_start': '2024-07-27 at 09:00:00+0000', 'date_end': '2024-08-10 at 20:00:00+0000', 'tag': 'aquatics-centre'}"
1,"{'venue': 'bercy arena', 'sports': '['artistic gymnastics', 'basketball', 'trampoline']', 'date_start': '2024-07-27 at 09:00:00+0000', 'date_end': '2024-08-11 at 16:00:00+0000', 'tag': 'bercy-arena'}"
2,"{'venue': 'bordeaux stadium', 'sports': '['football']', 'date_start': '2024-07-25 at 17:00:00+0000', 'date_end': '2024-08-02 at 21:59:00+0000', 'tag': 'bordeaux-stadium'}"
3,"{'venue': 'champ de mars arena', 'sports': '['judo', 'wrestling']', 'date_start': '2024-07-27 at 08:00:00+0000', 'date_end': '2024-08-11 at 12:00:00+0000', 'tag': 'champ-de-mars-arena'}"
4,"{'venue': 'château de versailles', 'sports': '['equestrian', 'modern pentathlon']', 'date_start': '2024-07-27 at 07:30:00+0000', 'date_end': '2024-08-11 at 11:30:00+0000', 'tag': 'chateau-de-versailles'}"


In [27]:
# Generating questions
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    venue_name = cleaned_data.get("venue", "this venue")

    if "sports" in cleaned_data and cleaned_data["sports"]:
        qa_entries.append({
            "context": context,
            "question": f"What sports are hosted at {venue_name}?",
            "answer": cleaned_data["sports"]
        })
    if "date_start" in cleaned_data and cleaned_data["date_start"]:
        qa_entries.append({
            "context": context,
            "question": f"When does activity start at {venue_name}?",
            "answer": cleaned_data["date_start"]
        })
    if "date_end" in cleaned_data and cleaned_data["date_end"]:
        qa_entries.append({
            "context": context,
            "question": f"When does activity end at {venue_name}?",
            "answer": cleaned_data["date_end"]
        })
    if "tag" in cleaned_data and cleaned_data["tag"]:
        qa_entries.append({
            "context": context,
            "question": f"What is the tag associated with {venue_name}?",
            "answer": cleaned_data["tag"]
        })

    return qa_entries


qa_data = []
for _, row in venues_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_venues_df = pd.DataFrame(qa_data)

qa_venues_df.head()


def clean_answer(answer):
    if isinstance(answer, str):
        # First I'm removing brackets and quotes
        answer = re.sub(r"[\[\]\"']", "", answer).strip()
        # Then I'm going through and seeing if there are references at the end of the answer, and removing them if there are.
        answer = re.sub(r"\s*\(.*?\)$", "", answer).strip()
    return answer

qa_venues_df['answer'] = qa_venues_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/venues_qa.csv"
qa_venues_df.to_csv(output_csv_path, index=False)
print(f"QA data for venues saved to {output_csv_path}")

QA data for venues saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/venues_qa.csv


#AAAAAAND now we start working through the results!

#3x3 Basketball

In [28]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/Results/3x3_Basketball_processed.csv"
bb_df = pd.read_csv(file_path)
bb_df.head(), bb_df.info()

# Parsing
def parse_info(text):
    parsed = {}
    items = text.split(" | ")
    for item in items:
        if ": " in item:
            key, value = item.split(": ", 1)
            parsed[key.strip()] = value.strip()
    return parsed

bb_df['parsed'] = bb_df['text'].apply(parse_info)

bb_df['parsed'].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    136 non-null    object
dtypes: object(1)
memory usage: 1.2+ KB


Unnamed: 0,parsed
0,"{'date': '2024-07-30t18:59:05+02:00', 'event_code': 'bk3mteam3', 'event_name': 'men', 'event_stage': 'men's pool round', 'stage': 'pool round', 'gender': 'm', 'discipline_name': '3x3 basketball', 'discipline_code': 'bk3', 'venue': 'la concorde 1', 'participant_code': 'bk3mteam3---lat01', 'participant_name': 'latvia', 'participant_type': 'team', 'participant_country_code': 'lat', 'participant_country': 'latvia', 'result': '21', 'result_type': 'points', 'result_WLT': 'w', 'start_order': '1'}"
1,"{'date': '2024-07-30t18:59:05+02:00', 'event_code': 'bk3mteam3', 'event_name': 'men', 'event_stage': 'men's pool round', 'stage': 'pool round', 'gender': 'm', 'discipline_name': '3x3 basketball', 'discipline_code': 'bk3', 'venue': 'la concorde 1', 'participant_code': 'bk3mteam3---ltu01', 'participant_name': 'lithuania', 'participant_type': 'team', 'participant_country_code': 'ltu', 'participant_country': 'lithuania', 'result': '14', 'result_type': 'points', 'result_WLT': 'l', 'start_order': '2'}"
2,"{'date': '2024-07-30t19:26:55+02:00', 'event_code': 'bk3mteam3', 'event_name': 'men', 'event_stage': 'men's pool round', 'stage': 'pool round', 'gender': 'm', 'discipline_name': '3x3 basketball', 'discipline_code': 'bk3', 'venue': 'la concorde 1', 'participant_code': 'bk3mteam3---chn01', 'participant_name': 'people's republic of china', 'participant_type': 'team', 'participant_country_code': 'chn', 'participant_country': 'china', 'result': '16', 'result_type': 'points', 'result_WLT': 'l', 'start_order': '1'}"
3,"{'date': '2024-07-30t19:26:55+02:00', 'event_code': 'bk3mteam3', 'event_name': 'men', 'event_stage': 'men's pool round', 'stage': 'pool round', 'gender': 'm', 'discipline_name': '3x3 basketball', 'discipline_code': 'bk3', 'venue': 'la concorde 1', 'participant_code': 'bk3mteam3---ned01', 'participant_name': 'netherlands', 'participant_type': 'team', 'participant_country_code': 'ned', 'participant_country': 'netherlands', 'result': '21', 'result_type': 'points', 'result_WLT': 'w', 'start_order': '2'}"
4,"{'date': '2024-07-30t22:30:21+02:00', 'event_code': 'bk3mteam3', 'event_name': 'men', 'event_stage': 'men's pool round', 'stage': 'pool round', 'gender': 'm', 'discipline_name': '3x3 basketball', 'discipline_code': 'bk3', 'venue': 'la concorde 1', 'participant_code': 'bk3mteam3---pol01', 'participant_name': 'poland', 'participant_type': 'team', 'participant_country_code': 'pol', 'participant_country': 'poland', 'result': '19', 'result_type': 'points', 'result_WLT': 'l', 'start_order': '1'}"


In [36]:
from datetime import datetime

def clean_parsed_info(parsed_info):
    cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}
    for date_field in ["date"]:
        if date_field in cleaned_info and cleaned_info[date_field]:
            date_obj = datetime.strptime(cleaned_info[date_field], "%Y-%m-%dT%H:%M:%S%z")
            cleaned_info[date_field] = date_obj.strftime("%Y-%m-%d at %H:%M:%S %z")

    # modifying participant name field to be more descriptive
    if "participant_name" in cleaned_info:
        participant_type = cleaned_info.get("participant_type", "").lower()
        discipline_name = cleaned_info.get("discipline_name", "").lower()
        cleaned_info["participant_name"] = f"{cleaned_info['participant_name']} ({participant_type}) in {discipline_name}".strip()

    cleaned_info.pop('discipline_name', None)
    cleaned_info.pop('participant_type', None)

    return cleaned_info

bb_df['cleaned_parsed'] = bb_df['parsed'].apply(clean_parsed_info)

bb_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'date': '2024-07-30 at 18:59:05 +0200', 'event_code': 'bk3mteam3', 'event_name': 'men', 'event_stage': 'men's pool round', 'stage': 'pool round', 'gender': 'm', 'discipline_code': 'bk3', 'venue': 'la concorde 1', 'participant_code': 'bk3mteam3---lat01', 'participant_name': 'latvia (team) in 3x3 basketball', 'participant_country_code': 'lat', 'participant_country': 'latvia', 'result': '21', 'result_type': 'points', 'result_WLT': 'w', 'start_order': '1'}"
1,"{'date': '2024-07-30 at 18:59:05 +0200', 'event_code': 'bk3mteam3', 'event_name': 'men', 'event_stage': 'men's pool round', 'stage': 'pool round', 'gender': 'm', 'discipline_code': 'bk3', 'venue': 'la concorde 1', 'participant_code': 'bk3mteam3---ltu01', 'participant_name': 'lithuania (team) in 3x3 basketball', 'participant_country_code': 'ltu', 'participant_country': 'lithuania', 'result': '14', 'result_type': 'points', 'result_WLT': 'l', 'start_order': '2'}"
2,"{'date': '2024-07-30 at 19:26:55 +0200', 'event_code': 'bk3mteam3', 'event_name': 'men', 'event_stage': 'men's pool round', 'stage': 'pool round', 'gender': 'm', 'discipline_code': 'bk3', 'venue': 'la concorde 1', 'participant_code': 'bk3mteam3---chn01', 'participant_name': 'people's republic of china (team) in 3x3 basketball', 'participant_country_code': 'chn', 'participant_country': 'china', 'result': '16', 'result_type': 'points', 'result_WLT': 'l', 'start_order': '1'}"
3,"{'date': '2024-07-30 at 19:26:55 +0200', 'event_code': 'bk3mteam3', 'event_name': 'men', 'event_stage': 'men's pool round', 'stage': 'pool round', 'gender': 'm', 'discipline_code': 'bk3', 'venue': 'la concorde 1', 'participant_code': 'bk3mteam3---ned01', 'participant_name': 'netherlands (team) in 3x3 basketball', 'participant_country_code': 'ned', 'participant_country': 'netherlands', 'result': '21', 'result_type': 'points', 'result_WLT': 'w', 'start_order': '2'}"
4,"{'date': '2024-07-30 at 22:30:21 +0200', 'event_code': 'bk3mteam3', 'event_name': 'men', 'event_stage': 'men's pool round', 'stage': 'pool round', 'gender': 'm', 'discipline_code': 'bk3', 'venue': 'la concorde 1', 'participant_code': 'bk3mteam3---pol01', 'participant_name': 'poland (team) in 3x3 basketball', 'participant_country_code': 'pol', 'participant_country': 'poland', 'result': '19', 'result_type': 'points', 'result_WLT': 'l', 'start_order': '1'}"


In [54]:
# Generating questions
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    participant_name = cleaned_data.get("participant_name", "this participant")

    if "result" in cleaned_data and cleaned_data["result"]:
        qa_entries.append({
            "context": context,
            "question": f"What was the result for {participant_name}?",
            "answer": cleaned_data["result"]
        })
    if "result_type" in cleaned_data and cleaned_data["result_type"]:
        qa_entries.append({
            "context": context,
            "question": f"What type of result did {participant_name} achieve?",
            "answer": cleaned_data["result_type"]
        })
    if "result_WLT" in cleaned_data and cleaned_data["result_WLT"]:
        qa_entries.append({
            "context": context,
            "question": f"Did {participant_name} win, lose, or tie?",
            "answer": cleaned_data["result_WLT"]
        })
    if "event_name" in cleaned_data and cleaned_data["event_name"]:
        qa_entries.append({
            "context": context,
            "question": f"In which event did {participant_name} participate?",
            "answer": cleaned_data["event_name"]
        })
    if "discipline_name" in cleaned_data and cleaned_data["discipline_name"]:
        qa_entries.append({
            "context": context,
            "question": f"In which discipline did {participant_name} compete?",
            "answer": cleaned_data["discipline_name"]
        })
    if "venue" in cleaned_data and cleaned_data["venue"]:
        qa_entries.append({
            "context": context,
            "question": f"Where did {participant_name} compete?",
            "answer": cleaned_data["venue"]
        })
    if "date" in cleaned_data and cleaned_data["date"]:
        qa_entries.append({
            "context": context,
            "question": f"When did {participant_name} participate?",
            "answer": cleaned_data["date"]
        })

    return qa_entries


qa_data = []
for _, row in bb_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_bb_df = pd.DataFrame(qa_data)

qa_bb_df.head()


def clean_answer(answer):
    if isinstance(answer, str):
        # First I'm removing brackets and quotes
        answer = re.sub(r"[\[\]\"']", "", answer).strip()
        # Then I'm going through and seeing if there are references at the end of the answer, and removing them if there are.
        answer = re.sub(r"\s*\(.*?\)$", "", answer).strip()
    return answer

qa_bb_df['answer'] = qa_bb_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/3x3_basketball_qa.csv"
qa_bb_df.to_csv(output_csv_path, index=False)
print(f"QA data for 3x3 Basketball saved to {output_csv_path}")

QA data for 3x3 Basketball saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/3x3_basketball_qa.csv


#Archery

In [38]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/Results/Archery_processed.csv"
archery_df = pd.read_csv(file_path)
archery_df.head(), archery_df.info()

# Parsing
def parse_info(text):
    parsed = {}
    items = text.split(" | ")
    for item in items:
        if ": " in item:
            key, value = item.split(": ", 1)
            parsed[key.strip()] = value.strip()
    return parsed

archery_df['parsed'] = archery_df['text'].apply(parse_info)

archery_df['parsed'].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515 entries, 0 to 514
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    515 non-null    object
dtypes: object(1)
memory usage: 4.1+ KB


Unnamed: 0,parsed
0,"{'date': '2024-07-25t16:35:29+02:00', 'stage_code': 'arcmindivid-----------qual000100--', 'event_code': 'arcmindivid', 'event_name': 'men's individual', 'event_stage': 'men's individual ranking round', 'stage': 'ranking round', 'gender': 'm', 'discipline_name': 'archery', 'discipline_code': 'arc', 'venue': 'invalides', 'participant_code': '1902135', 'participant_name': 'kim woojin', 'participant_type': 'person', 'participant_country_code': 'kor', 'participant_country': 'korea', 'rank': '1.0', 'result': '686', 'result_type': 'points', 'result_WLT': '', 'start_order': '8', 'bib': '4b'}"
1,"{'date': '2024-07-25t16:35:29+02:00', 'stage_code': 'arcmindivid-----------qual000100--', 'event_code': 'arcmindivid', 'event_name': 'men's individual', 'event_stage': 'men's individual ranking round', 'stage': 'ranking round', 'gender': 'm', 'discipline_name': 'archery', 'discipline_code': 'arc', 'venue': 'invalides', 'participant_code': '1902130', 'participant_name': 'kim je deok', 'participant_type': 'person', 'participant_country_code': 'kor', 'participant_country': 'korea', 'rank': '2.0', 'result': '682', 'result_type': 'points', 'result_WLT': '', 'start_order': '10', 'bib': '5b'}"
2,"{'date': '2024-07-25t16:35:29+02:00', 'stage_code': 'arcmindivid-----------qual000100--', 'event_code': 'arcmindivid', 'event_name': 'men's individual', 'event_stage': 'men's individual ranking round', 'stage': 'ranking round', 'gender': 'm', 'discipline_name': 'archery', 'discipline_code': 'arc', 'venue': 'invalides', 'participant_code': '1553249', 'participant_name': 'unruh florian', 'participant_type': 'person', 'participant_country_code': 'ger', 'participant_country': 'germany', 'rank': '3.0', 'result': '681', 'result_type': 'points', 'result_WLT': '', 'start_order': '37', 'bib': '19a'}"
3,"{'date': '2024-07-25t16:35:29+02:00', 'stage_code': 'arcmindivid-----------qual000100--', 'event_code': 'arcmindivid', 'event_name': 'men's individual', 'event_stage': 'men's individual ranking round', 'stage': 'ranking round', 'gender': 'm', 'discipline_name': 'archery', 'discipline_code': 'arc', 'venue': 'invalides', 'participant_code': '1546108', 'participant_name': 'bommadevara dhiraj', 'participant_type': 'person', 'participant_country_code': 'ind', 'participant_country': 'india', 'rank': '4.0', 'result': '681', 'result_type': 'points', 'result_WLT': '', 'start_order': '24', 'bib': '12b'}"
4,"{'date': '2024-07-25t16:35:29+02:00', 'stage_code': 'arcmindivid-----------qual000100--', 'event_code': 'arcmindivid', 'event_name': 'men's individual', 'event_stage': 'men's individual ranking round', 'stage': 'ranking round', 'gender': 'm', 'discipline_name': 'archery', 'discipline_code': 'arc', 'venue': 'invalides', 'participant_code': '1902144', 'participant_name': 'lee wooseok', 'participant_type': 'person', 'participant_country_code': 'kor', 'participant_country': 'korea', 'rank': '5.0', 'result': '681', 'result_type': 'points', 'result_WLT': '', 'start_order': '12', 'bib': '6b'}"


In [47]:
from datetime import datetime

def clean_parsed_info(parsed_info):
    cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}
    for date_field in ["date"]:
        if date_field in cleaned_info and cleaned_info[date_field]:
            date_obj = datetime.strptime(cleaned_info[date_field], "%Y-%m-%dT%H:%M:%S%z")
            cleaned_info[date_field] = date_obj.strftime("%Y-%m-%d at %H:%M:%S %z")

    # modifying participant name field to be more descriptive
    if "participant_name" in cleaned_info:
        participant_type = cleaned_info.get("event_name", "").lower()
        discipline_name = cleaned_info.get("discipline_name", "").lower()
        cleaned_info["participant_name"] = f"{cleaned_info['participant_name']} ({participant_type}) in {discipline_name}".strip()

    if "event_name" in cleaned_info and "discipline_name" in cleaned_info:
        cleaned_info["event_name"] = cleaned_info["event_name"] + " " + cleaned_info["discipline_name"]

    cleaned_info.pop('discipline_name', None)
    cleaned_info.pop('participant_type', None)



    return cleaned_info

archery_df['cleaned_parsed'] = archery_df['parsed'].apply(clean_parsed_info)

archery_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'date': '2024-07-25 at 16:35:29 +0200', 'stage_code': 'arcmindivid-----------qual000100--', 'event_code': 'arcmindivid', 'event_name': 'men's individual archery', 'event_stage': 'men's individual ranking round', 'stage': 'ranking round', 'gender': 'm', 'discipline_code': 'arc', 'venue': 'invalides', 'participant_code': '1902135', 'participant_name': 'kim woojin (men's individual) in archery', 'participant_country_code': 'kor', 'participant_country': 'korea', 'rank': '1.0', 'result': '686', 'result_type': 'points', 'result_WLT': '', 'start_order': '8', 'bib': '4b'}"
1,"{'date': '2024-07-25 at 16:35:29 +0200', 'stage_code': 'arcmindivid-----------qual000100--', 'event_code': 'arcmindivid', 'event_name': 'men's individual archery', 'event_stage': 'men's individual ranking round', 'stage': 'ranking round', 'gender': 'm', 'discipline_code': 'arc', 'venue': 'invalides', 'participant_code': '1902130', 'participant_name': 'kim je deok (men's individual) in archery', 'participant_country_code': 'kor', 'participant_country': 'korea', 'rank': '2.0', 'result': '682', 'result_type': 'points', 'result_WLT': '', 'start_order': '10', 'bib': '5b'}"
2,"{'date': '2024-07-25 at 16:35:29 +0200', 'stage_code': 'arcmindivid-----------qual000100--', 'event_code': 'arcmindivid', 'event_name': 'men's individual archery', 'event_stage': 'men's individual ranking round', 'stage': 'ranking round', 'gender': 'm', 'discipline_code': 'arc', 'venue': 'invalides', 'participant_code': '1553249', 'participant_name': 'unruh florian (men's individual) in archery', 'participant_country_code': 'ger', 'participant_country': 'germany', 'rank': '3.0', 'result': '681', 'result_type': 'points', 'result_WLT': '', 'start_order': '37', 'bib': '19a'}"
3,"{'date': '2024-07-25 at 16:35:29 +0200', 'stage_code': 'arcmindivid-----------qual000100--', 'event_code': 'arcmindivid', 'event_name': 'men's individual archery', 'event_stage': 'men's individual ranking round', 'stage': 'ranking round', 'gender': 'm', 'discipline_code': 'arc', 'venue': 'invalides', 'participant_code': '1546108', 'participant_name': 'bommadevara dhiraj (men's individual) in archery', 'participant_country_code': 'ind', 'participant_country': 'india', 'rank': '4.0', 'result': '681', 'result_type': 'points', 'result_WLT': '', 'start_order': '24', 'bib': '12b'}"
4,"{'date': '2024-07-25 at 16:35:29 +0200', 'stage_code': 'arcmindivid-----------qual000100--', 'event_code': 'arcmindivid', 'event_name': 'men's individual archery', 'event_stage': 'men's individual ranking round', 'stage': 'ranking round', 'gender': 'm', 'discipline_code': 'arc', 'venue': 'invalides', 'participant_code': '1902144', 'participant_name': 'lee wooseok (men's individual) in archery', 'participant_country_code': 'kor', 'participant_country': 'korea', 'rank': '5.0', 'result': '681', 'result_type': 'points', 'result_WLT': '', 'start_order': '12', 'bib': '6b'}"


In [55]:
# Generating questions
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    participant_name = cleaned_data.get("participant_name", "this participant")

    if "result" in cleaned_data and cleaned_data["result"]:
        res = cleaned_data["result"]
        res_type = cleaned_data["result_type"]
        qa_entries.append({
            "context": context,
            "question": f"What was the result for {participant_name}?",
            "answer": f"{res} {res_type}"
        })
    if "result_type" in cleaned_data and cleaned_data["result_type"]:
        qa_entries.append({
            "context": context,
            "question": f"What type of result did {participant_name} achieve?",
            "answer": cleaned_data["result_type"]
        })
    if "result_WLT" in cleaned_data and cleaned_data["result_WLT"]:
        qa_entries.append({
            "context": context,
            "question": f"Did {participant_name} win, lose, or tie?",
            "answer": cleaned_data["result_WLT"]
        })
    if "event_name" in cleaned_data and cleaned_data["event_name"]:
        qa_entries.append({
            "context": context,
            "question": f"In which event did {participant_name} participate?",
            "answer": cleaned_data["event_name"]
        })
    if "discipline_name" in cleaned_data and cleaned_data["discipline_name"]:


        qa_entries.append({
            "context": context,
            "question": f"In which discipline did {participant_name} compete?",
            "answer": cleaned_data["discipline_name"]
        })
    if "venue" in cleaned_data and cleaned_data["venue"]:
        qa_entries.append({
            "context": context,
            "question": f"Where did {participant_name} compete?",
            "answer": cleaned_data["venue"]
        })
    if "rank" in cleaned_data and cleaned_data["rank"]:
        qa_entries.append({
            "context": context,
            "question": f"What was the rank of {participant_name}?",
            "answer": cleaned_data["rank"]
        })
    if "date" in cleaned_data and cleaned_data["date"]:
        qa_entries.append({
            "context": context,
            "question": f"When did {participant_name} participate?",
            "answer": cleaned_data["date"]
        })

    return qa_entries


qa_data = []
for _, row in archery_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_archery_df = pd.DataFrame(qa_data)

qa_archery_df.head()


def clean_answer(answer):
    if isinstance(answer, str):
        # First I'm removing brackets and quotes
        answer = re.sub(r"[\[\]\"']", "", answer).strip()
        # Then I'm going through and seeing if there are references at the end of the answer, and removing them if there are.
        answer = re.sub(r"\s*\(.*?\)$", "", answer).strip()
    return answer

qa_archery_df['answer'] = qa_archery_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/archery_qa.csv"
qa_archery_df.to_csv(output_csv_path, index=False)
print(f"QA data for Archery saved to {output_csv_path}")

QA data for Archery saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/archery_qa.csv


#Artistic Gymnastics

In [49]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/Results/Artistic_Gymnastics_processed.csv"
art_gym_df = pd.read_csv(file_path)
art_gym_df.head(), art_gym_df.info()

# Parsing
def parse_info(text):
    parsed = {}
    items = text.split(" | ")
    for item in items:
        if ": " in item:
            key, value = item.split(": ", 1)
            parsed[key.strip()] = value.strip()
    return parsed

art_gym_df['parsed'] = art_gym_df['text'].apply(parse_info)

art_gym_df['parsed'].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    247 non-null    object
dtypes: object(1)
memory usage: 2.1+ KB


Unnamed: 0,parsed
0,"{'date': '2024-07-31t20:13:54+02:00', 'stage_code': 'garm1aa---------------fnl-000001--', 'event_code': 'garm1aa', 'event_name': 'men's all-around', 'event_stage': 'men's all-around final', 'stage': 'final', 'gender': 'm', 'discipline_name': 'artistic gymnastics', 'discipline_code': 'gar', 'venue': 'bercy arena', 'participant_code': '1896190', 'participant_name': 'dunkel nils', 'participant_type': 'person', 'participant_country_code': 'ger', 'participant_country': 'germany', 'rank': '', 'result': '13.7', 'result_type': 'points', 'result_diff': '', 'start_order': '1.0', 'bib': '139.0'}"
1,"{'date': '2024-07-31t20:13:54+02:00', 'stage_code': 'garm1aa---------------fnl-000001--', 'event_code': 'garm1aa', 'event_name': 'men's all-around', 'event_stage': 'men's all-around final', 'stage': 'final', 'gender': 'm', 'discipline_name': 'artistic gymnastics', 'discipline_code': 'gar', 'venue': 'bercy arena', 'participant_code': '1970203', 'participant_name': 'rijken frank', 'participant_type': 'person', 'participant_country_code': 'ned', 'participant_country': 'netherlands', 'rank': '', 'result': '13.733', 'result_type': 'points', 'result_diff': '', 'start_order': '2.0', 'bib': '168.0'}"
2,"{'date': '2024-07-31t20:13:54+02:00', 'stage_code': 'garm1aa---------------fnl-000001--', 'event_code': 'garm1aa', 'event_name': 'men's all-around', 'event_stage': 'men's all-around final', 'stage': 'final', 'gender': 'm', 'discipline_name': 'artistic gymnastics', 'discipline_code': 'gar', 'venue': 'bercy arena', 'participant_code': '1973090', 'participant_name': 'dolci felix', 'participant_type': 'person', 'participant_country_code': 'can', 'participant_country': 'canada', 'rank': '', 'result': '14.366', 'result_type': 'points', 'result_diff': '', 'start_order': '3.0', 'bib': '112.0'}"
3,"{'date': '2024-07-31t20:13:54+02:00', 'stage_code': 'garm1aa---------------fnl-000001--', 'event_code': 'garm1aa', 'event_name': 'men's all-around', 'event_stage': 'men's all-around final', 'stage': 'final', 'gender': 'm', 'discipline_name': 'artistic gymnastics', 'discipline_code': 'gar', 'venue': 'bercy arena', 'participant_code': '1977699', 'participant_name': 'langenegger florian', 'participant_type': 'person', 'participant_country_code': 'sui', 'participant_country': 'switzerland', 'rank': '', 'result': '14.133', 'result_type': 'points', 'result_diff': '', 'start_order': '4.0', 'bib': '174.0'}"
4,"{'date': '2024-07-31t20:13:54+02:00', 'stage_code': 'garm1aa---------------fnl-000001--', 'event_code': 'garm1aa', 'event_name': 'men's all-around', 'event_stage': 'men's all-around final', 'stage': 'final', 'gender': 'm', 'discipline_name': 'artistic gymnastics', 'discipline_code': 'gar', 'venue': 'bercy arena', 'participant_code': '1953618', 'participant_name': 'soares diogo', 'participant_type': 'person', 'participant_country_code': 'bra', 'participant_country': 'brazil', 'rank': '', 'result': '14.5', 'result_type': 'points', 'result_diff': '', 'start_order': '5.0', 'bib': '108.0'}"


In [50]:
from datetime import datetime

def clean_parsed_info(parsed_info):
    cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}
    for date_field in ["date"]:
        if date_field in cleaned_info and cleaned_info[date_field]:
            date_obj = datetime.strptime(cleaned_info[date_field], "%Y-%m-%dT%H:%M:%S%z")
            cleaned_info[date_field] = date_obj.strftime("%Y-%m-%d at %H:%M:%S %z")

    # modifying participant name field to be more descriptive
    if "participant_name" in cleaned_info:
        participant_type = cleaned_info.get("event_name", "").lower()
        discipline_name = cleaned_info.get("discipline_name", "").lower()
        cleaned_info["participant_name"] = f"{cleaned_info['participant_name']} ({participant_type}) in {discipline_name}".strip()

    if "event_name" in cleaned_info and "discipline_name" in cleaned_info:
        cleaned_info["event_name"] = cleaned_info["event_name"] + " " + cleaned_info["discipline_name"]

    cleaned_info.pop('discipline_name', None)
    cleaned_info.pop('participant_type', None)



    return cleaned_info

art_gym_df['cleaned_parsed'] = art_gym_df['parsed'].apply(clean_parsed_info)

art_gym_df['cleaned_parsed'].head()

Unnamed: 0,cleaned_parsed
0,"{'date': '2024-07-31 at 20:13:54 +0200', 'stage_code': 'garm1aa---------------fnl-000001--', 'event_code': 'garm1aa', 'event_name': 'men's all-around artistic gymnastics', 'event_stage': 'men's all-around final', 'stage': 'final', 'gender': 'm', 'discipline_code': 'gar', 'venue': 'bercy arena', 'participant_code': '1896190', 'participant_name': 'dunkel nils (men's all-around) in artistic gymnastics', 'participant_country_code': 'ger', 'participant_country': 'germany', 'rank': '', 'result': '13.7', 'result_type': 'points', 'result_diff': '', 'start_order': '1.0', 'bib': '139.0'}"
1,"{'date': '2024-07-31 at 20:13:54 +0200', 'stage_code': 'garm1aa---------------fnl-000001--', 'event_code': 'garm1aa', 'event_name': 'men's all-around artistic gymnastics', 'event_stage': 'men's all-around final', 'stage': 'final', 'gender': 'm', 'discipline_code': 'gar', 'venue': 'bercy arena', 'participant_code': '1970203', 'participant_name': 'rijken frank (men's all-around) in artistic gymnastics', 'participant_country_code': 'ned', 'participant_country': 'netherlands', 'rank': '', 'result': '13.733', 'result_type': 'points', 'result_diff': '', 'start_order': '2.0', 'bib': '168.0'}"
2,"{'date': '2024-07-31 at 20:13:54 +0200', 'stage_code': 'garm1aa---------------fnl-000001--', 'event_code': 'garm1aa', 'event_name': 'men's all-around artistic gymnastics', 'event_stage': 'men's all-around final', 'stage': 'final', 'gender': 'm', 'discipline_code': 'gar', 'venue': 'bercy arena', 'participant_code': '1973090', 'participant_name': 'dolci felix (men's all-around) in artistic gymnastics', 'participant_country_code': 'can', 'participant_country': 'canada', 'rank': '', 'result': '14.366', 'result_type': 'points', 'result_diff': '', 'start_order': '3.0', 'bib': '112.0'}"
3,"{'date': '2024-07-31 at 20:13:54 +0200', 'stage_code': 'garm1aa---------------fnl-000001--', 'event_code': 'garm1aa', 'event_name': 'men's all-around artistic gymnastics', 'event_stage': 'men's all-around final', 'stage': 'final', 'gender': 'm', 'discipline_code': 'gar', 'venue': 'bercy arena', 'participant_code': '1977699', 'participant_name': 'langenegger florian (men's all-around) in artistic gymnastics', 'participant_country_code': 'sui', 'participant_country': 'switzerland', 'rank': '', 'result': '14.133', 'result_type': 'points', 'result_diff': '', 'start_order': '4.0', 'bib': '174.0'}"
4,"{'date': '2024-07-31 at 20:13:54 +0200', 'stage_code': 'garm1aa---------------fnl-000001--', 'event_code': 'garm1aa', 'event_name': 'men's all-around artistic gymnastics', 'event_stage': 'men's all-around final', 'stage': 'final', 'gender': 'm', 'discipline_code': 'gar', 'venue': 'bercy arena', 'participant_code': '1953618', 'participant_name': 'soares diogo (men's all-around) in artistic gymnastics', 'participant_country_code': 'bra', 'participant_country': 'brazil', 'rank': '', 'result': '14.5', 'result_type': 'points', 'result_diff': '', 'start_order': '5.0', 'bib': '108.0'}"


In [52]:
# Generating questions
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    participant_name = cleaned_data.get("participant_name", "this participant")

    if "result" in cleaned_data and cleaned_data["result"]:
        res = cleaned_data["result"]
        res_type = cleaned_data["result_type"]
        qa_entries.append({
            "context": context,
            "question": f"What was the result for {participant_name}?",
            "answer": f"{res} {res_type}"
        })
    if "result_type" in cleaned_data and cleaned_data["result_type"]:
        qa_entries.append({
            "context": context,
            "question": f"What type of result did {participant_name} achieve?",
            "answer": cleaned_data["result_type"]
        })
    if "result_WLT" in cleaned_data and cleaned_data["result_WLT"]:
        qa_entries.append({
            "context": context,
            "question": f"Did {participant_name} win, lose, or tie?",
            "answer": cleaned_data["result_WLT"]
        })
    if "event_name" in cleaned_data and cleaned_data["event_name"]:
        qa_entries.append({
            "context": context,
            "question": f"In which event did {participant_name} participate?",
            "answer": cleaned_data["event_name"]
        })
    if "discipline_name" in cleaned_data and cleaned_data["discipline_name"]:
        qa_entries.append({
            "context": context,
            "question": f"In which discipline did {participant_name} compete?",
            "answer": cleaned_data["discipline_name"]
        })
    if "venue" in cleaned_data and cleaned_data["venue"]:
        qa_entries.append({
            "context": context,
            "question": f"Where did {participant_name} compete?",
            "answer": cleaned_data["venue"]
        })
    if "rank" in cleaned_data and cleaned_data["rank"]:
        qa_entries.append({
            "context": context,
            "question": f"What was the rank of {participant_name}?",
            "answer": cleaned_data["rank"]
        })
    if "date" in cleaned_data and cleaned_data["date"]:
        qa_entries.append({
            "context": context,
            "question": f"When did {participant_name} participate?",
            "answer": cleaned_data["date"]
        })

    return qa_entries


qa_data = []
for _, row in art_gym_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_art_gym_df = pd.DataFrame(qa_data)

qa_art_gym_df.head()


def clean_answer(answer):
    if isinstance(answer, str):
        # First I'm removing brackets and quotes
        answer = re.sub(r"[\[\]\"']", "", answer).strip()
        # Then I'm going through and seeing if there are references at the end of the answer, and removing them if there are.
        answer = re.sub(r"\s*\(.*?\)$", "", answer).strip()
    return answer

qa_art_gym_df['answer'] = qa_art_gym_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/artistic_gymnastics_qa.csv"
qa_art_gym_df.to_csv(output_csv_path, index=False)
print(f"QA data for artistic gymnastics saved to {output_csv_path}")

QA data for artistic gymnastics saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/artistic_gymnastics_qa.csv


#Artistic Swimming

In [56]:
file_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/Results/Artistic_Swimming_processed.csv"
art_swim_df = pd.read_csv(file_path)
art_swim_df.head(), art_swim_df.info()

# Parsing
def parse_info(text):
    parsed = {}
    items = text.split(" | ")
    for item in items:
        if ": " in item:
            key, value = item.split(": ", 1)
            parsed[key.strip()] = value.strip()
    return parsed

art_swim_df['parsed'] = art_swim_df['text'].apply(parse_info)

art_swim_df['parsed'].head()

from datetime import datetime

def clean_parsed_info(parsed_info):
    cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}
    for date_field in ["date"]:
        if date_field in cleaned_info and cleaned_info[date_field]:
            date_obj = datetime.strptime(cleaned_info[date_field], "%Y-%m-%dT%H:%M:%S%z")
            cleaned_info[date_field] = date_obj.strftime("%Y-%m-%d at %H:%M:%S %z")

    # modifying participant name field to be more descriptive
    if "participant_name" in cleaned_info:
        participant_type = cleaned_info.get("event_name", "").lower()
        discipline_name = cleaned_info.get("discipline_name", "").lower()
        cleaned_info["participant_name"] = f"{cleaned_info['participant_name']} ({participant_type}) in {discipline_name}".strip()

    if "event_name" in cleaned_info and "discipline_name" in cleaned_info:
        cleaned_info["event_name"] = cleaned_info["event_name"] + " " + cleaned_info["discipline_name"]

    cleaned_info.pop('discipline_name', None)
    cleaned_info.pop('participant_type', None)



    return cleaned_info

art_swim_df['cleaned_parsed'] = art_swim_df['parsed'].apply(clean_parsed_info)

art_swim_df['cleaned_parsed'].head()

# Generating questions
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    participant_name = cleaned_data.get("participant_name", "this participant")

    if "result" in cleaned_data and cleaned_data["result"]:
        res = cleaned_data["result"]
        res_type = cleaned_data["result_type"]
        qa_entries.append({
            "context": context,
            "question": f"What was the result for {participant_name}?",
            "answer": f"{res} {res_type}"
        })
    if "result_type" in cleaned_data and cleaned_data["result_type"]:
        qa_entries.append({
            "context": context,
            "question": f"What type of result did {participant_name} achieve?",
            "answer": cleaned_data["result_type"]
        })
    if "result_WLT" in cleaned_data and cleaned_data["result_WLT"]:
        qa_entries.append({
            "context": context,
            "question": f"Did {participant_name} win, lose, or tie?",
            "answer": cleaned_data["result_WLT"]
        })
    if "event_name" in cleaned_data and cleaned_data["event_name"]:
        qa_entries.append({
            "context": context,
            "question": f"In which event did {participant_name} participate?",
            "answer": cleaned_data["event_name"]
        })
    if "discipline_name" in cleaned_data and cleaned_data["discipline_name"]:
        qa_entries.append({
            "context": context,
            "question": f"In which discipline did {participant_name} compete?",
            "answer": cleaned_data["discipline_name"]
        })
    if "venue" in cleaned_data and cleaned_data["venue"]:
        qa_entries.append({
            "context": context,
            "question": f"Where did {participant_name} compete?",
            "answer": cleaned_data["venue"]
        })
    if "rank" in cleaned_data and cleaned_data["rank"]:
        qa_entries.append({
            "context": context,
            "question": f"What was the rank of {participant_name}?",
            "answer": cleaned_data["rank"]
        })
    if "date" in cleaned_data and cleaned_data["date"]:
        qa_entries.append({
            "context": context,
            "question": f"When did {participant_name} participate?",
            "answer": cleaned_data["date"]
        })

    return qa_entries


qa_data = []
for _, row in art_swim_df.iterrows():
    cleaned_data = row['cleaned_parsed']
    qa_entries = generate_qa_entries(cleaned_data)
    qa_data.extend(qa_entries)

qa_art_swim_df = pd.DataFrame(qa_data)

qa_art_swim_df.head()


def clean_answer(answer):
    if isinstance(answer, str):
        # First I'm removing brackets and quotes
        answer = re.sub(r"[\[\]\"']", "", answer).strip()
        # Then I'm going through and seeing if there are references at the end of the answer, and removing them if there are.
        answer = re.sub(r"\s*\(.*?\)$", "", answer).strip()
    return answer

qa_art_swim_df['answer'] = qa_art_swim_df['answer'].apply(clean_answer)

output_csv_path = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/artistic_swimming_qa.csv"
qa_art_swim_df.to_csv(output_csv_path, index=False)
print(f"QA data for artistic swimming saved to {output_csv_path}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    93 non-null     object
dtypes: object(1)
memory usage: 872.0+ bytes
QA data for artistic swimming saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/artistic_swimming_qa.csv


#Athletics_processed, Badminton_processed, Basketball_processed, Beach_Volleyball_processed, Boxing_processed, Canoe_Slalom_processed, Canoe_Sprint_processed, Cycling_BMX_Freestyle_processed, Cycling_BMX_Racing_processed, Cycling_Mountain_Bike_processed, Cycling_Road_processed, Cycling_Track_processed, Diving_processed, Equestrian_processed, Fencing_processed, Football_processed, Golf_processed, Handball_processed, Hockey_processed, Judo_processed, Marathon_Swimming_processed, Modern_Pentathlon_processed, Rhythmic_Gymnastics_processed, Rowing_processed, Rugby_Sevens_processed, Sailing_processed, Shooting_processed, Skateboarding_processed, Sport_Climbing_processed, Surfing_processed, Swimming_processed, Table_Tennis_processed, Taekwondo_processed, Tennis_processed, Trampoline_Gymnastics_processed, Triathlon_processed, Volleyball_processed, Water_Polo_processed, Weightlifting_processed, Wrestling_processed

The results files are all very similar, and I've got a basic formula worked out that should do for most (I'll check afterward and modify them individually as necessary). So, I've made a loop to process them all :)

In [7]:
# Parsing
def parse_info(text):
    parsed = {}
    items = text.split(" | ")
    for item in items:
        if ": " in item:
            key, value = item.split(": ", 1)
            parsed[key.strip()] = value.strip()
    return parsed


def clean_parsed_info(parsed_info):
    cleaned_info = {key: value.strip(" |") for key, value in parsed_info.items()}
    for date_field in ["date"]:
        if date_field in cleaned_info and cleaned_info[date_field]:
            date_obj = datetime.strptime(cleaned_info[date_field], "%Y-%m-%dT%H:%M:%S%z")
            cleaned_info[date_field] = date_obj.strftime("%Y-%m-%d at %H:%M:%S %z")

    # modifying participant name field to be more descriptive
    if "participant_name" in cleaned_info:
        participant_type = cleaned_info.get("event_name", "").lower()
        discipline_name = cleaned_info.get("discipline_name", "").lower()
        cleaned_info["participant_name"] = f"{cleaned_info['participant_name']} ({participant_type}) in {discipline_name}".strip()

    if "event_name" in cleaned_info and "discipline_name" in cleaned_info:
        cleaned_info["event_name"] = cleaned_info["event_name"] + " " + cleaned_info["discipline_name"]

    cleaned_info.pop('discipline_name', None)
    cleaned_info.pop('participant_type', None)

    return cleaned_info



# Generating questions
def generate_qa_entries(cleaned_data):
    context = " | ".join([f"{key}: {value}" for key, value in cleaned_data.items() if value])
    qa_entries = []
    participant_name = cleaned_data.get("participant_name", "this participant")

    if "result" in cleaned_data and cleaned_data["result"]:
        res = cleaned_data["result"]
        if "result_type" in cleaned_data and cleaned_data["result_type"]:
            res_type = cleaned_data["result_type"]
            full_result = f"{res} ({res_type})"
        else:
            full_result = res
        qa_entries.append({
            "context": context,
            "question": f"What was the result for {participant_name}?",
            "answer": f"{full_result}"
        })
    if "result_type" in cleaned_data and cleaned_data["result_type"]:
        qa_entries.append({
            "context": context,
            "question": f"What type of result did {participant_name} achieve?",
            "answer": cleaned_data["result_type"]
        })
    if "result_WLT" in cleaned_data and cleaned_data["result_WLT"]: # I modified this to map w to win, l to lose and t to tie :)
        result_wlt_mapping = {"w": "win", "l": "lose", "t": "tie"}
        result_wlt_full = result_wlt_mapping.get(cleaned_data["result_WLT"].lower(), cleaned_data["result_WLT"])
        qa_entries.append({
            "context": context,
            "question": f"Did {participant_name} win, lose, or tie?",
            "answer": result_wlt_full
        })
    if "event_name" in cleaned_data and cleaned_data["event_name"]:
        qa_entries.append({
            "context": context,
            "question": f"In which event did {participant_name} participate?",
            "answer": cleaned_data["event_name"]
        })
    if "discipline_name" in cleaned_data and cleaned_data["discipline_name"]:
        qa_entries.append({
            "context": context,
            "question": f"In which discipline did {participant_name} compete?",
            "answer": cleaned_data["discipline_name"]
        })
    if "venue" in cleaned_data and cleaned_data["venue"]:
        qa_entries.append({
            "context": context,
            "question": f"Where did {participant_name} compete?",
            "answer": cleaned_data["venue"]
        })
    if "rank" in cleaned_data and cleaned_data["rank"]:
        qa_entries.append({
            "context": context,
            "question": f"What was the rank of {participant_name}?",
            "answer": cleaned_data["rank"]
        })
    if "date" in cleaned_data and cleaned_data["date"]:
        qa_entries.append({
            "context": context,
            "question": f"When did {participant_name} participate?",
            "answer": cleaned_data["date"]
        })

    return qa_entries

def clean_answer(answer):
    if isinstance(answer, str):
        # First I'm removing brackets and quotes
        answer = re.sub(r"[\[\]\"']", "", answer).strip()
        # Now i'm not blanket getting rid of parenthesis so I can have the formatting of result (result type)
        if re.match(r".+\s\(.+\)", answer):  # Now i'll keep the parenthesis for specific cases
            return answer.strip()
        # And otherwise remove the parenthesis
        answer = re.sub(r"\s*\(.*?\)$", "", answer).strip()
    return answer

processed_files = [
    "Athletics_processed", "Badminton_processed", "Basketball_processed",
    "Beach_Volleyball_processed", "Boxing_processed", "Canoe_Slalom_processed",
    "Canoe_Sprint_processed", "Cycling_BMX_Freestyle_processed",
    "Cycling_BMX_Racing_processed", "Cycling_Mountain_Bike_processed",
    "Cycling_Road_processed", "Cycling_Track_processed", "Diving_processed",
    "Equestrian_processed", "Fencing_processed", "Football_processed",
    "Golf_processed", "Handball_processed", "Hockey_processed",
    "Judo_processed", "Marathon_Swimming_processed", "Modern_Pentathlon_processed",
    "Rhythmic_Gymnastics_processed", "Rowing_processed", "Rugby_Sevens_processed",
    "Sailing_processed", "Shooting_processed", "Skateboarding_processed",
    "Sport_Climbing_processed", "Surfing_processed", "Swimming_processed",
    "Table_Tennis_processed", "Taekwondo_processed", "Tennis_processed",
    "Trampoline_Gymnastics_processed", "Triathlon_processed",
    "Volleyball_processed", "Water_Polo_processed", "Weightlifting_processed",
    "Wrestling_processed"
]

input_folder = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/Processed_Data/Results/"
output_folder = "/content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/"

for file_name in processed_files:
    file_path = f"{input_folder}{file_name}.csv"
    df = pd.read_csv(file_path)

    # Parsing and cleaning
    df['parsed'] = df['text'].apply(parse_info)
    df['cleaned_parsed'] = df['parsed'].apply(clean_parsed_info)

    # Generating QA data
    qa_data = []
    for _, row in df.iterrows():
        cleaned_data = row['cleaned_parsed']
        qa_entries = generate_qa_entries(cleaned_data)
        qa_data.extend(qa_entries)

    qa_df = pd.DataFrame(qa_data)
    qa_df['answer'] = qa_df['answer'].apply(clean_answer)

    # Save to CSV
    output_path = f"{output_folder}{file_name}_qa.csv"
    qa_df.to_csv(output_path, index=False)
    print(f"QA data for {file_name} saved to {output_path}")

QA data for Athletics_processed saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/Athletics_processed_qa.csv
QA data for Badminton_processed saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/Badminton_processed_qa.csv
QA data for Basketball_processed saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/Basketball_processed_qa.csv
QA data for Beach_Volleyball_processed saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/Beach_Volleyball_processed_qa.csv
QA data for Boxing_processed saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/Boxing_processed_qa.csv
QA data for Canoe_Slalom_processed saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/Canoe_Slalom_processed_qa.csv
QA data for Canoe_Sprint_processed saved to /content/drive/MyDrive/Courses/CIS531/Term_Project/olympics/qa_data/results/

##Done!! :) :) The data is now in a good QA format to train a chatbot on :)