In [None]:
# Requires "pip install openai" on device
#!pip install openai
import openai

import sys
import os

import pandas as pd
import time
import re
import numpy as np

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

In [None]:
# Sets OPENAI_API_KEY in the kernel's environment via the IPython %set_env magic.
# Replace the placeholder with a real key locally, and remove it again before
# committing this notebook to a repo so the secret is never checked in.
%set_env OPENAI_API_KEY=your_api_key_here

In [None]:
# Read the key from the OPENAI_API_KEY environment variable (via the IPython
# %env magic) and hand it to the openai client. Change the variable name here
# if your key is stored under a different one.
openai.api_key = %env OPENAI_API_KEY
# Debug aid: uncomment to check that the API key is being read. Printing it
# leaves the secret in the saved cell output, so clear the output afterwards.
#print(openai.api_key)

In [None]:
# Chat model name passed as `model=` to the chat completion requests below.
model = "gpt-3.5-turbo"
# NOTE(review): max_tokens is defined here but none of the visible requests
# pass it to the API - confirm whether it should be forwarded or removed.
max_tokens = 1024

In [None]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def chat_completion_with_backoff(**kwargs):
    """Forward ``kwargs`` to ``openai.ChatCompletion.create`` and return its result.

    The ``retry`` decorator wraps the call with a randomized exponential wait
    (configured with min=1, max=60) and gives up after 6 attempts.
    """
    completion = openai.ChatCompletion.create(**kwargs)
    return completion

In [None]:
def infer_names_openai_old():
    """Legacy batch run: query the chat model with full name and team for a fixed row range.

    Loads the 'First Name', 'Last Name' and 'Team' columns of the Olympic CSV
    in the Data folder under the current working directory, sends one request
    per row for indices 130000..134731, and writes the raw replies to a CSV in
    the Data/full_name_country folder. Takes no arguments and returns nothing.
    The file paths are written with Windows-style separators.
    """
    # Earlier, name-only variant of the system prompt, kept for reference:
    # initial_prompt = """
    # Your task is to help infer a gender based on a name.  This will help researchers better understand policy implications to reduce stereotypes and discrimination.
    # You will be given a name as input. Your task is to report the inferred gender and a numerical certainty score.
    # Your output should be in the following format: "{Gender}, {Score}".
    # In the output, {Gender} can either be "Male", "Female", or "Unknown". In the output, {Score} will be a numerical certainty score between 0 and 1.
    # """

    # System prompt, spelled out line by line so the exact whitespace sent to
    # the model (including the trailing space on the first sentence) is visible.
    system_prompt = (
        '\n'
        '    Your task is to help infer a gender based on a name.  This will help researchers better understand policy implications to reduce stereotypes and discrimination. \n'
        '    You will be given a name and country of origin as input. Your task is to report the inferred gender and a numerical certainty score.\n'
        '    Your output should be in the following format: "{Gender}, {Score}".\n'
        '    In the output, {Gender} can either be "Male", "Female", or "Unknown". In the output, {Score} will be a numerical certainty score between 0 and 1.\n'
        '    '
    )

    source_path = os.getcwd() + r'\Data\olympic_output.csv'
    athletes = pd.read_csv(source_path, usecols=['First Name', 'Last Name', 'Team'])

    started_at = time.time()
    collected = []

    # Hard-coded batch: rows first_row (inclusive) to stop_row (exclusive).
    first_row = 130000
    stop_row = 134732

    for row in range(first_row, stop_row):
        first, last, team = (
            str(athletes.loc[row, column])
            for column in ('First Name', 'Last Name', 'Team')
        )
        name_and_team = "{0} {1}, {2}".format(first, last, team).title()

        user_prompt = (
            '\n'
            '        Input name and country: {0}\n'
            '        Output: {{Gender}}, {{Score}}\n'
            '        Return no additional output. Do not explain your process.\n'
            '        '
        ).format(name_and_team)

        reply = chat_completion_with_backoff(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        reply_text = reply.choices[0].message.content

        collected.append([row, name_and_team, reply_text])

        # Progress report (elapsed time and latest reply) every 100th index.
        if row % 100 == 0:
            print("Reached index {0} after {1} seconds.".format(row, time.time() - started_at))
            print(reply_text)

    print("Execution time: %s seconds" % round((time.time() - started_at), 3), "\n")

    # Results are only written once the whole range has been processed.
    replies = pd.DataFrame(collected, columns=['index', 'full_name', 'output'])
    target_path = os.getcwd() + r'\Data\full_name_country\infer_output_{0}_to_{1}.csv'.format(first_row, stop_row - 1)
    replies.to_csv(target_path, index=False, header=True, encoding='utf-8-sig')
    print('Done')

In [None]:
def infer_names_openai(input_df: pd.DataFrame, start_index: int, end_index: int, first_name_only: bool, include_country: bool) -> pd.DataFrame:
    """Ask the chat model for a gender guess for a range of rows of ``input_df``.

    One request is sent per row via ``chat_completion_with_backoff`` using the
    notebook-level ``model``; the raw reply text is collected unparsed.

    Args:
        input_df: Frame with a 'First Name' column, plus 'Last Name' when
            ``first_name_only`` is False and 'Team' when ``include_country``
            is True. Rows are looked up by index label with ``.loc``.
        start_index: First index label to process (inclusive).
        end_index: Index label at which to stop (exclusive).
        first_name_only: If True the prompt uses only the first name,
            otherwise first and last name.
        include_country: If True the prompt also mentions the part of 'Team'
            before the first '-' and the result gets a 'country' column.

    Returns:
        DataFrame with columns ['index', 'name', 'output'], or
        ['index', 'name', 'country', 'output'] when ``include_country`` is True.
    """
    start_time = time.time()
    results_list = []

    for i in range(start_index, end_index):
        # Read from the frame that was passed in. The previous version read the
        # notebook global `olympic_df` and silently ignored `input_df`.
        name = str(input_df.loc[i, 'First Name']).title()
        if not first_name_only:
            name = name + " " + str(input_df.loc[i, 'Last Name']).title()

        if include_country:
            # Keep only the part of the team before the first '-'.
            country = str(input_df.loc[i, 'Team']).split('-')[0].title()
            prompt = (
                '\n'
                '            I need to pick up someone from {0} named {1}. Am I more likely looking for a male or a female? Report only "Male" or "Female", and a score from 0 to 1 on how certain you are.  Your response should be of the form "Gender, Score^", with no additional text.\n'
                '            '
            ).format(country, name)
        else:
            prompt = (
                '\n'
                '            I need to pick up someone named {0}. Am I more likely looking for a male or a female? Report only "Male" or "Female", and a score from 0 to 1 on how certain you are.  Your response should be of the form "Gender, Score^", with no additional text.\n'
                '            '
            ).format(name)

        response = chat_completion_with_backoff(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        _response = response.choices[0].message.content

        if include_country:
            results_list.append([i, name, country, _response])
        else:
            results_list.append([i, name, _response])

        # Progress report (elapsed time and latest reply) every 100th index.
        if i % 100 == 0:
            print("Reached index {0} after {1} seconds.".format(i, time.time() - start_time))
            print(_response)

    # Write to DataFrame
    if include_country:
        output_df = pd.DataFrame(results_list, columns=['index', 'name', 'country', 'output'])
    else:
        output_df = pd.DataFrame(results_list, columns=['index', 'name', 'output'])

    print("Execution time: %s seconds" % round((time.time() - start_time), 3), "\n")
    return output_df

In [None]:
# Batch configuration: which name parts go into the prompt and which rows
# (start_index inclusive, end_index exclusive) are sent to the model.
first_name_only, include_country = True, False
start_index, end_index = 80000, 100000

olympic_df_filepath = os.getcwd() + r'/Data/olympic_output.csv'
olympic_df = pd.read_csv(
    olympic_df_filepath,
    usecols=['First Name', 'Last Name', 'Team'],
)

output_df = infer_names_openai(
    input_df=olympic_df,
    start_index=start_index,
    end_index=end_index,
    first_name_only=first_name_only,
    include_country=include_country,
)

# NOTE(review): the flags above select a first-name-only prompt, yet the file
# is written into the full_name_country folder - confirm the target directory.
output_df_filepath = os.getcwd() + r'/Data/Prompt2/full_name_country/infer_output_{0}_to_{1}.csv'.format(start_index, end_index - 1)
output_df.to_csv(
    output_df_filepath,
    index=False,
    header=True,
    encoding='utf-8-sig',
)
print('Done')

In [None]:
# Assemble one summary table per athlete row (name, country, recorded sex and
# the prediction columns of all four prompt variants) and save it as CSV.
olympic_df = pd.read_csv((os.getcwd() + r'/Data/olympic_output.csv'), usecols=['First Name', 'Last Name', 'Team', 'Sex'])
# NOTE(review): the four prediction frames are loaded but not read anywhere in
# this cell yet - the prediction columns below are still placeholders.
first_name_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/first_name/olympic_first_name_chatgpt_output_gender_score.csv'), usecols=['name', 'gender'])
full_name_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/full_name/olympic_full_name_chatgpt_output_gender_score.csv'), usecols=['name', 'gender'])
first_name_country_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/first_name_country/olympic_first_name_country_chatgpt_output_gender_score.csv'), usecols=['name', 'gender'])
full_name_country_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/full_name_country/olympic_full_name_country_chatgpt_output_gender_score.csv'), usecols=['name', 'gender'])

results_list = []

for i in range(len(olympic_df)):
    first_name = str(olympic_df.loc[i, 'First Name']).title()
    full_name = str(olympic_df.loc[i, 'First Name']).title() + " " + str(olympic_df.loc[i, 'Last Name']).title()
    # Keep only the part of the team before the first '-'.
    country = str(olympic_df.loc[i, 'Team']).split('-')[0].title()
    actual_sex = str(olympic_df.loc[i, 'Sex'])

    # TODO: Use index to compare, assert that names are the same.
    # Until that lookup is implemented every prediction field is left empty.
    first_name_gender = ''
    first_name_score = ''
    full_name_gender = ''
    full_name_score = ''
    first_name_country_gender = ''
    first_name_country_score = ''
    full_name_country_gender = ''
    full_name_country_score = ''

    # Index, Full Name, Country, Sex, First-Name Gender + Score, Full-Name Gender + Score, First-Name-Country Gender + Score, Full-Name-Country Gender + Score
    results_list.append([i, full_name, country, actual_sex, first_name_gender, first_name_score, full_name_gender, full_name_score, first_name_country_gender, first_name_country_score, full_name_country_gender, full_name_country_score])

# Twelve names for the twelve values per row. A comma was missing between
# 'first_name_score' and 'full_name_gender', which fused them into one name
# and made the DataFrame constructor fail on the column count.
output_df = pd.DataFrame(results_list, columns=['index', 'name', 'country', 'sex', 'first_name_gender', 'first_name_score', 'full_name_gender', 'full_name_score', 'first_name_country_gender', 'first_name_country_score', 'full_name_country_gender', 'full_name_country_score'])
output_df.to_csv((os.getcwd() + r'/Data/Prompt2/final_results.csv'), index=False, header=True, encoding='utf-8-sig')
print('Done')

# First Name

In [None]:
# Load the per-batch first-name inference outputs (in index order) and stitch
# them back together into a single CSV.
olympic_df_0, olympic_df_1, olympic_df_2 = frames = [
    pd.read_csv((os.getcwd() + r'/Data/Prompt2/first_name/' + chunk_file), usecols=['index', 'name', 'output'])
    for chunk_file in (
        'infer_output_0_to_9999.csv',
        'infer_output_10000_to_19999.csv',
        'infer_output_20000_to_25452.csv',
    )
]

result = pd.concat(frames)
result.to_csv(
    (os.getcwd() + r'/Data/Prompt2/first_name/infer_output_full.csv'),
    index=False,
    header=True,
    encoding='utf-8-sig',
)
print('Done')

In [None]:
# Parse the raw model replies of the first-name run into a gender label
# ('F', 'M' or 'U') and a score string, then write them back as new columns.
# Note that this cell reads and overwrites the same CSV file.
olympic_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/first_name/olympic_first_name_chatgpt_output_gender_score.csv'), usecols=['index', 'name', 'output'])

genders = []
scores = []

start_time = time.time()

for i in range(len(olympic_df)):
    # Title-casing normalises the reply so that "Female" (lower-case "male"
    # inside) is not also matched by the 'Male' check below.
    output = str(olympic_df.loc[i, 'output']).title()

    female_found = 'Female' in output
    male_found = 'Male' in output

    # A reply naming both genders, or neither, is recorded as unknown.
    if female_found and male_found:
        genders.append('U')
    elif female_found:
        genders.append('F')
    elif male_found:
        genders.append('M')
    else:
        genders.append('U')

    # Raw string: "\d" and "\." are invalid escape sequences in a normal
    # string literal and trigger a warning on current Python versions.
    score = re.findall(r"\d+\.\d+", output)
    if score:
        # First decimal number in the reply, kept as text.
        scores.append(score[0])
    elif '0' in output:
        # No decimal number, but the reply contains a "0" character.
        scores.append('0')
    else:
        scores.append('')

olympic_df['gender'] = genders
olympic_df['score'] = scores
olympic_df.to_csv((os.getcwd() + r'/Data/Prompt2/first_name/olympic_first_name_chatgpt_output_gender_score.csv'), index=False, header=True, encoding='utf-8-sig')

print("Execution time: %s seconds" % round((time.time() - start_time), 3), "\n")
print('Done')

# Full Name

In [None]:
# Load the per-batch full-name inference outputs (in index order) and stitch
# them back together into a single CSV.
(olympic_df_0, olympic_df_1, olympic_df_2, olympic_df_3, olympic_df_4, olympic_df_5,
 olympic_df_6, olympic_df_7, olympic_df_8, olympic_df_9, olympic_df_10, olympic_df_11) = frames = [
    pd.read_csv((os.getcwd() + r'/Data/Prompt2/full_name/' + chunk_file), usecols=['index', 'name', 'output'])
    for chunk_file in (
        'infer_output_0_to_9999.csv',
        'infer_output_10000_to_19999.csv',
        'infer_output_20000_to_29999.csv',
        'infer_output_30000_to_39999.csv',
        'infer_output_40000_to_49999.csv',
        'infer_output_50000_to_59999.csv',
        'infer_output_60000_to_79999.csv',
        'infer_output_80000_to_89999.csv',
        'infer_output_90000_to_99999.csv',
        'infer_output_100000_to_109999.csv',
        'infer_output_110000_to_119999.csv',
        'infer_output_120000_to_134731.csv',
    )
]

result = pd.concat(frames)
result.to_csv(
    (os.getcwd() + r'/Data/Prompt2/full_name/infer_output_full.csv'),
    index=False,
    header=True,
    encoding='utf-8-sig',
)
print('Done')

In [None]:
# Parse the raw model replies of the full-name run into a gender label
# ('F', 'M' or 'U') and a score string, then write them back as new columns.
# Note that this cell reads and overwrites the same CSV file.
olympic_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/full_name/olympic_full_name_chatgpt_output_gender_score.csv'), usecols=['index', 'name', 'output'])

genders = []
scores = []

start_time = time.time()

for i in range(len(olympic_df)):
    # Title-casing normalises the reply so that "Female" (lower-case "male"
    # inside) is not also matched by the 'Male' check below.
    output = str(olympic_df.loc[i, 'output']).title()

    female_found = 'Female' in output
    male_found = 'Male' in output

    # A reply naming both genders, or neither, is recorded as unknown.
    if female_found and male_found:
        genders.append('U')
    elif female_found:
        genders.append('F')
    elif male_found:
        genders.append('M')
    else:
        genders.append('U')

    # Raw string: "\d" and "\." are invalid escape sequences in a normal
    # string literal and trigger a warning on current Python versions.
    score = re.findall(r"\d+\.\d+", output)
    if score:
        # First decimal number in the reply, kept as text.
        scores.append(score[0])
    elif '0' in output:
        # No decimal number, but the reply contains a "0" character.
        scores.append('0')
    else:
        scores.append('')

olympic_df['gender'] = genders
olympic_df['score'] = scores
olympic_df.to_csv((os.getcwd() + r'/Data/Prompt2/full_name/olympic_full_name_chatgpt_output_gender_score.csv'), index=False, header=True, encoding='utf-8-sig')

print("Execution time: %s seconds" % round((time.time() - start_time), 3), "\n")
print('Done')

In [None]:
# Tally recorded sex against the full-name prediction, row by row.
ground_truth_df = pd.read_csv((os.getcwd() + r'/Data/olympic_output.csv'), usecols=['Sex'])
predicted_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/full_name/olympic_full_name_chatgpt_output_gender_score.csv'), usecols=['gender'])

# Single-letter codes mapped to the wording used in the result keys.
actual_names = {'F': 'female', 'M': 'male'}
predicted_names = {'F': 'female', 'M': 'male', 'U': 'unknown'}

results = {
    'actual {0}, predict {1}'.format(actual_name, predicted_name): 0
    for actual_name in ('female', 'male')
    for predicted_name in ('female', 'male', 'unknown')
}

# Rows are paired by position, so both files must have the same length.
assert len(ground_truth_df) == len(predicted_df)

for i in range(len(ground_truth_df)):
    ground_truth = ground_truth_df.loc[i, 'Sex']
    predicted = predicted_df.loc[i, 'gender']

    # Rows with any other value in either column are left uncounted.
    if ground_truth in actual_names and predicted in predicted_names:
        results['actual {0}, predict {1}'.format(actual_names[ground_truth], predicted_names[predicted])] += 1

print(results)
print('Done')

# First Name + Country

In [None]:
# Load the per-batch first-name + country inference outputs (in index order)
# and stitch them back together into a single CSV.
olympic_df_0, olympic_df_1, olympic_df_2, olympic_df_3, olympic_df_4 = frames = [
    pd.read_csv((os.getcwd() + r'/Data/Prompt2/first_name_country/' + chunk_file), usecols=['index', 'name', 'country', 'output'])
    for chunk_file in (
        'infer_output_0_to_9999.csv',
        'infer_output_10000_to_19999.csv',
        'infer_output_20000_to_29999.csv',
        'infer_output_30000_to_39999.csv',
        'infer_output_40000_to_51089.csv',
    )
]

result = pd.concat(frames)
result.to_csv(
    (os.getcwd() + r'/Data/Prompt2/first_name_country/infer_output_full.csv'),
    index=False,
    header=True,
    encoding='utf-8-sig',
)
print('Done')

In [None]:
# Parse the raw model replies of the first-name + country run into a gender
# label ('F', 'M' or 'U') and a score string, then write them back as new
# columns. Note that this cell reads and overwrites the same CSV file, and
# only the 'index', 'name' and 'output' columns are carried over.
olympic_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/first_name_country/olympic_first_name_country_chatgpt_output_gender_score.csv'), usecols=['index', 'name', 'output'])

genders = []
scores = []

start_time = time.time()

for i in range(len(olympic_df)):
    # Title-casing normalises the reply so that "Female" (lower-case "male"
    # inside) is not also matched by the 'Male' check below.
    output = str(olympic_df.loc[i, 'output']).title()

    female_found = 'Female' in output
    male_found = 'Male' in output

    # A reply naming both genders, or neither, is recorded as unknown.
    if female_found and male_found:
        genders.append('U')
    elif female_found:
        genders.append('F')
    elif male_found:
        genders.append('M')
    else:
        genders.append('U')

    # Raw string: "\d" and "\." are invalid escape sequences in a normal
    # string literal and trigger a warning on current Python versions.
    score = re.findall(r"\d+\.\d+", output)
    if score:
        # First decimal number in the reply, kept as text.
        scores.append(score[0])
    elif '0' in output:
        # No decimal number, but the reply contains a "0" character.
        scores.append('0')
    else:
        scores.append('')

olympic_df['gender'] = genders
olympic_df['score'] = scores
olympic_df.to_csv((os.getcwd() + r'/Data/Prompt2/first_name_country/olympic_first_name_country_chatgpt_output_gender_score.csv'), index=False, header=True, encoding='utf-8-sig')

print("Execution time: %s seconds" % round((time.time() - start_time), 3), "\n")
print('Done')

In [None]:
# Tally actual gender against the first-name + country prediction, including
# an "unknown" category on both sides.
ground_truth_df = pd.read_csv((os.getcwd() + r'/Data/first_name_country/olympic_first_names_actual_gender.csv'), usecols=['gender'])
predicted_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/first_name_country/olympic_first_name_country_chatgpt_output_gender_score.csv'), usecols=['gender'])

results = {'actual female, predict female': 0, 
           'actual female, predict male': 0, 
           'actual female, predict unknown': 0, 
           'actual male, predict female': 0, 
           'actual male, predict male': 0, 
           'actual male, predict unknown': 0,
           'actual unknown, predict female': 0,
           'actual unknown, predict male': 0,
           'actual unknown, predict unknown': 0}

# Rows are paired by position, as in the other comparison cells, so both files
# must have the same length.
# NOTE(review): assumes the two files list the same names in the same order - confirm.
assert len(ground_truth_df) == len(predicted_df)

# Compare the two frames loaded above. The previous loop iterated over an
# unrelated `df` with a 'predicted_gender' column and contained a leftover
# `df.loc[df['column_name'] == some_value]` line that raised NameError.
for i in range(len(ground_truth_df)):
    gender = ground_truth_df.loc[i, 'gender']
    predicted = predicted_df.loc[i, 'gender']

    if gender == 'F':
        if predicted == 'F':
            results['actual female, predict female'] += 1
        elif predicted == 'M':
            results['actual female, predict male'] += 1
        elif predicted == 'U':
            results['actual female, predict unknown'] += 1
    elif gender == 'M':
        if predicted == 'F':
            results['actual male, predict female'] += 1
        elif predicted == 'M':
            results['actual male, predict male'] += 1
        elif predicted == 'U':
            results['actual male, predict unknown'] += 1
    elif gender == 'U':
        if predicted == 'F':
            results['actual unknown, predict female'] += 1
        elif predicted == 'M':
            results['actual unknown, predict male'] += 1
        elif predicted == 'U':
            results['actual unknown, predict unknown'] += 1

print(results)
print('Done')

# Full Name + Country

In [None]:
# Load the per-batch full-name + country inference outputs (in index order)
# and stitch them back together into a single CSV.
(olympic_df_0, olympic_df_1, olympic_df_2, olympic_df_3,
 olympic_df_4, olympic_df_5, olympic_df_6) = frames = [
    pd.read_csv((os.getcwd() + r'/Data/Prompt2/full_name_country/' + chunk_file), usecols=['index', 'name', 'country', 'output'])
    for chunk_file in (
        'infer_output_0_to_9999.csv',
        'infer_output_10000_to_39999.csv',
        'infer_output_40000_to_59999.csv',
        'infer_output_60000_to_89999.csv',
        'infer_output_90000_to_109999.csv',
        'infer_output_110000_to_124999.csv',
        'infer_output_125000_to_134731.csv',
    )
]

result = pd.concat(frames)
result.to_csv(
    (os.getcwd() + r'/Data/Prompt2/full_name_country/infer_output_full.csv'),
    index=False,
    header=True,
    encoding='utf-8-sig',
)
print('Done')

In [None]:
# Parse the raw model replies of the full-name + country run into a gender
# label ('F', 'M' or 'U') and a score string, then write them back as new
# columns. Note that this cell reads and overwrites the same CSV file, and
# only the 'index', 'name' and 'output' columns are carried over.
olympic_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/full_name_country/olympic_full_name_country_chatgpt_output_gender_score.csv'), usecols=['index', 'name', 'output'])

genders = []
scores = []

start_time = time.time()

for i in range(len(olympic_df)):
    # Title-casing normalises the reply so that "Female" (lower-case "male"
    # inside) is not also matched by the 'Male' check below.
    output = str(olympic_df.loc[i, 'output']).title()

    female_found = 'Female' in output
    male_found = 'Male' in output

    # A reply naming both genders, or neither, is recorded as unknown.
    if female_found and male_found:
        genders.append('U')
    elif female_found:
        genders.append('F')
    elif male_found:
        genders.append('M')
    else:
        genders.append('U')

    # Raw string: "\d" and "\." are invalid escape sequences in a normal
    # string literal and trigger a warning on current Python versions.
    score = re.findall(r"\d+\.\d+", output)
    if score:
        # First decimal number in the reply, kept as text.
        scores.append(score[0])
    elif '0' in output:
        # No decimal number, but the reply contains a "0" character.
        scores.append('0')
    else:
        scores.append('')

olympic_df['gender'] = genders
olympic_df['score'] = scores
olympic_df.to_csv((os.getcwd() + r'/Data/Prompt2/full_name_country/olympic_full_name_country_chatgpt_output_gender_score.csv'), index=False, header=True, encoding='utf-8-sig')

print("Execution time: %s seconds" % round((time.time() - start_time), 3), "\n")
print('Done')

In [None]:
# Tally recorded sex against the full-name + country prediction, row by row.
ground_truth_df = pd.read_csv((os.getcwd() + r'/Data/olympic_output.csv'), usecols=['Sex'])
predicted_df = pd.read_csv((os.getcwd() + r'/Data/Prompt2/full_name_country/olympic_full_name_country_chatgpt_output_gender_score.csv'), usecols=['gender'])

# Single-letter codes mapped to the wording used in the result keys.
actual_names = {'F': 'female', 'M': 'male'}
predicted_names = {'F': 'female', 'M': 'male', 'U': 'unknown'}

results = {
    'actual {0}, predict {1}'.format(actual_name, predicted_name): 0
    for actual_name in ('female', 'male')
    for predicted_name in ('female', 'male', 'unknown')
}

# Rows are paired by position, so both files must have the same length.
assert len(ground_truth_df) == len(predicted_df)

for i in range(len(ground_truth_df)):
    ground_truth = ground_truth_df.loc[i, 'Sex']
    predicted = predicted_df.loc[i, 'gender']

    # Rows with any other value in either column are left uncounted.
    if ground_truth in actual_names and predicted in predicted_names:
        results['actual {0}, predict {1}'.format(actual_names[ground_truth], predicted_names[predicted])] += 1

print(results)
print('Done')

## Data Parsing And Cleaning (OUTDATED)

In [None]:
# (Outdated section) Load the per-batch full-name outputs of the earlier run
# and stitch them into a single CSV. Paths use Windows-style separators.
(olympic_df_0, olympic_df_1, olympic_df_2, olympic_df_3,
 olympic_df_4, olympic_df_5, olympic_df_6, olympic_df_7) = frames = [
    pd.read_csv((os.getcwd() + r'\Data\full_name' + chunk_file), usecols=['index', 'full_name', 'output'])
    for chunk_file in (
        r'\infer_output_0_to_9999.csv',
        r'\infer_output_10000_to_19999.csv',
        r'\infer_output_20000_to_49999.csv',
        r'\infer_output_50000_to_79999.csv',
        r'\infer_output_80000_to_89999.csv',
        r'\infer_output_90000_to_109999.csv',
        r'\infer_output_110000_to_134699.csv',
        r'\infer_output_134700_to_134731.csv',
    )
]

result = pd.concat(frames)
result.to_csv(
    (os.getcwd() + r'\Data\full_name\infer_output_full.csv'),
    index=False,
    header=True,
    encoding='utf-8-sig',
)
print('Done')

In [None]:
# (Outdated section) Load the per-batch full-name + country outputs of the
# earlier run and stitch them into the file that the parsing cell reads.
# Paths use Windows-style separators.
(olympic_df_0, olympic_df_1, olympic_df_2, olympic_df_3, olympic_df_4, olympic_df_5,
 olympic_df_6, olympic_df_7, olympic_df_8, olympic_df_9, olympic_df_10) = frames = [
    pd.read_csv((os.getcwd() + r'\Data\full_name_country' + chunk_file), usecols=['index', 'full_name', 'output'])
    for chunk_file in (
        r'\infer_output_0_to_9999.csv',
        r'\infer_output_10000_to_29999.csv',
        r'\infer_output_30000_to_39999.csv',
        r'\infer_output_40000_to_49999.csv',
        r'\infer_output_50000_to_59999.csv',
        r'\infer_output_60000_to_69999.csv',
        r'\infer_output_70000_to_79999.csv',
        r'\infer_output_80000_to_89999.csv',
        r'\infer_output_90000_to_109999.csv',
        r'\infer_output_110000_to_129999.csv',
        r'\infer_output_130000_to_134731.csv',
    )
]

result = pd.concat(frames)
result.to_csv(
    (os.getcwd() + r'\Data\full_name_country\olympic_full_names_country_chatgpt_output_gender_score.csv'),
    index=False,
    header=True,
    encoding='utf-8-sig',
)
print('Done')

In [None]:
# (Outdated section) Parse the raw model replies of the earlier full-name +
# country run into a gender label ('F', 'M' or 'U') and a score string, then
# write them back as new columns. This cell reads and overwrites the same CSV
# file; paths use Windows-style separators.
olympic_df = pd.read_csv((os.getcwd() + r'\Data\full_name_country\olympic_full_names_country_chatgpt_output_gender_score.csv'), usecols=['index', 'full_name', 'output'])

genders = []
scores = []

start_time = time.time()

for i in range(len(olympic_df)):
    # Progress report every 1000th row.
    if i % 1000 == 0:
        print("Reached index {0} after {1} seconds.".format(i, time.time() - start_time))

    # Title-casing normalises the reply so that "Female" (lower-case "male"
    # inside) is not also matched by the 'Male' check below.
    output = str(olympic_df.loc[i, 'output']).title()

    female_found = 'Female' in output
    male_found = 'Male' in output

    # A reply naming both genders, or neither, is recorded as unknown.
    if female_found and male_found:
        genders.append('U')
    elif female_found:
        genders.append('F')
    elif male_found:
        genders.append('M')
    else:
        genders.append('U')

    # Raw string: "\d" and "\." are invalid escape sequences in a normal
    # string literal and trigger a warning on current Python versions.
    score = re.findall(r"\d+\.\d+", output)
    if score:
        # First decimal number in the reply, kept as text.
        scores.append(score[0])
    elif '0' in output:
        # No decimal number, but the reply contains a "0" character.
        scores.append('0')
    else:
        scores.append('')

olympic_df['gender'] = genders
olympic_df['score'] = scores
olympic_df.to_csv((os.getcwd() + r'\Data\full_name_country\olympic_full_names_country_chatgpt_output_gender_score.csv'), index=False, header=True, encoding='utf-8-sig')

print("Execution time: %s seconds" % round((time.time() - start_time), 3), "\n")
print('Done')

In [None]:
# (Outdated section) Tally recorded sex against the earlier full-name
# prediction, row by row. Paths use Windows-style separators.
ground_truth_df = pd.read_csv((os.getcwd() + r'\Data\olympic_output.csv'), usecols=['Sex'])
predicted_df = pd.read_csv((os.getcwd() + r'\Data\full_name\olympic_full_names_chatgpt_output_gender_score.csv'), usecols=['gender'])

# Single-letter codes mapped to the wording used in the result keys.
actual_names = {'F': 'female', 'M': 'male'}
predicted_names = {'F': 'female', 'M': 'male', 'U': 'unknown'}

results = {
    'actual {0}, predict {1}'.format(actual_name, predicted_name): 0
    for actual_name in ('female', 'male')
    for predicted_name in ('female', 'male', 'unknown')
}

# Rows are paired by position, so both files must have the same length.
assert len(ground_truth_df) == len(predicted_df)

for i in range(len(ground_truth_df)):
    ground_truth = ground_truth_df.loc[i, 'Sex']
    predicted = predicted_df.loc[i, 'gender']

    # Rows with any other value in either column are left uncounted.
    if ground_truth in actual_names and predicted in predicted_names:
        results['actual {0}, predict {1}'.format(actual_names[ground_truth], predicted_names[predicted])] += 1

print(results)
print('Done')

In [None]:
# (Outdated section) Count correct and incorrect gender predictions separately
# for each medal outcome. Paths use Windows-style separators.
ground_truth_df = pd.read_csv((os.getcwd() + r'\Data\olympic_output.csv'), usecols=['Sex', 'Medal'])
predicted_df = pd.read_csv((os.getcwd() + r'\Data\full_name\olympic_full_names_chatgpt_output_gender_score.csv'), usecols=['gender'])

# 'Medal' values mapped to the wording used in the result keys.
medal_names = {'Bronze': 'bronze', 'Silver': 'silver', 'Gold': 'gold'}

results = {
    '{0} medal, {1} prediction'.format(medal_name, verdict): 0
    for medal_name in ('no', 'bronze', 'silver', 'gold')
    for verdict in ('correct', 'incorrect')
}

# Rows are paired by position, so both files must have the same length.
assert len(ground_truth_df) == len(predicted_df)

for i in range(len(ground_truth_df)):
    ground_truth = ground_truth_df.loc[i, 'Sex']
    medal = ground_truth_df.loc[i, 'Medal']
    predicted = predicted_df.loc[i, 'gender']

    # A missing 'Medal' value counts as "no medal"; any value other than the
    # three known medals leaves the row uncounted.
    if pd.isna(medal):
        medal_name = 'no'
    elif medal in medal_names:
        medal_name = medal_names[medal]
    else:
        continue

    verdict = 'correct' if ground_truth == predicted else 'incorrect'
    results['{0} medal, {1} prediction'.format(medal_name, verdict)] += 1

print(results)
print('Done')

## First Names Only

In [None]:
# Tally actual against predicted gender for the first-names-only results,
# with an "unknown" category on both sides.
df = pd.read_csv((os.getcwd() + r'/Data/first_name/olympic_first_names_final.csv'), usecols=['gender', 'predicted_gender'])

# Single-letter codes mapped to the wording used in the result keys.
gender_names = {'F': 'female', 'M': 'male', 'U': 'unknown'}

results = {
    'actual {0}, predict {1}'.format(actual_name, predicted_name): 0
    for actual_name in ('female', 'male', 'unknown')
    for predicted_name in ('female', 'male', 'unknown')
}

for i in range(len(df)):
    gender = df.loc[i, 'gender']
    predicted = df.loc[i, 'predicted_gender']

    # Rows with any other value in either column are left uncounted.
    if gender in gender_names and predicted in gender_names:
        results['actual {0}, predict {1}'.format(gender_names[gender], gender_names[predicted])] += 1

print(results)
print('Done')

## First Names + Country

In [None]:
# Tally actual against predicted gender for the first-names + country results,
# with an "unknown" category on both sides.
df = pd.read_csv((os.getcwd() + r'/Data/first_name_country/olympic_first_names_country_final.csv'), usecols=['gender', 'predicted_gender'])

# Single-letter codes mapped to the wording used in the result keys.
gender_names = {'F': 'female', 'M': 'male', 'U': 'unknown'}

results = {
    'actual {0}, predict {1}'.format(actual_name, predicted_name): 0
    for actual_name in ('female', 'male', 'unknown')
    for predicted_name in ('female', 'male', 'unknown')
}

for i in range(len(df)):
    gender = df.loc[i, 'gender']
    predicted = df.loc[i, 'predicted_gender']

    # Rows with any other value in either column are left uncounted.
    if gender in gender_names and predicted in gender_names:
        results['actual {0}, predict {1}'.format(gender_names[gender], gender_names[predicted])] += 1

print(results)
print('Done')