# API Quest
## Oslo

# HYPOTHESES
- Rich countries have more Nobel Prizes
    - Nobel Prize winners migrate to rich countries
    - Nobel Prize winners migrate to stable countries
- Countries of birth / early education have more impact than countries of higher education
- Nobel Prize laureates are getting younger
- Nobel Prizes are awarded more to international teams than before

- Gender Differences: Is there a significant difference in the gender ratio among Nobel Prize winners? Has this changed over time?
- Geographic Distribution: In which countries or regions are Nobel Prize winners predominantly located? Has this distribution changed over time?
- Age of Winners: What is the age distribution of Nobel Prize winners? Are there any noticeable trends in age?
- Publications: Are there specific journals where Nobel Prize winners’ research is commonly published? How influential are these journals?

## HYPOTHESIS 1
- Men are overrepresented among Nobel Prize laureates

## Selected data sources

1. Nobel API
2. crossref.org
3. https://archive.ics.uci.edu/ml/datasets/Gender+by+Name
4. namsor.app

In [121]:
#QUESTIONS
#caching
#error handling / checkpoints?
#nested jsons?
#what if we don't know the possible value?
#FileNotFoundError as check for file existence?

In [122]:
#TODO fix given names function to accept ending years
#TODO compare bar charts of nobel vs fields
#TODO compare evolution of fields


In [None]:
%load_ext autoreload
%autoreload 2 

In [124]:
#imports
import os
import requests
import time
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime
import plotly.express as px
from wrangling import *

In [125]:
#settings
pd.set_option('display.max_colwidth', 900)
pd.set_option('display.max_rows', 40)

In [126]:
#load env
load_dotenv()
name_token = os.getenv('NAME_KEY')


In [127]:
#TODO: Get the data from the API
enrollment_df = pd.read_csv('sources/school_enrolment_gender.csv')
#a bare expression is only rendered when it is the last statement of a cell,
#so the preview has to be displayed explicitly here
display(enrollment_df.head())

#endpoint passed to get_all_laureates in the next cell
laureates_url = 'https://api.nobelprize.org/2.1/laureates'

In [None]:
laureates_df = get_all_laureates(laureates_url)
laureates_df.to_csv('sources/laureates.csv', index=False)
display(laureates_df)

In [None]:
for column_name in laureates_df.columns:
    print(column_name)

### GENDER ANALYSIS

In [130]:
gender_columns = get_json('schema')

In [None]:
gender_df = shape_dataframe(laureates_df, gender_columns)
display(gender_df)

In [None]:
#number of non-null ids per gender value
gender_counts = gender_df.groupby('gender').aggregate({'id': 'count'}).reset_index()
#order by the numeric count *before* the share is turned into text: sorting on
#the formatted strings would compare them lexicographically ("6%" > "50%")
gender_counts.sort_values('id', ascending=False, inplace=True)
gender_counts['proportion'] = gender_counts['id'] / gender_counts['id'].sum()
#keep the share as a whole-percent label; it is used as bar text in the chart below
gender_counts['proportion'] = gender_counts['proportion'].apply(lambda x: f"{x:.0%}")
#1-based rank as index
gender_counts.index = range(1, len(gender_counts) + 1)
display(gender_counts)

In [None]:
fig = px.bar(gender_counts, x='gender', y='id', text='proportion', title='Gender Distribution of Nobel Laureates')
fig.show()

In [None]:
#rows of gender_df counted per award year (index) and gender (columns);
#year/gender combinations that never occur are filled with 0
yearly_gender_ratio = gender_df.groupby(['award_year','gender']).size().unstack(fill_value=0)
display(yearly_gender_ratio[['female','male']])
#'total' sums every gender column present for the year, not only female + male
yearly_gender_ratio['total'] = yearly_gender_ratio.sum(axis=1)
#share of each gender within the year's total
yearly_gender_ratio['female_ratio'] = yearly_gender_ratio['female'] / yearly_gender_ratio['total']
yearly_gender_ratio['male_ratio'] = yearly_gender_ratio['male'] / yearly_gender_ratio['total']
display(yearly_gender_ratio[['female_ratio','male_ratio']])

In [None]:
fig = px.line(yearly_gender_ratio, x=yearly_gender_ratio.index, y=['female_ratio', 'male_ratio'], title='Yearly Gender Distribution of Nobel Laureates')
fig.show()

In [None]:
#same year x gender count table as above, turned into running totals over the years
gender_cumulative = gender_df.groupby(['award_year', 'gender']).size().unstack(fill_value=0).cumsum()
display(gender_cumulative)
#running total across every gender column present
gender_cumulative['total'] = gender_cumulative.sum(axis=1)
#share of each gender among all rows counted up to and including that year
gender_cumulative['male_proportion'] = gender_cumulative['male'] / gender_cumulative['total']
gender_cumulative['female_proportion'] = gender_cumulative['female'] / gender_cumulative['total']
display(gender_cumulative[['male_proportion', 'female_proportion']])

In [None]:
fig = px.line(gender_cumulative, x=gender_cumulative.index, y=['female_proportion', 'male_proportion'], title='Cumulative Proportion of Nobel Laureates by gender')
fig.show()

In [None]:
fig = px.line(gender_cumulative, x=gender_cumulative.index, y=['female', 'male'], title='Cumulative Gender Distribution of Nobel Laureates')
fig.show()

### FIELD ANALYSIS  

In [None]:
#get the authors of random papers
authors_names_df = get_papers_authors(gender_columns, 1901, 2023, 'initial')
display(authors_names_df)

In [140]:
#clean and select the names+gender database
def clean_name_gender_db(source_path = 'sources/name_gender_dataset.csv', target_path = 'sources/name_gender_database_clean.csv'):
    """Return the cleaned name/gender lookup table, building it on first use.

    The CSV at ``target_path`` acts as a cache: when it can be read it is
    returned as-is. Otherwise the raw dataset at ``source_path`` is loaded,
    its column labels are lower-cased, only ``name`` and ``gender`` are kept,
    the codes ``'M'``/``'F'`` become ``'male'``/``'female'`` (anything else
    becomes ``None``), and the result is written to ``target_path``.
    """
    def expand_gender_code(code):
        if code == 'M':
            return 'male'
        if code == 'F':
            return 'female'
        return None

    try:
        return pd.read_csv(target_path)
    except FileNotFoundError as cache_miss:
        #no cleaned file yet: report it and rebuild from the raw dataset below
        print(f"{cache_miss}")

    raw = pd.read_csv(f'{source_path}')
    raw = raw.rename(columns=str.lower)
    cleaned = raw[['name', 'gender']].assign(gender=raw['gender'].apply(expand_gender_code))
    cleaned.to_csv(f'{target_path}', index=False)
    return cleaned



In [141]:
#supplement the database with missing values from laureates
def find_missing_values_in_db(db, list, column_name  = 'name'):
    """Return the rows of ``list`` whose ``column_name`` value is absent from ``db``.

    Both columns are lower-cased before matching, so the comparison ignores
    case. The returned rows keep their original values and index.
    """
    known_values = db[column_name].str.lower()
    candidate_values = list[column_name].str.lower()
    is_known = candidate_values.isin(known_values)
    return list[~is_known]


In [142]:


# call namsor API with the list

def get_genders_from_name_api(name_list_df, token, limit = 50) -> list:
    """Request gender predictions for first names from the NamSor batch endpoint.

    The values of ``name_list_df['name']`` are POSTed as ``firstName`` entries
    in batches of at most ``limit`` names, with a one-second pause between
    consecutive requests.

    Args:
        name_list_df: DataFrame with a ``name`` column.
        token: API key, sent in the ``X-API-KEY`` header.
        limit: Maximum number of names per request; must be at least 1.

    Returns:
        The ``personalNames`` entries of every response, concatenated in
        request order. An empty input returns ``[]`` without any request.

    Raises:
        ValueError: If ``limit`` is smaller than 1 (the batching could never
            make progress).
        requests.HTTPError: If the API answers with an error status.
    """
    if len(name_list_df) == 0:
        return []
    if limit < 1:
        raise ValueError(f"limit must be at least 1, got {limit}")

    url = "https://v2.namsor.com/NamSorAPIv2/api2/json/genderBatch"
    #the headers are identical for every batch, so build them once
    headers = {
        "X-API-KEY": token,
        "Accept": "application/json",
        "Content-Type": "application/json"
    }

    names = name_list_df['name'].tolist()
    final_list = []

    for start in range(0, len(names), limit):
        if start:
            #pause between requests only, not after the last one
            time.sleep(1)

        payload = {
            "personalNames": [{"firstName": name} for name in names[start:start + limit]]
        }

        print(payload)

        response = requests.post(url, headers=headers, json=payload, timeout=30)
        #fail loudly on 4xx/5xx instead of hitting a KeyError on the error body
        response.raise_for_status()
        final_list.extend(response.json()["personalNames"])

    return final_list

def format_new_names(new_names):
    """Turn the API result records into a two-column ``name``/``gender`` frame.

    Only ``firstName`` and ``likelyGender`` are kept, renamed to ``name`` and
    ``gender`` respectively.
    """
    column_map = {'firstName': 'name', 'likelyGender': 'gender'}
    records = pd.DataFrame(new_names)
    selected = records[[*column_map]]
    return selected.rename(columns=column_map)
                                          

In [143]:
#update the database with the new values
def update_name_gender_db(db_df, new_names_gender_df, target_path = 'sources/name_gender_database_clean.csv'):
    """Append new name/gender rows to the lookup table and persist the result.

    Args:
        db_df: Current name/gender table.
        new_names_gender_df: Rows to add; must share the ``name`` column.
        target_path: CSV file the merged table is written to. Defaults to the
            same file ``clean_name_gender_db`` reads its cleaned table from.

    Returns:
        The merged table. When a name occurs more than once the first
        occurrence wins, so existing entries take precedence over new ones.
    """
    merged_df = pd.concat([db_df, new_names_gender_df], ignore_index=True)
    merged_df = merged_df.drop_duplicates(subset='name', keep='first')
    merged_df.to_csv(target_path, index=False)
    return merged_df

In [144]:
#update the authors names df with the new genders

def update_gender_from_db(df: pd.DataFrame, db: pd.DataFrame):
    """Fill the missing ``gender`` values of ``df`` from the name/gender table ``db``.

    Names are matched case-insensitively, in line with
    ``find_missing_values_in_db``: a name that differs from its ``db`` entry
    only in case is not reported as missing, so it has to be resolvable here.

    Args:
        df: Frame with a ``name`` column and, optionally, a ``gender`` column.
            It is modified in place.
        db: Lookup table with ``name`` and ``gender`` columns. Not modified.

    Returns:
        The same ``df`` object. Existing non-null genders are left untouched;
        if ``df`` had no ``gender`` column one is created.
    """
    def fold_case(value):
        #leave NaN / non-string values as they are
        return value.lower() if isinstance(value, str) else value

    #lower-cased name -> gender; first entry per name wins so the index is unique
    lookup = db.assign(name=db['name'].map(fold_case))
    lookup = lookup.dropna(subset=['name']).drop_duplicates(subset='name', keep='first')
    gender_mapping = lookup.set_index('name')['gender']

    looked_up = df['name'].map(fold_case).map(gender_mapping)
    if 'gender' in df.columns:
        #only fill the gaps, never overwrite a gender that is already known
        df['gender'] = df['gender'].fillna(looked_up)
    else:
        df['gender'] = looked_up
    return df



In [None]:
def genderize_names(df: pd.DataFrame, max_api_names = 5, token = None):
    """Fill the ``gender`` column of ``df`` using the local database and the name API.

    Steps: collect the distinct names of ``df``, load the cleaned name/gender
    database, send the names it does not contain to the gender API, store the
    answers in the database, then fill the missing genders of ``df`` from it.

    Args:
        df: Frame with a ``name`` column; it is updated in place by
            ``update_gender_from_db``.
        max_api_names: Upper bound on how many unknown names are sent to the
            API per call. The default of 5 keeps the original cap; pass
            ``None`` to look up every unknown name.
        token: API key; defaults to the notebook-level ``name_token``.

    Returns:
        The frame returned by ``update_gender_from_db``.
    """
    if token is None:
        token = name_token

    #get the unique names (NaN is not a name and must not be sent to the API)
    unique_authors_names_df = pd.DataFrame(df['name'].dropna().unique(), columns=['name'])

    #get a clean database of name + gender
    name_gender_db_df = clean_name_gender_db()

    #find the missing names in the database
    missing_names = find_missing_values_in_db(name_gender_db_df, unique_authors_names_df)
    if max_api_names is not None:
        missing_names = missing_names.head(max_api_names)

    if not missing_names.empty:
        #call the API with the missing names
        new_names = get_genders_from_name_api(missing_names, token)
        new_names_df = format_new_names(new_names)

        #update the database with the missing names
        name_gender_db_df = update_name_gender_db(name_gender_db_df, new_names_df)

    #update the authors names df with the new genders
    return update_gender_from_db(df, name_gender_db_df)


fields_df = genderize_names(authors_names_df)
fields_df

In [None]:
#group by decade: floor each year to the start of its decade (e.g. 1987 -> 1980)
fields_df['decade'] = fields_df['year'] // 10 * 10
display(fields_df)

#proportion of males and females by decade
gender_counts = fields_df.groupby(['decade', 'gender']).size().reset_index(name='count')
display(gender_counts)

#NOTE(review): this total counts every row of the decade, including rows whose
#gender is still missing, while the per-gender counts above skip them (groupby
#drops NaN keys by default) -- the proportions may not sum to 1; confirm intended
total_counts = fields_df.groupby('decade').size().reset_index(name='total')
display(total_counts)

gender_proportions = pd.merge(gender_counts, total_counts, on='decade')
gender_proportions['proportion'] = gender_proportions['count'] / gender_proportions['total']
display(gender_proportions)

#one row per decade, one column per gender value
pivot_fields_df = gender_proportions.pivot(index='decade', columns='gender', values='proportion').reset_index()
print(pivot_fields_df)


In [None]:
# proportion of males and females by year
# (reuses the names gender_counts / total_counts / gender_proportions / pivot_fields_df
# from the per-decade cell, so those variables are overwritten here)
gender_counts = fields_df.groupby(['year', 'gender']).size().reset_index(name='count')
display(gender_counts)

total_counts = fields_df.groupby('year').size().reset_index(name='total')
display(total_counts)

gender_proportions = pd.merge(gender_counts, total_counts, on='year')
gender_proportions['proportion'] = gender_proportions['count'] / gender_proportions['total']
display(gender_proportions)

# one row per year, one column per gender value
pivot_fields_df = gender_proportions.pivot(index='year', columns='gender', values='proportion').reset_index()
print(pivot_fields_df)

# graph the data
fig = px.line(pivot_fields_df, x='year', y=['female', 'male'], title='Scientific papers by Gender Over Time')
fig.show()

# TODO: overlay yearly_gender_ratio and pivot_fields_df in one figure; for now
# this only re-plots the Nobel laureate ratios on their own
fig = px.line(yearly_gender_ratio, x=yearly_gender_ratio.index, y=['female_ratio', 'male_ratio'], title='Yearly Distribution of Nobel Laureates')


fig.show()





In [None]:
import plotly.graph_objects as go

import pandas as pd

# Prototype of the field-vs-Nobel comparison chart, built on small hard-coded
# sample frames rather than on fields_df / gender_df.

# Sample Field Data
data_field = {
    'year': [1901, 1901, 1902, 1902, 1903, 2023, 2023],
    'gender': ['female', 'male', 'female', 'male', 'female', 'female', 'male'],
    'count': [3, 23, 3, 23, 5, 19, 21020]
}
df_field = pd.DataFrame(data_field)

# Sample Nobel Wins Data
data_nobel = {
    'award_year': [1901, 1902, 1903, 2021, 2022, 2023],
    'female': [0, 0, 1, 0, 1, 3],
    'male': [3, 4, 4, 10, 8, 6]
}
df_nobel = pd.DataFrame(data_nobel)
df_nobel.set_index('award_year', inplace=True)

# Group by year and gender, then sum the counts
df_field_grouped = df_field.groupby(['year', 'gender']).sum().reset_index()

# Pivot the data to have genders as columns
# (fillna(0) covers years where a gender has no row, e.g. 1903 has no 'male' entry)
field_pivot = df_field_grouped.pivot(index='year', columns='gender', values='count').fillna(0)

# Calculate total counts per year
field_pivot['total'] = field_pivot.sum(axis=1)

# Calculate proportions
field_pivot['female_prop'] = field_pivot['female'] / field_pivot['total']
field_pivot['male_prop'] = field_pivot['male'] / field_pivot['total']
# Calculate total Nobel awards per year
df_nobel['total'] = df_nobel['female'] + df_nobel['male']

# Calculate proportions
df_nobel['female_prop'] = df_nobel['female'] / df_nobel['total']
df_nobel['male_prop'] = df_nobel['male'] / df_nobel['total']

# Outer join on the year index: keeps years present in only one of the two frames
# NOTE(review): `merged` is not used by the figure below, which plots
# field_pivot and df_nobel directly
merged = pd.merge(
    field_pivot[['female_prop', 'male_prop']],
    df_nobel[['female_prop', 'male_prop']],
    left_index=True,
    right_index=True,
    how='outer',
    suffixes=('_field', '_nobel')
).reset_index()

# Initialize the figure
fig = go.Figure()

# Add Field Proportions (solid lines)
fig.add_trace(go.Scatter(
    x=field_pivot.index,
    y=field_pivot['female_prop'],
    mode='lines',
    name='Female in Field',
    line=dict(color='pink')
))
fig.add_trace(go.Scatter(
    x=field_pivot.index,
    y=field_pivot['male_prop'],
    mode='lines',
    name='Male in Field',
    line=dict(color='blue')
))

# Add Nobel Proportions (dashed lines, same colour per gender as the field traces)
fig.add_trace(go.Scatter(
    x=df_nobel.index,
    y=df_nobel['female_prop'],
    mode='lines',
    name='Female Nobel Wins',
    line=dict(color='pink', dash='dash')
))
fig.add_trace(go.Scatter(
    x=df_nobel.index,
    y=df_nobel['male_prop'],
    mode='lines',
    name='Male Nobel Wins',
    line=dict(color='blue', dash='dash')
))

# Update layout for better aesthetics
fig.update_layout(
    title='Evolution of Gender Proportions in Fields vs. Nobel Wins Over Time',
    xaxis_title='Year',
    yaxis_title='Proportion',
    legend=dict(x=0.01, y=0.99),
    template='plotly_white'
)

# Display the figure
fig.show()