# API Quest
## Oslo

# HYPOTHESIS
- Rich countries have more Nobel Prizes
    - Nobel prize winners immigrate towards rich countries
    - Nobel prize winners immigrate towards stable countries
- Countries of birth / early education have more impact than countries of higher education
- Nobel Prizes Laureates are getting younger
- Nobel Prizes are awarded more to international teams than before

- Gender Differences: Is there a significant difference in the gender ratio among Nobel Prize winners? Has this changed over time?
- Geographic Distribution: In which countries or regions are Nobel Prize winners predominantly located? Has this distribution changed over time?
- Age of Winners: What is the age distribution of Nobel Prize winners? Are there any noticeable trends in age?
- Publications: Are there specific journals where Nobel Prize winners’ research is commonly published? How influential are these journals?

## HYPOTHESIS 1
- Men are over represented in Nobel Prizes

## Selected data sources

1. Nobel API
2. https://uis.unesco.org/
3. https://databank.worldbank.org/source/world-development-indicators

In [59]:
#TODO filter STEM fields
#TODO modularize
#TODO country analysis

In [60]:
#imports
import os
import json
import requests
import pandas as pd
from dotenv import load_dotenv
import plotly.express as px
from datetime import datetime


In [61]:
#settings
pd.set_option('display.max_colwidth', 900)
pd.set_option('display.max_rows', 40)

In [62]:
#load env
load_dotenv()
token = os.getenv('TOKEN')
print(token)

test


In [63]:
#TODO: Get the data from the API
enrollment_df = pd.read_csv('sources/school_enrolment_gender.csv')
enrollment_df.head()

laureates_url = 'https://api.nobelprize.org/2.1/laureates'

In [64]:
def flatten(dictionnary, prefix=''):
    flattened = pd.json_normalize(dictionnary)

    if prefix:
        flattened = flattened.add_prefix(prefix + '.')

    for column in flattened.columns:
        sample = flattened[column].iloc[0]

        if isinstance(sample, list) and len(sample) > 0 and isinstance(sample[0], dict):
            # Find the maximum length of lists in the column
            max_len = flattened[column].apply(lambda x: len(x) if isinstance(x, list) else 0).max()
            for i in range(max_len):
                inner_dict = flattened[column].apply(
                    lambda x: x[i] if isinstance(x, list) and len(x) > i else None
                )
                flattened = pd.concat([flattened, flatten(inner_dict, f"{column}_{i+1}")], axis=1)
            flattened.drop(column, axis=1, inplace=True)

    return flattened

In [65]:
def get_all_laureates():
    offset = 0
    limit = 25
    max = 50
    all_laureates = pd.DataFrame()
    
    while offset < max:
        url = f"{laureates_url}?offset={offset}&limit={limit}"
        response = requests.get(url)
        data = response.json()
        max = data['meta']['count']
        flattened = flatten(data['laureates'])
        all_laureates = pd.concat([all_laureates, flattened], ignore_index=True)
        offset += limit

    all_laureates['id'] = all_laureates['id'].astype(int)
    return all_laureates.sort_values('id')

In [66]:
laureates_df = get_all_laureates()
laureates_df.head()

Unnamed: 0,id,fileName,gender,sameAs,knownName.en,knownName.se,givenName.en,givenName.se,familyName.en,familyName.se,...,nobelPrizes_1.affiliations_4.countryNow.en,nobelPrizes_1.affiliations_4.countryNow.no,nobelPrizes_1.affiliations_4.countryNow.se,nobelPrizes_1.affiliations_4.countryNow.sameAs,nobelPrizes_1.affiliations_4.countryNow.latitude,nobelPrizes_1.affiliations_4.countryNow.longitude,nobelPrizes_1.affiliations_4.continent.en,nobelPrizes_1.affiliations_4.locationString.en,nobelPrizes_1.affiliations_4.locationString.no,nobelPrizes_1.affiliations_4.locationString.se
949,1,rontgen,male,"[https://www.wikidata.org/wiki/Q35149, https://en.wikipedia.org/wiki/Wilhelm_Röntgen]",Wilhelm Conrad Röntgen,Wilhelm Conrad Röntgen,Wilhelm Conrad,Wilhelm Conrad,Röntgen,Röntgen,...,,,,,,,,,,
379,2,lorentz,male,"[https://www.wikidata.org/wiki/Q41688, https://en.wikipedia.org/wiki/Hendrik_Lorentz]",Hendrik A. Lorentz,Hendrik A. Lorentz,Hendrik A.,Hendrik A.,Lorentz,Lorentz,...,,,,,,,,,,
738,3,zeeman,male,"[https://www.wikidata.org/wiki/Q79000, https://en.wikipedia.org/wiki/Pieter_Zeeman]",Pieter Zeeman,Pieter Zeeman,Pieter,Pieter,Zeeman,Zeeman,...,,,,,,,,,,
380,4,becquerel,male,"[https://www.wikidata.org/wiki/Q41269, https://en.wikipedia.org/wiki/Henri_Becquerel]",Henri Becquerel,Henri Becquerel,Henri,Henri,Becquerel,Becquerel,...,,,,,,,,,,
736,5,pierre-curie,male,"[https://www.wikidata.org/wiki/Q37463, https://en.wikipedia.org/wiki/Pierre_Curie]",Pierre Curie,Pierre Curie,Pierre,Pierre,Curie,Curie,...,,,,,,,,,,


In [67]:
for column_name in laureates_df.columns:
    print(column_name)

id
fileName
gender
sameAs
knownName.en
knownName.se
givenName.en
givenName.se
familyName.en
familyName.se
fullName.en
fullName.se
birth.date
birth.place.city.en
birth.place.city.no
birth.place.city.se
birth.place.country.en
birth.place.country.no
birth.place.country.se
birth.place.cityNow.en
birth.place.cityNow.no
birth.place.cityNow.se
birth.place.cityNow.sameAs
birth.place.cityNow.latitude
birth.place.cityNow.longitude
birth.place.countryNow.en
birth.place.countryNow.no
birth.place.countryNow.se
birth.place.countryNow.sameAs
birth.place.countryNow.latitude
birth.place.countryNow.longitude
birth.place.continent.en
birth.place.continent.no
birth.place.continent.se
birth.place.locationString.en
birth.place.locationString.no
birth.place.locationString.se
wikipedia.slug
wikipedia.english
wikidata.id
wikidata.url
death.date
death.place.city.en
death.place.city.no
death.place.city.se
death.place.country.en
death.place.country.no
death.place.country.se
death.place.country.sameAs
death.place.

### GENDER ANALYSIS

In [68]:
gender_columns = {
    'id': {'original_name': 'id', 'dtype': 'int64'},
    'name': {'original_name': 'knownName.en', 'dtype': 'object'},
    'gender': {'original_name': 'gender', 'dtype': 'category', 'categories': ['female', 'male']},
    'award_year': {'original_name': 'nobelPrizes_1.awardYear', 'dtype': 'int64'},
    'birth_country': {'original_name': 'birth.place.country.en', 'dtype': 'object'},
    'field': {'original_name': 'nobelPrizes_1.category.en', 'dtype': 'category', 'categories': ['Physics', 'Chemistry', 'Physiology or Medicine', 'Economic Sciences']}
}


In [69]:
def get_selected_columns(column_dict: dict = gender_columns) -> list:

    selected_columns = [infos['original_name'] for key, infos in column_dict.items()]
    print('selected columns :\n', selected_columns)
    return selected_columns

def get_new_names(column_dict: dict = gender_columns) -> dict:
    
    new_names = {infos['original_name']:new for new, infos in column_dict.items()}
    print('new column names :\n', new_names)
    return new_names

def shape_dataframe(df, dictionnary):

    df = df[get_selected_columns(dictionnary)]
    df = df.rename(columns = get_new_names(dictionnary))

    return df

def selected_categories(column_name = 'field', dictionnary = gender_columns):
    selected_categories = []
    for data in dictionnary[column_name]:
        print(data['categories'])

In [70]:
gender_df = shape_dataframe(laureates_df, gender_columns)
display(gender_df)



selected columns :
 ['id', 'knownName.en', 'gender', 'nobelPrizes_1.awardYear', 'birth.place.country.en', 'nobelPrizes_1.category.en']
new column names :
 {'id': 'id', 'knownName.en': 'name', 'gender': 'gender', 'nobelPrizes_1.awardYear': 'award_year', 'birth.place.country.en': 'birth_country', 'nobelPrizes_1.category.en': 'field'}


Unnamed: 0,id,name,gender,award_year,birth_country,field
949,1,Wilhelm Conrad Röntgen,male,1901,Prussia,Physics
379,2,Hendrik A. Lorentz,male,1902,the Netherlands,Physics
738,3,Pieter Zeeman,male,1902,the Netherlands,Physics
380,4,Henri Becquerel,male,1903,France,Physics
736,5,Pierre Curie,male,1903,France,Physics
...,...,...,...,...,...,...
583,1030,Louis Brus,male,2023,USA,Chemistry
34,1031,Aleksey Yekimov,male,2023,USSR,Chemistry
508,1032,Jon Fosse,male,2023,Norway,Literature
655,1033,Narges Mohammadi,female,2023,Iran,Peace


In [71]:

gender_counts = gender_df.groupby('gender').aggregate({'id': 'count'}).reset_index()
gender_counts['proportion'] = gender_counts['id'] / gender_counts['id'].sum()
gender_counts['proportion'] = gender_counts['proportion'].apply(lambda x: f"{x:.0%}")
gender_counts.sort_values('proportion', ascending=False, inplace=True)
gender_counts.index = range(1, len(gender_counts) + 1)
display(gender_counts)


Unnamed: 0,gender,id,proportion
1,male,901,93%
2,female,64,7%


In [72]:
fig = px.bar(gender_counts, x='gender', y='id', text='proportion', title='Gender Distribution of Nobel Laureates')
fig.show()

In [73]:
yearly_ratio = gender_df.groupby(['award_year','gender']).size().unstack(fill_value=0)
display(yearly_ratio[['female','male']])
yearly_ratio['total'] = yearly_ratio.sum(axis=1)
yearly_ratio['female_ratio'] = yearly_ratio['female'] / yearly_ratio['total']
yearly_ratio['male_ratio'] = yearly_ratio['male'] / yearly_ratio['total']
display(yearly_ratio[['female_ratio','male_ratio']])


gender,female,male
award_year,Unnamed: 1_level_1,Unnamed: 2_level_1
1901,0,6
1902,0,7
1903,1,6
1904,0,5
1905,1,4
...,...,...
2019,1,13
2020,4,7
2021,1,12
2022,2,9


gender,female_ratio,male_ratio
award_year,Unnamed: 1_level_1,Unnamed: 2_level_1
1901,0.000000,1.000000
1902,0.000000,1.000000
1903,0.142857,0.857143
1904,0.000000,1.000000
1905,0.200000,0.800000
...,...,...
2019,0.071429,0.928571
2020,0.363636,0.636364
2021,0.076923,0.923077
2022,0.181818,0.818182


In [74]:
fig = px.line(yearly_ratio, x=yearly_ratio.index, y=['female_ratio', 'male_ratio'], title='Yearly Gender Distribution of Nobel Laureates')
fig.show()

In [75]:
gender_cumulative = gender_df.groupby(['award_year', 'gender']).size().unstack(fill_value=0).cumsum()
gender_cumulative['total'] = gender_cumulative.sum(axis=1)
gender_cumulative['male_proportion'] = gender_cumulative['male'] / gender_cumulative['total']
gender_cumulative['female_proportion'] = gender_cumulative['female'] / gender_cumulative['total']
display(gender_cumulative[['male_proportion', 'female_proportion']])

gender,male_proportion,female_proportion
award_year,Unnamed: 1_level_1,Unnamed: 2_level_1
1901,1.000000,0.000000
1902,1.000000,0.000000
1903,0.950000,0.050000
1904,0.960000,0.040000
1905,0.933333,0.066667
...,...,...
2019,0.942329,0.057671
2020,0.938710,0.061290
2021,0.938494,0.061506
2022,0.937107,0.062893


In [76]:
fig = px.line(gender_cumulative, x=gender_cumulative.index, y=['female_proportion', 'male_proportion'], title='Cumulative Proportion of Nobel Laureates by gender')
fig.show()

In [77]:
fig = px.line(gender_cumulative, x=gender_cumulative.index, y=['female', 'male'], title='Cumulative Gender Distribution of Nobel Laureates')
fig.show()

### AGE ANALYSIS

In [78]:
response = requests.get(laureates_url)
nobel_data = response.json()
laureate_infos = {}

for laureate in nobel_data['laureates']:
    
    laureate_infos[laureate['id']]= {
        'Name': laureate['knownName']['en'],
        'Gender': laureate.get('gender',None),
        'Birth_date': laureate['birth']['date'],
        
        'Birth_country': laureate['birth']['place']['country']['en'] if 'birth' in laureate and 'place' in laureate['birth'] and 'country' in 
        laureate['birth']['place'] else 'None',
        
        'Current_Country': laureate['birth']['place']['countryNow']['en'] if 'birth' in laureate and 'place' in laureate['birth'] and 'country' in 
        laureate['birth']['place'] else 'None',

        'Continent': laureate['birth']['place']['continent']['en'] if 'birth' in laureate and 'place' in laureate['birth'] and 'continent' in 
        laureate['birth']['place'] else 'None',
       
        'award_year': laureate['nobelPrizes'][0]['awardYear'],
        
        'Prize_category': laureate['nobelPrizes'][0]['category']['en'],    
        
        'Prize_affiliations': laureate['nobelPrizes'][0]['affiliations'][0]['nameNow']['en'] if 'nobelPrizes' in laureate and 'affiliations'
         in laureate['nobelPrizes'][0] and 'nameNow' in laureate['nobelPrizes'][0]['affiliations'][0] else 'None',
        
        'wikipedia_details': laureate['wikipedia']['english'] if 'wikipedia' in laureate and 'english' in laureate['wikipedia'] else 'None'}
                                 
df = pd.DataFrame.from_dict(laureate_infos,orient ='index')

df['award_year'] = pd.to_numeric(df['award_year'],errors = 'coerce')
df['Birth_date'] = pd.to_datetime(df['Birth_date'],errors = 'coerce')

df['Award_age']=df['award_year']-df['Birth_date'].dt.year
df['Award_age']=df['Award_age'].fillna(0).astype(int)

print(df.columns)

Index(['Name', 'Gender', 'Birth_date', 'Birth_country', 'Current_Country',
       'Continent', 'award_year', 'Prize_category', 'Prize_affiliations',
       'wikipedia_details', 'Award_age'],
      dtype='object')



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.

