# API Quest
## Oslo

# HYPOTHESIS
- Rich countries have more Nobel Prizes
    - Nobel Prize winners migrate to rich countries
    - Nobel Prize winners migrate to stable countries
- Countries of birth / early education have more impact than countries of higher education
- Nobel Prize laureates are getting younger
- Nobel Prizes are awarded more to international teams than before

- Gender Differences: Is there a significant difference in the gender ratio among Nobel Prize winners? Has this changed over time?
- Geographic Distribution: In which countries or regions are Nobel Prize winners predominantly located? Has this distribution changed over time?
- Age of Winners: What is the age distribution of Nobel Prize winners? Are there any noticeable trends in age?
- Publications: Are there specific journals where Nobel Prize winners’ research is commonly published? How influential are these journals?

## HYPOTHESIS 1
- Men are overrepresented among Nobel Prize laureates

## Selected data sources

1. Nobel API
2. https://uis.unesco.org/
3. https://databank.worldbank.org/source/world-development-indicators

In [1]:
#TODO filter STEM fields
#TODO modularize
#TODO country analysis

In [2]:
#imports
import os
import json
import requests
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime
import plotly.express as px
from wrangling import *

In [3]:
# Notebook display settings: allow up to 900 characters per cell before
# truncating, and render at most 40 rows of a DataFrame.
pd.options.display.max_colwidth = 900
pd.options.display.max_rows = 40

In [None]:
#load env
# Read variables from a local .env file into the process environment.
load_dotenv()
token = os.getenv('TOKEN')
# Report only whether the token is present: printing the value itself would
# store the secret in the notebook's saved output.
print('TOKEN loaded' if token else 'TOKEN not found')

In [5]:
#TODO: Get the data from the API
# Base endpoint used by the laureate download code below.
laureates_url = 'https://api.nobelprize.org/2.1/laureates'

# Enrolment data is read from a local CSV export for now (see TODO above).
enrollment_df = pd.read_csv('sources/school_enrolment_gender.csv')
# Kept as the cell's final expression: a notebook only renders the last
# expression of a cell, so the preview was previously discarded.
enrollment_df.head()

In [6]:
def flatten(dictionnary, prefix=''):
    """Flatten nested JSON records into a single-level DataFrame.

    Nested dicts are expanded by ``pd.json_normalize`` into dotted column
    names. Every column holding lists of dicts is then replaced by one group
    of columns per list position, named ``<column>_<position>.<key>`` with
    positions starting at 1; those groups are flattened recursively.

    Args:
        dictionnary: A dict, a list of dicts, or a Series of dicts/None.
            Non-dict entries produce a row of missing values.
        prefix: When non-empty, every resulting column is named
            ``<prefix>.<column>``.

    Returns:
        A DataFrame with one row per input record.
    """
    def holds_records(value):
        # True for a non-empty list whose first element is a dict.
        return isinstance(value, list) and len(value) > 0 and isinstance(value[0], dict)

    flattened = pd.json_normalize(dictionnary)

    if prefix:
        flattened = flattened.add_prefix(prefix + '.')

    # Snapshot the column labels: `flattened` is rebound inside the loop.
    for column in list(flattened.columns):
        values = flattened[column]

        # Inspect every row, not just the first one: the first record often
        # lacks the nested list (NaN/None), which used to leave the column
        # unflattened for all the other records.
        if not values.apply(holds_records).any():
            continue

        # Longest list in the column decides how many positional groups exist.
        max_len = int(values.apply(lambda x: len(x) if isinstance(x, list) else 0).max())
        for i in range(max_len):
            inner_dict = values.apply(
                lambda x, i=i: x[i] if isinstance(x, list) and len(x) > i else None
            )
            flattened = pd.concat([flattened, flatten(inner_dict, f"{column}_{i+1}")], axis=1)
        flattened = flattened.drop(columns=column)

    return flattened

In [7]:
def get_all_laureates(url=None):
    """Download every laureate from the paginated API into one DataFrame.

    Pages of 25 records are requested with ``offset``/``limit`` query
    parameters until the total reported in ``meta.count`` has been covered.
    Each page is flattened with ``flatten`` before being combined.

    Args:
        url: Laureates endpoint. Defaults to the module-level
            ``laureates_url`` so existing no-argument calls keep working.

    Returns:
        A DataFrame of all laureates with ``id`` cast to int and rows sorted
        by ``id``. Empty if the API reports no laureates.

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    base_url = laureates_url if url is None else url
    page_size = 25
    offset = 0
    total = None  # unknown until the first response reports meta.count
    pages = []

    while total is None or offset < total:
        response = requests.get(
            base_url,
            params={'offset': offset, 'limit': page_size},
            timeout=30,
        )
        # Fail loudly instead of trying to parse an error page as data.
        response.raise_for_status()
        data = response.json()
        total = data['meta']['count']
        pages.append(flatten(data['laureates']))
        offset += page_size

    # Concatenate once at the end rather than re-copying on every page.
    all_laureates = pd.concat(pages, ignore_index=True)
    if 'id' not in all_laureates.columns:
        return all_laureates

    all_laureates['id'] = all_laureates['id'].astype(int)
    return all_laureates.sort_values('id')

In [8]:
# get_all_laureates() reads the endpoint from the module-level laureates_url;
# passing it positionally raised a TypeError with the no-argument signature.
laureates_df = get_all_laureates()
# Cache the flattened table on disk.
laureates_df.to_csv('sources/laureates.csv', index=False)
# Kept as the cell's final expression so the notebook renders the preview.
laureates_df.head()

In [None]:
# List every column of the laureates table, one name per line.
for label in laureates_df.columns:
    print(label)

### GENDER ANALYSIS

In [10]:
# Column specification for the gender analysis, keyed by the new column name.
# 'original_name' is the source column in the flattened laureates table;
# entries with dtype 'category' also list the values that are kept when the
# table is filtered.
gender_columns = dict(
    id=dict(original_name='id', dtype='int64'),
    name=dict(original_name='knownName.en', dtype='object'),
    gender=dict(
        original_name='gender',
        dtype='category',
        categories=['female', 'male'],
    ),
    award_year=dict(original_name='nobelPrizes_1.awardYear', dtype='int64'),
    birth_country=dict(original_name='birth.place.country.en', dtype='object'),
    field=dict(
        original_name='nobelPrizes_1.category.en',
        dtype='category',
        categories=['Physics', 'Chemistry', 'Physiology or Medicine', 'Economic Sciences'],
    ),
)

In [11]:
def get_selected_columns(column_dict: dict) -> list:
    """Return the 'original_name' of every entry, in dictionary order.

    The list is also printed as a trace.
    """
    selected_columns = [spec['original_name'] for spec in column_dict.values()]
    print('selected columns :\n', selected_columns)
    return selected_columns

def get_new_names(column_dict: dict) -> dict:
    """Build the rename mapping {original_name: new_name} from the spec.

    The mapping is also printed as a trace.
    """
    new_names = {}
    for target, spec in column_dict.items():
        new_names[spec['original_name']] = target
    print('new column names :\n', new_names)
    return new_names

def filter_categories(df:pd.DataFrame, column_dict:dict) -> pd.DataFrame:
    """Keep only rows whose categorical columns hold an allowed value.

    For every entry of ``column_dict`` whose 'dtype' is 'category', rows are
    dropped unless the column named by the entry's key contains one of the
    entry's 'categories'. Other entries are ignored.
    """
    filtered = df
    for column, spec in column_dict.items():
        if spec['dtype'] != 'category':
            continue
        allowed = filtered[column].isin(spec['categories'])
        filtered = filtered[allowed]

    return filtered

def shape_dataframe(df:pd.DataFrame, dictionnary:dict) -> pd.DataFrame:
    """Select, rename and filter columns according to a column specification.

    Columns listed as 'original_name' are kept and renamed to the spec's
    keys, then rows are filtered on the categorical entries.
    """
    subset = df[get_selected_columns(dictionnary)]
    renamed = subset.rename(columns=get_new_names(dictionnary))
    return filter_categories(renamed, dictionnary)

In [None]:
# Reduce the laureates table to the columns and categories needed for the
# gender analysis.
gender_df = shape_dataframe(laureates_df, gender_columns)
display(gender_df)
# Shown explicitly: a notebook renders only the last expression of a cell, so
# the bare value_counts() call was silently discarded.
display(gender_df['field'].value_counts())
gender_df.dtypes

In [None]:
# Number of laureates per gender and each gender's share of the total.
gender_counts = gender_df.groupby('gender').aggregate({'id': 'count'}).reset_index()
gender_counts['proportion'] = gender_counts['id'] / gender_counts['id'].sum()
# Sort while the proportion is still numeric: sorting the formatted strings is
# lexicographic (e.g. "6%" would rank above "100%").
gender_counts.sort_values('proportion', ascending=False, inplace=True)
gender_counts['proportion'] = gender_counts['proportion'].apply(lambda x: f"{x:.0%}")
# Re-number the rows from 1 so the index reads as a rank.
gender_counts.index = range(1, len(gender_counts) + 1)
display(gender_counts)

In [None]:
# Bar chart of laureate counts per gender, labelled with the proportion column.
fig = px.bar(
    gender_counts,
    x='gender',
    y='id',
    text='proportion',
    title='Gender Distribution of Nobel Laureates',
)
fig.show()

In [None]:
# Laureates per award year, one column per gender (0 where a gender has no
# laureate that year).
yearly_ratio = (
    gender_df
    .groupby(['award_year','gender'])
    .size()
    .unstack(fill_value=0)
)
display(yearly_ratio[['female','male']])

# Row totals first, then each gender's share of that year's laureates.
yearly_ratio['total'] = yearly_ratio.sum(axis=1)
yearly_ratio['female_ratio'] = yearly_ratio['female'].div(yearly_ratio['total'])
yearly_ratio['male_ratio'] = yearly_ratio['male'].div(yearly_ratio['total'])
display(yearly_ratio[['female_ratio','male_ratio']])

In [None]:
# Line chart of the female and male share of laureates for each award year.
fig = px.line(
    yearly_ratio,
    x=yearly_ratio.index,
    y=['female_ratio', 'male_ratio'],
    title='Yearly Gender Distribution of Nobel Laureates',
)
fig.show()

In [None]:
# Running totals of laureates per gender over the award years.
gender_cumulative = (
    gender_df
    .groupby(['award_year', 'gender'])
    .size()
    .unstack(fill_value=0)
    .cumsum()
)

# Share of each gender among all laureates awarded up to a given year.
gender_cumulative['total'] = gender_cumulative.sum(axis=1)
gender_cumulative['male_proportion'] = gender_cumulative['male'].div(gender_cumulative['total'])
gender_cumulative['female_proportion'] = gender_cumulative['female'].div(gender_cumulative['total'])
display(gender_cumulative[['male_proportion', 'female_proportion']])

In [None]:
# Line chart of the cumulative female and male proportions over award years.
fig = px.line(
    gender_cumulative,
    x=gender_cumulative.index,
    y=['female_proportion', 'male_proportion'],
    title='Cumulative Proportion of Nobel Laureates by gender',
)
fig.show()

In [None]:
# Line chart of the cumulative number of female and male laureates.
fig = px.line(
    gender_cumulative,
    x=gender_cumulative.index,
    y=['female', 'male'],
    title='Cumulative Gender Distribution of Nobel Laureates',
)
fig.show()

### AGE ANALYSIS

In [None]:
def _dig(node, *path, default=None):
    """Follow dict keys / list indexes in `path`; return `default` if a step is missing."""
    for step in path:
        try:
            node = node[step]
        except (KeyError, IndexError, TypeError):
            return default
    return node


# NOTE(review): no offset/limit is sent, so presumably only the API's first
# page of laureates is analysed here — confirm whether the full list is intended.
response = requests.get(laureates_url, timeout=30)
# Fail loudly instead of trying to parse an error page as data.
response.raise_for_status()
nobel_data = response.json()
laureate_infos = {}

# One summary record per laureate, keyed by laureate id. Optional fields are
# looked up with _dig so an absent key or an empty list yields a default
# instead of aborting the loop. The previous guards did not match what was
# read: the 'Current_Country' guard tested 'country' but read 'countryNow',
# and 'affiliations'[0] was indexed without checking the list was non-empty.
for laureate in nobel_data['laureates']:

    laureate_infos[laureate['id']] = {
        'Name': _dig(laureate, 'knownName', 'en'),
        'Gender': laureate.get('gender', None),
        'Birth_date': _dig(laureate, 'birth', 'date'),

        'Birth_country': _dig(laureate, 'birth', 'place', 'country', 'en', default='None'),

        'Current_Country': _dig(laureate, 'birth', 'place', 'countryNow', 'en', default='None'),

        'Continent': _dig(laureate, 'birth', 'place', 'continent', 'en', default='None'),

        'award_year': _dig(laureate, 'nobelPrizes', 0, 'awardYear'),

        'Prize_category': _dig(laureate, 'nobelPrizes', 0, 'category', 'en'),

        'Prize_affiliations': _dig(
            laureate, 'nobelPrizes', 0, 'affiliations', 0, 'nameNow', 'en', default='None'
        ),

        'wikipedia_details': _dig(laureate, 'wikipedia', 'english', default='None'),
    }

df = pd.DataFrame.from_dict(laureate_infos,orient ='index')

# Unparseable years/dates become NaN/NaT rather than raising.
df['award_year'] = pd.to_numeric(df['award_year'],errors = 'coerce')
df['Birth_date'] = pd.to_datetime(df['Birth_date'],errors = 'coerce')

# Age at award = award year minus birth year.
df['Award_age']=df['award_year']-df['Birth_date'].dt.year
# NOTE(review): a missing year or birth date ends up as age 0, which will skew
# any age distribution — consider a nullable integer dtype instead.
df['Award_age']=df['Award_age'].fillna(0).astype(int)

print(df.columns)