# API Quest
## Oslo

# HYPOTHESIS
- Rich countries have more Nobel Prizes
    - Nobel Prize winners migrate to rich countries
    - Nobel Prize winners migrate to stable countries
- Countries of birth / early education have more impact than countries of higher education
- Nobel Prize laureates are getting younger
- Nobel Prizes are awarded more to international teams than before

- Gender Differences: Is there a significant difference in the gender ratio among Nobel Prize winners? Has this changed over time?
- Geographic Distribution: In which countries or regions are Nobel Prize winners predominantly located? Has this distribution changed over time?
- Age of Winners: What is the age distribution of Nobel Prize winners? Are there any noticeable trends in age?
- Publications: Are there specific journals where Nobel Prize winners’ research is commonly published? How influential are these journals?

## HYPOTHESIS 1
- Men are overrepresented among Nobel Prize laureates

## Selected data sources

1. Nobel API
2. https://uis.unesco.org/
3. https://databank.worldbank.org/source/world-development-indicators

In [19]:
#TODO filter STEM fields
#TODO modularize
#TODO country analysis

In [None]:
# IPython magics: enable the autoreload extension in mode 2, so imported
# modules are reloaded before each cell executes and edits to local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2 

In [21]:
#imports
import os
import requests
import time
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime
import plotly.express as px
from wrangling import *

In [22]:
#settings
# Widen pandas' display limits: up to 900 characters per column value and
# at most 40 rows when a frame is rendered.
pd.set_option('display.max_colwidth', 900)
pd.set_option('display.max_rows', 40)

In [23]:
#load env
# load_dotenv() runs first, then TOKEN is read from the process environment.
# token is None when the variable is not set.
load_dotenv()
token = os.getenv('TOKEN')

In [24]:
#TODO: Get the data from the API
# Enrolment data is read from a local CSV export for now (see TODO above).
enrollment_df = pd.read_csv('sources/school_enrolment_gender.csv')
# Jupyter only auto-renders the LAST expression of a cell; a bare
# `enrollment_df.head()` in the middle of the cell is silently discarded,
# so the preview is rendered explicitly.
display(enrollment_df.head())

# Laureates endpoint of the Nobel Prize API (version 2.1), used by later cells.
laureates_url = 'https://api.nobelprize.org/2.1/laureates'

In [None]:
# Build the laureates table from the API URL. get_all_laureates is not defined
# in this notebook (presumably provided by the `wrangling` star-import).
laureates_df = get_all_laureates(laureates_url)
# Keep a CSV copy on disk, without the index column.
laureates_df.to_csv('sources/laureates.csv', index=False)
display(laureates_df)

In [None]:
# List the available columns, one per line.
# Iterating a DataFrame directly yields its column labels.
for column_name in laureates_df:
    print(column_name)

### GENDER ANALYSIS

In [27]:
# get_json is not defined in this notebook (presumably from `wrangling`).
# Its result is passed to shape_dataframe and get_all_names_df further down.
gender_columns = get_json('schema')

In [None]:
# Derive the frame used by the gender analysis from the full laureates table.
# shape_dataframe is defined outside this notebook; later cells rely on the
# result having 'id', 'gender' and 'award_year' columns.
gender_df = shape_dataframe(laureates_df, gender_columns)
display(gender_df)

In [None]:
# Count laureates per gender and express each count as a share of the total.
gender_counts = gender_df.groupby('gender').aggregate({'id': 'count'}).reset_index()
gender_counts['proportion'] = gender_counts['id'] / gender_counts['id'].sum()
# Sort on the numeric count BEFORE formatting. Once 'proportion' holds strings
# such as "6%" and "10%", sorting it is lexicographic and ranks "6%" above
# "10%". Ordering by count is equivalent to ordering by the numeric share.
gender_counts = gender_counts.sort_values('id', ascending=False)
# Human-readable percentage label (also used as bar text in the chart below).
gender_counts['proportion'] = gender_counts['proportion'].map('{:.0%}'.format)
# 1-based rank index for display.
gender_counts.index = range(1, len(gender_counts) + 1)
display(gender_counts)

In [None]:
# Bar chart of laureate counts per gender, labelled with the percentage share.
fig = px.bar(
    data_frame=gender_counts,
    x='gender',
    y='id',
    text='proportion',
    title='Gender Distribution of Nobel Laureates',
)
fig.show()

In [None]:
# Laureates per award year, one column per gender (0 where a gender is absent).
yearly_ratio = (
    gender_df
    .groupby(['award_year', 'gender'])
    .size()
    .unstack(fill_value=0)
)
display(yearly_ratio[['female', 'male']])

# Row total over every gender column; computed before any ratio column exists
# so the ratios themselves are never summed in.
yearly_ratio['total'] = yearly_ratio.sum(axis=1)
yearly_ratio['female_ratio'] = yearly_ratio['female'].div(yearly_ratio['total'])
yearly_ratio['male_ratio'] = yearly_ratio['male'].div(yearly_ratio['total'])
display(yearly_ratio[['female_ratio', 'male_ratio']])

In [None]:
# Line chart of the per-year female/male shares.
fig = px.line(
    data_frame=yearly_ratio,
    x=yearly_ratio.index,
    y=['female_ratio', 'male_ratio'],
    title='Yearly Gender Distribution of Nobel Laureates',
)
fig.show()

In [None]:
# Running totals of laureates per gender, accumulated over award years.
gender_cumulative = (
    gender_df
    .groupby(['award_year', 'gender'])
    .size()
    .unstack(fill_value=0)
    .cumsum()
)
# Cumulative total over every gender column; taken before the proportion
# columns are added so they are not included in the sum.
gender_cumulative['total'] = gender_cumulative.sum(axis=1)
gender_cumulative['male_proportion'] = gender_cumulative['male'].div(gender_cumulative['total'])
gender_cumulative['female_proportion'] = gender_cumulative['female'].div(gender_cumulative['total'])
display(gender_cumulative[['male_proportion', 'female_proportion']])

In [None]:
# Line chart of the cumulative female/male shares over time.
fig = px.line(
    data_frame=gender_cumulative,
    x=gender_cumulative.index,
    y=['female_proportion', 'male_proportion'],
    title='Cumulative Proportion of Nobel Laureates by gender',
)
fig.show()

In [None]:
# Line chart of the cumulative female/male laureate counts over time.
fig = px.line(
    data_frame=gender_cumulative,
    x=gender_cumulative.index,
    y=['female', 'male'],
    title='Cumulative Gender Distribution of Nobel Laureates',
)
fig.show()

### FIELD ANALYSIS  

In [44]:
# get_all_names_df is defined outside this notebook.
# NOTE(review): 1901 looks like the first year to collect (the printed output
# below starts at year 1901) - confirm against the helper.
names_df = get_all_names_df(gender_columns, 1901)
# Empty placeholder column; nothing in this cell fills it in.
names_df['gender'] = None
display(names_df)
# Keep a CSV copy on disk, without the index column.
names_df.to_csv('sources/names.csv', index=False)

{'year': 1901, 'field': 'Physics', 'authors': ['Ludwig', 'Dawson', 'John', 'George']}
{'year': 1901, 'field': 'Chemistry', 'authors': ['Bernard', 'Louis', 'Paul']}
{'year': 1901, 'field': 'Physiology or Medicine', 'authors': ['Sydney', 'Allen', 'Joseph', 'Leo', 'William', 'Theodore', 'Harvey', 'Jacques', 'BENJAMIN', 'Leonard']}
{'year': 1901, 'field': 'Economic Sciences', 'authors': ['Max', 'Edwin', 'Henry', 'William', 'Montague', 'Mona', 'Christabel', 'Bolton', 'Sidney', 'Rudolfo']}
{'year': 1902, 'field': 'Physics', 'authors': ['Henri', 'Alfred', 'Carl', 'Vladimír']}
{'year': 1902, 'field': 'Chemistry', 'authors': ['Louis', 'Edward', 'Paul']}
{'year': 1902, 'field': 'Physiology or Medicine', 'authors': ['James', 'Anne', 'Walter', 'Torald', 'William', 'Laura', 'Jacques', 'Francis', 'Otto']}
{'year': 1902, 'field': 'Economic Sciences', 'authors': ['Adolphe', 'George', 'Octave', 'Harold', 'Helen', 'Percy', 'Sidney', 'Carl', 'David', 'Cameron']}
{'year': 1903, 'field': 'Physics', 'author

Unnamed: 0,year,field,authors,gender
0,1901,Physics,Ludwig,
1,1901,Physics,Dawson,
2,1901,Physics,John,
3,1901,Physics,George,
4,1901,Chemistry,Bernard,
...,...,...,...,...
4549,2023,Economic Sciences,Vincenzo,
4550,2023,Economic Sciences,Henrique,
4551,2023,Economic Sciences,Tünde-Ilona,
4552,2023,Economic Sciences,Salvatore,


### AGE ANALYSIS

In [None]:
# Fetch laureate records and flatten the fields needed for the age analysis
# into one row per laureate id.
# NOTE(review): a single GET is issued; if the endpoint paginates, only the
# first page ends up in `df` - confirm, or reuse `laureates_df` from above.
response = requests.get(laureates_url, timeout=30)  # never hang the kernel on a stalled connection
response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error body
nobel_data = response.json()


def _dig(record, *path, default='None'):
    """Follow *path* through nested dicts/lists and return the value found.

    Args:
        record: The outermost dict (or list) to start from.
        *path: Keys and/or list indices to follow, in order.
        default: Value returned when any step is missing. Defaults to the
            string 'None', the placeholder this notebook uses for absent
            optional fields.

    Returns:
        The nested value, or *default* if a key is absent, an index is out
        of range, or an intermediate value is not subscriptable.
    """
    current = record
    for step in path:
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            return default
    return current


laureate_infos = {}

for laureate in nobel_data['laureates']:
    laureate_infos[laureate['id']] = {
        # Records lacking a known name or birth block no longer abort the
        # loop with KeyError; the field is simply left empty.
        'Name': _dig(laureate, 'knownName', 'en', default=None),
        'Gender': laureate.get('gender', None),
        'Birth_date': _dig(laureate, 'birth', 'date', default=None),
        'Birth_country': _dig(laureate, 'birth', 'place', 'country', 'en'),
        # The previous guard tested for 'country' but then read 'countryNow',
        # raising KeyError whenever only the former was present.
        'Current_Country': _dig(laureate, 'birth', 'place', 'countryNow', 'en'),
        'Continent': _dig(laureate, 'birth', 'place', 'continent', 'en'),
        # Only the first listed prize is considered.
        'award_year': _dig(laureate, 'nobelPrizes', 0, 'awardYear', default=None),
        'Prize_category': _dig(laureate, 'nobelPrizes', 0, 'category', 'en'),
        # Also covers an empty affiliations list, which used to raise IndexError.
        'Prize_affiliations': _dig(laureate, 'nobelPrizes', 0, 'affiliations', 0, 'nameNow', 'en'),
        'wikipedia_details': _dig(laureate, 'wikipedia', 'english'),
    }

df = pd.DataFrame.from_dict(laureate_infos, orient='index')

# Unparseable values become NaN / NaT rather than raising.
df['award_year'] = pd.to_numeric(df['award_year'], errors='coerce')
df['Birth_date'] = pd.to_datetime(df['Birth_date'], errors='coerce')

# Age at award = award year minus birth year. Unknown ages stay <NA> (nullable
# integer dtype) instead of being filled with 0, which would add fake
# zero-year-old laureates to the age distribution.
df['Award_age'] = (df['award_year'] - df['Birth_date'].dt.year).astype('Int64')

print(df.columns)