# API Quest
## Oslo

## HYPOTHESES
- Men are overrepresented among Nobel Prize laureates
- Older people receive more prizes
- The USA has the most Nobel laureates

## Selected data sources

1. Nobel API
2. crossref.org
3. https://archive.ics.uci.edu/ml/datasets/Gender+by+Name
4. namsor.app

In [None]:
# IPython magics: enable the autoreload extension in mode 2 so edits to imported modules
# (e.g. the local wrangling module) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2 

In [421]:
#imports
import os
import requests
import time
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime
import plotly.express as px
from wrangling import *

import numpy as np
import matplotlib.pyplot as plt

In [422]:
#settings
# pandas display options: allow up to 900 characters per cell and cap printed frames at 40 rows.
pd.set_option('display.max_colwidth', 900)
pd.set_option('display.max_rows', 40)

In [423]:
#load env
# load_dotenv() reads variables from a .env file into the process environment.
# NAME_KEY is then read into name_token, which is passed to genderize_names() in the
# field analysis; os.getenv returns None when the variable is not set.
load_dotenv()
name_token = os.getenv('NAME_KEY')


### MAIN DATA

In [424]:
# Laureates endpoint of the Nobel Prize API (version 2.1), handed to load_or_fetch_laureates().
laureates_url = 'https://api.nobelprize.org/2.1/laureates'

In [None]:
# load_or_fetch_laureates is not defined in this notebook (presumably provided by
# `from wrangling import *`). Judging by its name and arguments it reuses the CSV cache when
# present and otherwise downloads from laureates_url — TODO confirm in wrangling.
laureates_df = load_or_fetch_laureates('sources/laureates.csv', laureates_url)
display(laureates_df)

### GENDER ANALYSIS

In [426]:
#gender data schema
# get_json is not defined in this notebook (presumably from wrangling). Its result is reused
# below as the schema argument of shape_dataframe() and get_papers_authors().
gender_columns = get_json('schema')

In [None]:
#transforms df into usable form
# shape_dataframe is not defined in this notebook (presumably from wrangling). The cells below
# rely on the result exposing 'year' and 'gender' columns.
gender_df = shape_dataframe(laureates_df, gender_columns)
display(gender_df)

In [None]:
# Laureate counts per award year: one row per year, one column per gender value,
# plus a 'total' column holding the row sum.
nobels_by_year = (
    gender_df
    .groupby(['year', 'gender'])
    .size()
    .unstack(fill_value=0)
)
nobels_by_year['total'] = nobels_by_year.sum(axis=1)
display(nobels_by_year)

In [None]:
# Share of each year's laureates that are women / men (fraction of that year's total).
nobels_by_year['Women Winners'] = nobels_by_year['female'].div(nobels_by_year['total'])
nobels_by_year['Men Winners'] = nobels_by_year['male'].div(nobels_by_year['total'])
display(nobels_by_year.head(3))

In [None]:
# Keep just the two ratio columns and move the 'year' index back into an ordinary column
# so the frame can later be merged on 'year'.
nobels_ratio_by_year = nobels_by_year[['Women Winners', 'Men Winners']].reset_index()
display(nobels_ratio_by_year.head(3))

In [431]:
# Colour per plotted series, passed as color_discrete_map to the plotly charts below.
# The keys must equal the series names that are actually plotted ('Men Winners',
# 'Women Winners', 'Men Scientists', 'Women Scientists'); a key that matches no series is
# ignored and that series falls back to plotly's default colours.
custom_colors = {
    'Men Winners': '#1f77b4',
    'Men Scientists': '#87ceeb',
    'Women Winners': '#cd8816',
    'Women Scientists': '#ffb333',
}

In [None]:
# Running total of laureates per gender over the years, with the gender columns renamed
# to the series names used in the charts.
gender_cumulative = (
    gender_df
    .groupby(['year', 'gender'])
    .size()
    .unstack(fill_value=0)
    .cumsum()
    .rename(columns={'male': 'Men Winners', 'female': 'Women Winners'})
)

display(gender_cumulative.head(3))

In [None]:
# Line chart of the cumulative laureate counts, one line per gender, indexed by year.
fig = px.line(
    gender_cumulative,
    x=gender_cumulative.index,
    y=['Men Winners', 'Women Winners'],
    title='Cumulative Gender Distribution of Nobel Laureates',
    color_discrete_map=custom_colors,
)
fig.update_layout(template='plotly_white')
fig.show()

### FIELD ANALYSIS  

In [None]:
#get the authors of random papers
# get_papers_authors is not defined in this notebook (presumably from wrangling, backed by
# crossref.org). 1901 and 2023 look like the first and last year to sample; the meaning of
# the 'initial' argument is not visible here — TODO confirm.
authors_names_df = get_papers_authors(gender_columns, 1901, 2023, 'initial')
display(authors_names_df)

In [None]:
#genderize the names
# genderize_names is not defined in this notebook (presumably from wrangling); it receives the
# author frame and the NAME_KEY token. The cells below rely on the result exposing 'year' and
# 'gender' columns with 'female' / 'male' values.
fields_df = genderize_names(authors_names_df, name_token)
display(fields_df.head(3))

In [None]:
# Paper-author counts per year and gender; 'total' adds up the female and male columns only.
gender_scientists = (
    fields_df
    .groupby(['year', 'gender'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
gender_scientists['total'] = gender_scientists['female'].add(gender_scientists['male'])
display(gender_scientists.head(3))

In [None]:
# Share of each year's sampled authors that are women / men.
gender_scientists['Women Scientists'] = gender_scientists['female'].div(gender_scientists['total'])
gender_scientists['Men Scientists'] = gender_scientists['male'].div(gender_scientists['total'])
display(gender_scientists)

In [None]:
# Narrow the author frame down to the year plus the two ratio columns.
scientists_ratio_by_year = gender_scientists.loc[:, ['year', 'Women Scientists', 'Men Scientists']]
display(scientists_ratio_by_year.head(3))

### JOINT ANALYSIS

In [None]:
# Merge the two ratio datasets on 'year'. The default inner join keeps only the years that
# appear in both the laureate frame and the paper-author frame.
merged_df = nobels_ratio_by_year.merge(scientists_ratio_by_year, on='year')
display(merged_df)

In [None]:
# Line chart comparing the yearly gender ratios of laureates with those of paper authors.
fig = px.line(
    merged_df,
    x='year',
    y=[
        'Men Winners',
        'Men Scientists',
        'Women Winners',
        'Women Scientists',
    ],
    title='Gender Ratios in Scientific Papers and Nobel Laureates Over Time',
    color_discrete_map=custom_colors,
)
# Apply a theme
fig.update_layout(template='plotly_white')

# Call out the 2009 laureate ratio. merged_df is an inner join, so 2009 may be missing (and
# the comparison matches nothing if 'year' is not stored as an int); indexing the first
# element of an empty selection would raise IndexError, so annotate only when a row exists.
women_2009 = merged_df.loc[merged_df['year'] == 2009, 'Women Winners']
if not women_2009.empty:
    fig.add_annotation(
        x=2009,
        y=women_2009.iloc[0],
        text="Rare over representation",
        showarrow=True,
        arrowhead=1,
    )
fig.show()


In [None]:
# Average every ratio column over all merged years.
# 'year' is removed *before* averaging: it is an identifier, not a ratio, and including it
# makes mean() fail when the column is not numeric (recent pandas no longer skips such
# columns silently).
average_ratios_df = merged_df.drop(columns='year').mean().to_frame(name='Average').T
display(average_ratios_df)

# Bar chart with one bar per ratio series, coloured with the shared series palette.
fig_avg = px.bar(
    average_ratios_df.melt(var_name='Category', value_name='Average Ratio'),
    x='Category',
    y='Average Ratio',
    title='Average Gender Ratios in Scientific Papers and Nobel Laureates',
    color='Category',
    color_discrete_map=custom_colors,
)
fig_avg.update_layout(template='plotly_white')
fig_avg.show()

### AGE AND COUNTRY ANALYSIS

In [None]:
# Download all laureate records from the Nobel Prize API page by page and index them by id.
# https matches the laureates_url used for the main data set.
base_url = "https://api.nobelprize.org/2.1/laureates"
limit = 50
# NOTE(review): paging starts at offset 1, unchanged from the original. If the API treats
# offset as "records to skip", the very first laureate is never requested — confirm against
# the API reference before changing it.
offset = 1
# Lower bound for how far to page; raised below whenever the API reports a larger total,
# so newly added laureates are not cut off by this hard-coded figure.
total_laureates = 991
all_laureates = {}

while offset < total_laureates:
    params = {
        "offset": offset,
        "limit": limit,
        "format": "json",
    }

    response = requests.get(base_url, params=params, timeout=20)
    if response.status_code != 200:
        print(f"error:{response.status_code}")
        break

    data = response.json()
    page = data.get('laureates', [])
    if not page:
        # Ran past the last record: nothing more to collect.
        break
    for laureate in page:
        all_laureates[laureate['id']] = laureate

    # presumably meta.count is the total number of laureates — it is only ever used to
    # extend the loop, never to shorten it, so a different meaning cannot truncate the data.
    reported_total = data.get('meta', {}).get('count')
    if isinstance(reported_total, int):
        total_laureates = max(total_laureates, reported_total)

    offset += limit
print(f"Till now we totally collect {len(all_laureates)} laureates for further analysis")

def _dig(node, *keys):
    """Follow *keys* through nested dicts; return None as soon as a level is not a dict."""
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def extract_data(laureate_data):
    """Flatten one raw laureate record into a dict of analysis fields.

    Each field is looked up independently, so a missing or malformed part of the record
    (for example ``"birth": None`` or an empty ``nobelPrizes`` list) yields ``None`` for
    that field only instead of dropping every field that follows it.

    Args:
        laureate_data: One laureate entry as returned by the laureates endpoint.

    Returns:
        Dict with the keys fullName, gender, birth_date, birth_country, country_now,
        continent, nobel_category, awardYear and dateAwarded. Prize fields are taken
        from the first entry of ``nobelPrizes`` only.
    """
    birth_place = _dig(laureate_data, 'birth', 'place')
    prizes = _dig(laureate_data, 'nobelPrizes')
    first_prize = prizes[0] if isinstance(prizes, list) and prizes else None

    return {
        'fullName': _dig(laureate_data, 'fullName', 'en'),
        'gender': _dig(laureate_data, 'gender'),
        'birth_date': _dig(laureate_data, 'birth', 'date'),
        'birth_country': _dig(birth_place, 'country', 'en'),
        'country_now': _dig(birth_place, 'countryNow', 'en'),
        'continent': _dig(birth_place, 'continent', 'en'),
        'nobel_category': _dig(first_prize, 'category', 'en'),
        'awardYear': _dig(first_prize, 'awardYear'),
        'dateAwarded': _dig(first_prize, 'dateAwarded'),
    }

# extract all laureates data
def process_multiple_entries(all_laureates):
    """Return a dict mapping each laureate id to its flattened record from extract_data."""
    return {
        key: extract_data(record)
        for key, record in all_laureates.items()
    }

# Flatten every downloaded record, then build a DataFrame with one row per laureate id.
final_dictionary = process_multiple_entries(all_laureates)
dictionaries = all_laureates

df = pd.DataFrame.from_dict(final_dictionary, orient='index')

def calculate_age_at_award(row):
    """Return the laureate's age in completed years on the award date.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'birth_date' and
            'dateAwarded' entries.

    Returns:
        The age as an int, or None when either date is missing or unparseable, or when
        the record cannot be read at all (best-effort: bad rows never abort the apply).
    """
    try:
        birth_date = pd.to_datetime(row['birth_date'], errors='coerce')
        date_awarded = pd.to_datetime(row['dateAwarded'], errors='coerce')

        if pd.isnull(birth_date) or pd.isnull(date_awarded):
            return None  # missing or invalid date

        age_at_award = date_awarded.year - birth_date.year
        # The year difference over-counts by one when the award falls before that
        # year's birthday.
        if date_awarded < birth_date + pd.DateOffset(years=age_at_award):
            age_at_award -= 1
        return age_at_award
    except (KeyError, TypeError, ValueError, OverflowError, AttributeError):
        # Only data problems are treated as "no age"; KeyboardInterrupt and other
        # non-data errors are deliberately allowed to propagate.
        return None

# Age of every laureate on the award date, stored as nullable integers (missing stays <NA>).
df['age_at_award'] = df.apply(calculate_age_at_award, axis=1).astype('Int64')

## Data cleaning:
# Drop rows without a birth date, then rows whose date strings contain the placeholder
# "00-00" (unknown month/day), which cannot be parsed as real dates.
date_columns = ['birth_date', 'awardYear', 'dateAwarded']
df = df.dropna(subset=['birth_date'])
for col in date_columns:
    invalid_dates = df[col].str.contains(r"00-00", na=False)
    if invalid_dates.any():
        df = df[~invalid_dates]

# Work on an explicit copy: assigning columns to a boolean-filtered frame otherwise
# triggers pandas' SettingWithCopyWarning.
df = df.copy()
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Report which Python types each column holds after cleaning.
for col in df.columns:
    print(f" Column:{col}")
    print(df[col].apply(type).value_counts())
    print("___________")

print()

## Visualize all hypothesis/insight :

# Age hypothesis: number of laureates per age group at the time of the award.
print("Below show total age group statistic for all NobelPrize laureates:")
bins = [17, 29, 39, 49, 59, 69, 79, 100]
labels = ['17-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-100']
# include_lowest=True closes the first interval on the left; without it the bins are
# (17, 29], (29, 39], ... and a laureate aged exactly 17 falls outside the '17-29' group.
df['age_group'] = pd.cut(
    df['age_at_award'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)
age_group_counts = df['age_group'].value_counts().sort_index()
age_group_counts.plot(kind='bar', color='skyblue')

plt.xlabel('Age_Group')
plt.ylabel('laureate qty')
plt.title('Laureates statistic per age group')
plt.xticks(rotation=0)

# Write each count just above its bar.
for index, value in enumerate(age_group_counts):
    plt.text(index, value, str(value), ha='center', va='bottom')
plt.tight_layout()
plt.show()
print("_______________________________________________________________________")

# Country hypothesis: the 20 birth countries with the most laureates.
print("Below show NobelPrize laureates statistic by countries:")
country_counts = df['birth_country'].value_counts().head(20)
country_counts.plot.bar(color='orange')

plt.title('Laureates statistic by top 20 country')
plt.xlabel('Country')
plt.ylabel('laureate qty')
plt.xticks(rotation=45, ha='right')

# Write each count just above its bar.
for position, count in enumerate(country_counts.tolist()):
    plt.text(position, count, str(count), ha='center', va='bottom')
plt.tight_layout()
plt.show()


    