In [4]:
import requests
import numpy as np  # for using pandas
import pandas as pd  # for using dataframes

In [5]:
endpoint = "https://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL?format=json&per_page=1000"
res = requests.get(endpoint)
data = res.json()

pages = data[0]['pages']

df = pd.DataFrame(data[1], columns=['country', 'date', 'indicator'])

for p in range(2, pages + 1):
    endpoint = f"https://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL?format=json&page={p}&per_page=1000"
    res = requests.get(endpoint)
    data = res.json()
    dfi = pd.DataFrame(data[1], columns=['country', 'date', 'indicator'])
    df = pd.concat([df, dfi])

df.reset_index(drop=True, inplace=True)

In [6]:
endpoint = "https://api.worldbank.org/v2/indicator?format=json&per_page=25000"
res = requests.get(endpoint)
data = res.json()

# for i in data[1]:
    # print(i['id'])

if 'SP.POP.TOTL' in data[1]:
    print("Population data is available")

In [7]:
df

Unnamed: 0,country,date,indicator
0,"{'id': 'ZH', 'value': 'Africa Eastern and Sout...",2023,"{'id': 'SP.POP.TOTL', 'value': 'Population, to..."
1,"{'id': 'ZH', 'value': 'Africa Eastern and Sout...",2022,"{'id': 'SP.POP.TOTL', 'value': 'Population, to..."
2,"{'id': 'ZH', 'value': 'Africa Eastern and Sout...",2021,"{'id': 'SP.POP.TOTL', 'value': 'Population, to..."
3,"{'id': 'ZH', 'value': 'Africa Eastern and Sout...",2020,"{'id': 'SP.POP.TOTL', 'value': 'Population, to..."
4,"{'id': 'ZH', 'value': 'Africa Eastern and Sout...",2019,"{'id': 'SP.POP.TOTL', 'value': 'Population, to..."
...,...,...,...
17019,"{'id': 'ZW', 'value': 'Zimbabwe'}",1964,"{'id': 'SP.POP.TOTL', 'value': 'Population, to..."
17020,"{'id': 'ZW', 'value': 'Zimbabwe'}",1963,"{'id': 'SP.POP.TOTL', 'value': 'Population, to..."
17021,"{'id': 'ZW', 'value': 'Zimbabwe'}",1962,"{'id': 'SP.POP.TOTL', 'value': 'Population, to..."
17022,"{'id': 'ZW', 'value': 'Zimbabwe'}",1961,"{'id': 'SP.POP.TOTL', 'value': 'Population, to..."


In [8]:
# country column is a dictionary with id and value, we need to extract the country name
df[['country_id', 'country_name']] = pd.DataFrame(df['country'].to_list(), index=df.index)
df[['indicator_id', 'indicator_value']] = pd.DataFrame(df['indicator'].to_list(), index=df.index)
df.drop(columns=['country', 'indicator'], inplace=True)
df

Unnamed: 0,date,country_id,country_name,indicator_id,indicator_value
0,2023,ZH,Africa Eastern and Southern,SP.POP.TOTL,"Population, total"
1,2022,ZH,Africa Eastern and Southern,SP.POP.TOTL,"Population, total"
2,2021,ZH,Africa Eastern and Southern,SP.POP.TOTL,"Population, total"
3,2020,ZH,Africa Eastern and Southern,SP.POP.TOTL,"Population, total"
4,2019,ZH,Africa Eastern and Southern,SP.POP.TOTL,"Population, total"
...,...,...,...,...,...
17019,1964,ZW,Zimbabwe,SP.POP.TOTL,"Population, total"
17020,1963,ZW,Zimbabwe,SP.POP.TOTL,"Population, total"
17021,1962,ZW,Zimbabwe,SP.POP.TOTL,"Population, total"
17022,1961,ZW,Zimbabwe,SP.POP.TOTL,"Population, total"


In [9]:
def getIndicators(df):
    assert isinstance(df, pd.DataFrame), "df must be a pandas DataFrame"

    df[['indicator_id', 'indicator_value']] = pd.DataFrame(df['indicator'].to_list(), index=df.index)

    for _id in df.indicator_id.unique().tolist():
        df[_id] = df[(df.indicator_id == _id)]['value']

    if df['indicator_id'].nunique() > 1:
        df = df.groupby('countryiso3code').first()
    else:
        df.set_index('countryiso3code', inplace=True)

    df.drop(columns=['value', 'indicator','indicator_id','indicator_value'], inplace=True)
    return df

In [10]:
endpoint = "https://api.worldbank.org/v2/country/DE;FR/indicator/SP.POP.TOTL?date=2015:2020&format=json&per_page=1000"
res = requests.get(endpoint)
data = res.json()
df_1a = pd.DataFrame(data[1], columns=['indicator', 'date', 'value','countryiso3code'])

df_1a = getIndicators(df_1a)
df_1a

Unnamed: 0_level_0,date,SP.POP.TOTL
countryiso3code,Unnamed: 1_level_1,Unnamed: 2_level_1
DEU,2020,83160871
DEU,2019,83092962
DEU,2018,82905782
DEU,2017,82657002
DEU,2016,82348669
DEU,2015,81686611
FRA,2020,67571107
FRA,2019,67388001
FRA,2018,67158348
FRA,2017,66918020


In [11]:
# there are some countries that are not countries: high income, low income, not classified, etc
# df_original[(df_original.countryiso3code == '')]

endpoint = "https://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL;NY.GDP.MKTP.CD;SP.DYN.LE00.IN?date=2012&format=json&per_page=1000&source=2"
res = requests.get(endpoint)
data = res.json()
df_1b = pd.DataFrame(data[1], columns=['indicator', 'countryiso3code', 'date', 'value'])

df_1b = getIndicators(df_1b)
df_1b.drop('', inplace=True)
df_1b

Unnamed: 0_level_0,date,SP.POP.TOTL,NY.GDP.MKTP.CD,SP.DYN.LE00.IN
countryiso3code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABW,2012,102112.0,2.615208e+09,75.531000
AFE,2012,552530654.0,9.526756e+11,60.050780
AFG,2012,30466479.0,1.990733e+10,61.923000
AFW,2012,376797999.0,7.377996e+11,55.340561
AGO,2012,25188292.0,1.280529e+11,58.623000
...,...,...,...,...
XKX,2012,1807106.0,6.163484e+09,78.280000
YEM,2012,26223391.0,3.540133e+10,67.343000
ZAF,2012,53145033.0,4.344005e+11,61.846000
ZMB,2012,14744658.0,2.550306e+10,58.867000


In [12]:
df_medal = pd.read_csv("medal_table.csv")

# https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3
country_code2iso3 = {
    'GER':'DEU',
    'IRI':'IRN',
    'NED':'NLD',
    'RSA':'ZAF',
    'CRO':'HRV',
    'DEN':'DNK',
    'SUI':'CHE',
    'SLO':'SVN',
    'TPE':'TPE',
    'LAT':'LVA',
    'ALG':'DZA',
    'GRN':'GRD',
    'BAH':'BHS',
    'MGL':'MNG',
    'BUL':'BGR',
    'INA':'IDN',
    'MAS':'MYS',
    'PUR':'PRI',
    'BOT':'BWA',
    'GUA':'GTM',
    'POR':'PRT',
    'GRE':'GRC',
    'KSA':'SAU',
    'KUW':'KWT',
    'VIE':'VNM'
}

df_medal['iso3'] = df_medal['country_code'].map(country_code2iso3)

# set country_code as iso3 if iso3 is null
df_medal['iso3'] = df_medal['iso3'].combine_first(df_medal['country_code'])

df_medal.head(10)

Unnamed: 0,year,country,country_code,gold,silver,bronze,iso3
0,2012,United States,USA,46,28,30,USA
1,2012,People's Republic of China,CHN,38,31,22,CHN
2,2012,Great Britain,GBR,29,17,19,GBR
3,2012,Russian Federation,RUS,20,20,27,RUS
4,2012,Republic of Korea,KOR,13,9,8,KOR
5,2012,Germany,GER,11,20,13,DEU
6,2012,France,FRA,11,11,13,FRA
7,2012,Australia,AUS,8,15,12,AUS
8,2012,Italy,ITA,8,9,11,ITA
9,2012,Hungary,HUN,8,4,6,HUN


In [13]:
df = pd.merge(left=df_medal, right=df_1b, left_on='iso3', right_index=True, how='left')
df

Unnamed: 0,year,country,country_code,gold,silver,bronze,iso3,date,SP.POP.TOTL,NY.GDP.MKTP.CD,SP.DYN.LE00.IN
0,2012,United States,USA,46,28,30,USA,2012,3.138777e+08,1.625397e+13,78.741463
1,2012,People's Republic of China,CHN,38,31,22,CHN,2012,1.354190e+09,8.532185e+12,76.192000
2,2012,Great Britain,GBR,29,17,19,GBR,2012,6.370022e+07,2.707090e+12,80.904878
3,2012,Russian Federation,RUS,20,20,27,RUS,2012,1.433784e+08,2.208294e+12,70.072195
4,2012,Republic of Korea,KOR,13,9,8,KOR,2012,5.019985e+07,1.278047e+12,80.819512
...,...,...,...,...,...,...,...,...,...,...,...
81,2012,Kingdom of Saudi Arabia,KSA,0,0,1,SAU,2012,3.082154e+07,7.418499e+11,76.461000
82,2012,Kuwait,KUW,0,0,1,KWT,2012,3.394663e+06,1.740477e+11,78.812000
83,2012,Morocco,MAR,0,0,1,MAR,2012,3.335217e+07,1.069374e+11,71.679000
84,2012,Tajikistan,TJK,0,0,1,TJK,2012,7.956382e+06,7.633037e+09,68.484000


In [147]:
df_per_10M = df.copy()
df_per_10M['gold_per_10M'] = df_per_10M['gold'] / df_per_10M['SP.POP.TOTL'] * 10**7
df_per_10M['silver_per_10M'] = df_per_10M['silver'] / df_per_10M['SP.POP.TOTL'] * 10**7
df_per_10M['bronze_per_10M'] = df_per_10M['bronze'] / df_per_10M['SP.POP.TOTL'] * 10**7
df_per_10M['total_per_10M'] = (df_per_10M['gold']+df_per_10M['silver']+df_per_10M['bronze']) / df_per_10M['SP.POP.TOTL'] * 10**7

df_per_10M.sort_values(['gold_per_10M', 'silver_per_10M', 'bronze_per_10M'], ascending=False, inplace=True)
# df_per_10M.sort_values(['total_per_10M','gold_per_10M', 'silver_per_10M', 'bronze_per_10M'], ascending=False, inplace=True)

df_per_10M = df_per_10M[['country', 'gold_per_10M', 'silver_per_10M', 'bronze_per_10M', 'total_per_10M']]
df_per_10M.rename(columns={'country':'Country', 'gold_per_10M':'Gold', 'silver_per_10M':'Silver', 
                           'bronze_per_10M':'Bronze', 'total_per_10M':'Total' }, inplace=True)

df_per_10M['Rank'] = np.arange(1, len(df_per_10M)+1)
df_per_10M.set_index('Rank', inplace=True)

df_per_10M.head(10)

Unnamed: 0_level_0,Country,Gold,Silver,Bronze,Total
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Grenada,86.272345,0.0,0.0,86.272345
2,The Bahamas,26.173831,0.0,0.0,26.173831
3,Bahrain,24.592139,0.0,0.0,24.592139
4,Jamaica,14.493715,18.117143,10.870286,43.481144
5,New Zealand,13.611306,4.537102,11.342755,29.491164
6,Hungary,8.064222,4.032111,6.048166,18.144499
7,Croatia,7.029781,2.34326,4.686521,14.059563
8,Trinidad and Tobago,6.991164,6.991164,13.982328,27.964655
9,Lithuania,6.693949,0.0,10.040923,16.734872
10,Latvia,4.91565,0.0,4.91565,9.8313


Carry out a simple supervised machine learning experiment, in which you train a model to predict the
 number of medals a country wins at the Olympic Games based on demographic and economic features.
 Note: Since machine learning is not a focus topic of this course, you do not need to optimize the model.
 Just demonstrate that you are able to apply the steps we discussed in the course and correctly interpret
 the results.

a) Train and evaluate a linear regression model: 
- 1. Split your data into a training and a test set. 
- 2. Train a linear regression model using population, life expectancy and the GDP per capita of a country as features.
- 3. Evaluate the model using the root mean squared error as the performance
 metric.

<br/>

b) Briefly discuss the results: How do you judge the performance? What are possible reasons for this
 performance? How could the model be improved?

<br/>

c) Predict the number of medals a hypothetical country with a population of 10 million, life expectancy
 of 70 years, and a GDP per capita of 20.000 US$ would win.

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

def split_fit_predict(X,y):
    x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.1,shuffle=True)

    model = LinearRegression()
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    rmse = root_mean_squared_error(y_test, y_pred)
    return model, rmse

In [85]:
df_3 = df.copy()
df_3.dropna(how='any', inplace=True)
X = df_3[['SP.POP.TOTL', 'NY.GDP.MKTP.CD', 'SP.DYN.LE00.IN']]
y = df_3['gold'] + df_3['silver'] + df_3['bronze']

_,rmse = split_fit_predict(X,y)

print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 6.638966621063598


As this is a little dataset, the performance relys a lot on the selected train/test data, results change a lot depending on the split (from 4 to 25).

For this reason a single split + prediction is not enough. We should run it not only once (let's try 1000) and get the root mean squared error mean of all iterarions. 

In [122]:
res = []
best_model = [None, None]
for i in range(1000):
    model_i,rmse_i = split_fit_predict(X,y)
    res.append(rmse_i)

    if best_model[0] == None or best_model[0] < rmse_i:
        best_model = [rmse_i,model_i]
        
res = np.array(res)
mean_rmse = res.mean()
mean_rmse

np.float64(11.292998788933113)

<!-- Reasons -->
The model is not great, but it's a good start. 

We could improve it by adding more features, like the number of athletes, the number of sports, the number of events, etc.

In [134]:
# 
names = ["SP.POP.TOTL","NY.GDP.MKTP.CD","SP.DYN.LE00.IN"]
values = np.array([10**7, 20000, 70]).reshape(1,-1)

x_sample = pd.DataFrame(data=values, columns=names)
y_sample = best_model[1].predict(x_sample)
y_sample

array([3.41346544])