In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.io as pio
import plotly.express as px

import os
import warnings
warnings.filterwarnings('ignore')
import PIL
import io
if not os.path.exists("images"):
    os.mkdir("images")

# Import and merge datasets

In [2]:
df = pd.read_csv('data/Olympics_Games.csv')
medal = pd.read_csv('data/Olympic_Games_Medal_Tally.csv')
country = pd.read_excel('data/olympic_city_country.xlsx')
place = pd.read_csv('data/nocRegions.csv')

newdf1 = df[['year','city']].merge(medal[['year','country','total']], how='right')
list_africa=pd.read_excel('data/list-african-countries-dependent-territory-286j.xls')

gpd = pd.read_csv('data/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_4701247.csv',skiprows=[0,1,2])
gpd=gpd.fillna(0)
newdf1.head(4)

Unnamed: 0,year,city,country,total
0,1896,Athina,United States,20
1,1896,Athina,Greece,47
2,1896,Athina,Germany,13
3,1896,Athina,France,11


# data pre-processing 

In [3]:
newdf1['country'][newdf1['country']=='Great Britain']='United Kingdom'
newdf1['country'][newdf1['country']=='West Germany']='Germany'
newdf1['country'][newdf1['country']=="People's Republic of China"]='China'
newdf1['country'][newdf1['country']=="People's Republic of China"]='China'
newdf1['country'][newdf1['country']=="Russian Federation"]='Russia'
newdf1.isnull().sum()
list_africa=list_africa.dropna()
newdf1.head(4)

Unnamed: 0,year,city,country,total
0,1896,Athina,United States,20
1,1896,Athina,Greece,47
2,1896,Athina,Germany,13
3,1896,Athina,France,11


In [4]:
# distint year in our data
pd.unique(newdf1['year'])

array([1896, 1900, 1904, 1908, 1912, 1920, 1924, 1928, 1932, 1936, 1948,
       1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992,
       1996, 2000, 2004, 2008, 2012, 2016, 2020, 1994, 1998, 2002, 2006,
       2010, 2014, 2018, 2022])

In [5]:
country = country.rename(columns={'City': 'city'})
newdf1 = newdf1.rename(columns={'total': 'Medal number', "Country": 'Host country'})
newdf = newdf1.merge(country, how='right').dropna()
newdf = newdf.rename(columns={"Country": 'Host country'})


In [6]:
newdf[newdf['Host country']=='Germany']

Unnamed: 0,year,city,country,Medal number,Host country
610,1936.0,Garmisch-Partenkirchen,Germany,101.0,Germany
611,1936.0,Garmisch-Partenkirchen,United States,57.0,Germany
612,1936.0,Garmisch-Partenkirchen,Hungary,16.0,Germany
613,1936.0,Garmisch-Partenkirchen,Italy,27.0,Germany
614,1936.0,Garmisch-Partenkirchen,Finland,20.0,Germany
...,...,...,...,...,...
691,1936.0,Berlin,United Kingdom,3.0,Germany
692,1936.0,Berlin,United States,4.0,Germany
693,1936.0,Berlin,Canada,1.0,Germany
694,1936.0,Berlin,France,1.0,Germany


# number of medal won by each country

In [7]:
tmp = newdf.groupby(['country'])['Medal number'].sum()
df4 = pd.DataFrame(data={'Medal': tmp.values}, index=tmp.index).reset_index()
df4 =df4.sort_values(by=['Medal'],ascending=False)
df4

Unnamed: 0,country,Medal
143,United States,5707.0
119,Soviet Union,2825.0
47,Germany,2356.0
141,United Kingdom,2010.0
44,France,1717.0
...,...,...
110,Samoa,1.0
9,Barbados,1.0
80,Mauritius,1.0
98,Paraguay,1.0


## a) Number of  olympic games host by country

In [8]:
tmp = newdf.drop_duplicates(subset=['year'])
tmp = tmp.groupby(['Host country'])['Host country'].value_counts()
l= [tmp.index[i][0] for i in range(len(tmp))]
t = {"host country": l,'Number games hosted':tmp.values }
seasoncount = pd.DataFrame(t)
seasoncount

Unnamed: 0,host country,Number games hosted
0,Australia,1
1,Austria,2
2,Belgium,1
3,Brazil,1
4,Canada,2
5,China,2
6,France,4
7,Germany,1
8,Greece,2
9,Italy,2


In [9]:
hostcountry=newdf[newdf['Host country'].isin(list(seasoncount['host country']))]
l = hostcountry['country']==hostcountry['Host country']
hostcountry['host game'] = [l.iloc[i] for i in range(len(l))]
t={'country':[],'Number of participation':[]}
for elem in list(seasoncount['host country']):
    t['country'].append(elem)
    t['Number of participation'].append(len(hostcountry[(hostcountry['country']==elem) & (hostcountry['host game']==False) ]))

In [10]:
lu=pd.DataFrame(t)
lu

Unnamed: 0,country,Number of participation
0,Australia,53
1,Austria,77
2,Belgium,59
3,Brazil,33
4,Canada,86
5,China,20
6,France,80
7,Germany,75
8,Greece,31
9,Italy,75


In [11]:
import plotly.express as px
from matplotlib import font_manager
# Set the font properties
font_prop = font_manager.FontProperties(fname='font/source-sans-pro-semibold.ttf')
font_prop.get_name()

custom_template = {
    "layout": go.Layout(
        font={
            "family": 'Open Sans',
            "size": 14,
            "color": "black",
        },
        title={
            "font": {
                "family": 'Open Sans',
                "size": 22,
                "color": "#1f1f1f",
            },
        },
        plot_bgcolor="#ffffff",
        paper_bgcolor="#ffffff",
        colorway=px.colors.qualitative.G10,
    )
}

def format_title(title, subtitle=None, subtitle_font_size=12):
    title = f'<b>{title}</b>'
    if not subtitle:
        return title
    subtitle = f'<span style="font-size: {subtitle_font_size}px;">{subtitle}</span>'
    return f'{title}<br>{subtitle}'

def plot(country:str,newdf, hostcountry):
    lg =hostcountry
    lp =lg[lg['Host country']==country]
    lo=lg[lg['country']==country]
    lo['year'] = lo['year'].astype(int)
    tmp = newdf.groupby(['year','country'])['Medal number'].sum()
    dfg = pd.DataFrame(data={'Medal': tmp.values}, index=tmp.index).reset_index()
    dfg =dfg.sort_values(by=['Medal'],ascending=False)
    dffl= dfg[dfg['country']==country]
    dffl =dffl.sort_values(by=['year'],ascending=False)
    
    fig = px.line(dffl, x='year', y='Medal',template=custom_template)
    for i in range(len(lp)):
        fig.add_vline(x=lp['year'].iloc[i], line_width=.5, line_dash="dash", line_color="green")
        
    fig.update_layout( title=format_title(country, 
                                      "(Data collected between 1896-2021).")
)
    fig.show()


In [12]:
plot('United Kingdom',newdf,hostcountry)

KeyboardInterrupt: 

In [13]:
plot('France',newdf,hostcountry)

In [14]:
plot('United States',newdf,hostcountry)

In [15]:
plot('Germany',newdf,hostcountry)

In [16]:
plot('China',newdf,hostcountry)

In [17]:
plot('Japan',newdf,hostcountry)

Recently, there has been a trend that fewer cities want to host the Olympics, challenging the future of a century-old tradition (Goldblatt, 2016). For example, 12 cities bid for 2004 Summer Olympics but only two for the 2020 Winter games (Ludacer, 2018). Considering the importance of Olympic success in national pride (Mower, 2012), a home-field advantage in the Olympics might encourage countries to bid for hosting the mega-event. Therefore, it is worth investigating whether this advantage exists in the Olympics, and how significant it is.
[ref](https://olymvis.hongtaoh.com/paper/)

In [18]:
tmp = hostcountry.groupby(['country','host game'])['Medal number'].sum()
dff = pd.DataFrame(data={'Medal': tmp.values}, index=tmp.index).reset_index()
dfl=dff[dff['country'].isin(list(seasoncount['host country']))]
dfl.head()

Unnamed: 0,country,host game,Medal
5,Australia,False,755.0
6,Australia,True,93.0
7,Austria,False,486.0
8,Austria,True,57.0
13,Belgium,False,262.0


In [19]:
tmp = dfl.groupby(['country','host game'])['Medal'].sum()
dfff = pd.DataFrame(data={'Medal': tmp.values}, index=tmp.index).reset_index()
dflf=dfff[dfff['country'].isin(list(seasoncount['host country']))]
dflf.head()

Unnamed: 0,country,host game,Medal
0,Australia,False,755.0
1,Australia,True,93.0
2,Austria,False,486.0
3,Austria,True,57.0
4,Belgium,False,262.0


In [20]:
temp = seasoncount
temp= temp.rename(columns={'host country': 'country'})
dfl = dfl.merge(temp, how='right').dropna()
dfl = dfl.merge(lu, how='right').dropna()

In [21]:

dfl['away'] = np.array(dfl['Medal']/dfl['Number of participation']).astype(int)
dfl['home'] = np.array(dfl['Medal']/dfl['Number games hosted']).astype(int)
dfl['home'][dfl['host game']==False] =0
dfl['home'][dfl['host game']==False] =0
dfl['Number of participation'][dfl['host game']==True] =0
dfl

Unnamed: 0,country,host game,Medal,Number games hosted,Number of participation,away,home
0,Australia,False,755.0,1,53,14,0
1,Australia,True,93.0,1,0,1,93
2,Austria,False,486.0,2,77,6,0
3,Austria,True,57.0,2,0,0,28
4,Belgium,False,262.0,1,59,4,0
5,Belgium,True,42.0,1,0,0,42
6,Brazil,False,193.0,1,33,5,0
7,Brazil,True,19.0,1,0,0,19
8,Canada,False,783.0,2,86,9,0
9,Canada,True,41.0,2,0,0,20


In [22]:
slop=dfl[dfl['country'].isin(list(['United Kingdom', 'Germany', 'United states','China','France']))]

In [23]:
import plotly.graph_objects as go
fig = go.Figure(go.Scatter(x=['Hosting a game', 'Not hosting a game'],y=[slop[slop['host game']==True]['home'].iloc[0], 
                              slop[slop['host game']==False]['away'].iloc[0]]
                           , mode='lines+markers+text',
                           text=[slop[slop['host game']==True]['country'].iloc[0],slop[slop['host game']==True]['country'].iloc[0]],
                           textposition=['middle right', 'middle left']))
for i in range(1,len(slop[slop['host game']==False]['Medal'])):
    fig.add_trace(go.Scatter(x=['Hosting a game', 'Not hosting a game'],
                             y=[slop[slop['host game']==True]['home'].iloc[i], 
                              slop[slop['host game']==False]['away'].iloc[i]], mode='lines+markers+text', 
                           text=[slop[slop['host game']==True]['country'].iloc[i],
                                 slop[slop['host game']==True]['country'].iloc[i]],
                           textposition=['middle right', 'middle left']))
    
fig.add_shape(type='line', x0=0, x1=0, y0=0, y1=1, xref='x', yref='paper')
fig.add_shape(type='line', x0=0, x1=0, y0=0, y1=1, xref='x', yref='paper')
fig.add_shape(type='line', x0=1, x1=1, y0=0, y1=1, xref='x', yref='paper')
fig.show()

In [24]:
sub1 = dfl[dfl['host game']==True][['country', 'host game','home']]
sub2 = sub = dfl[dfl['host game']==False][['country', 'host game','away']]
sub2= sub2.rename(columns={"away": 'average medals won'})
sub1= sub1.rename(columns={"home": 'average medals won'})
sub=pd.concat([sub1,sub2])

In [25]:
slopt=sub[sub['country'].isin(list(['United Kingdom', 'Germany', 'United States','Italy','China','France', 'Japan','Canada']))]

In [26]:
slopt1 = slopt[slopt['host game']==False]
slopt2 = slopt[slopt['host game']==True]
# slopt['host game'] = slopt[slopt['host game']==True]='Home'

In [27]:
slopt1['host game'] = 'Away'
slopt2['host game'] = 'Home'
slopt=pd.concat([slopt2,slopt1])


#  Does hosting the Olympic games and the country's economy affect its performance?

In [28]:

custom_template = {
    "layout": go.Layout(
        font={
            "family": 'Open Sans',
            "size": 14,
            "color": "black",
        },
        title={
            "font": {
                "family": 'Open Sans',
                "size": 22,
                "color": "black",
            },
        },
        plot_bgcolor="#ffffff",
        paper_bgcolor="#ffffff",
        colorway=px.colors.qualitative.G10,
    )
}
def format_title(title, subtitle=None, subtitle_font_size=15):
    title = f'<b>{title}</b>'
    if not subtitle:
        return title
    subtitle = f'<span style="font-size: {subtitle_font_size}px;">{subtitle}</span>'
    return f'{title}<br>{subtitle}'

In [29]:
fig = px.bar(slopt, x="country", y="average medals won",color='host game',
             barmode='group',text= "average medals won",color_discrete_sequence=[
                 "#00b0d1", "#73c25c"],
labels={ # replaces default labels by column name
                "host game": "Location of medal acquisition",
            })
fig.update_layout(legend=dict(
    yanchor="bottom",
    y=0.7,
    xanchor="right",
    x=1.09
))
fig.update_layout(template=custom_template,)
fig.update_yaxes(title_text='')
fig.update_layout( title=format_title('The Olympic Host Advantage: A Boost in National Medal Counts', 
                                      "Medal Count Ratio: A measure of a country's Olympic performance, calculated as the number of medals won divided by the number of Olympic participations (Data collected between 1896-2021)")
)
fig.update_xaxes(title_text='')
fig.update_traces(textposition='outside')
fig.update_layout(
    annotations=[
        dict(
            text="Data source: https://www.kaggle.com/datasets/josephcheng123456/olympic-historical-dataset-from-olympediaorg?select=Olympic_Athlete_Bio.csv",
            xref="paper",
            yref="paper",
            x=0.03,
            y=-.16,
            showarrow=False
        ),
#         dict(
#             text="Medal count ratio (medals won/participations)",
#             xref="paper",
#             yref="paper",
#             x=-.1,
#             y=1.06,
#             showarrow=False
#         )
    ]
)
fig.update_layout(yaxis_visible=False)
fig.update_layout(xaxis={'categoryorder':'max descending'})
# fig.update_xaxes(categoryorder='category ascending')
fig.show()
pio.write_image(fig, 'hosteffet.pdf', width=1400, height=600, scale=5)

In [51]:
df.index[df['city']=='Paris'].tolist()[0]


1

In [50]:
df

Unnamed: 0,edition,edition_id,edition_url,season,year,city,country_flag_url,country_noc,start_date,end_date,isHeld,competition_start_date,competition_end_date
0,1896 Summer Olympics,1,/editions/1,Summer,1896,Athina,/images/flags/GRE.png,GRE,1896-04-06,1896-04-15,na,1896-04-06,1896-04-13
1,1900 Summer Olympics,2,/editions/2,Summer,1900,Paris,/images/flags/FRA.png,FRA,na,na,na,1900-05-14,1900-10-28
2,1904 Summer Olympics,3,/editions/3,Summer,1904,St. Louis,/images/flags/USA.png,USA,1904-05-14,na,na,1904-07-01,1904-11-23
3,1908 Summer Olympics,5,/editions/5,Summer,1908,London,/images/flags/GBR.png,GBR,1908-07-13,na,na,1908-04-27,1908-10-31
4,1912 Summer Olympics,6,/editions/6,Summer,1912,Stockholm,/images/flags/SWE.png,SWE,1912-07-06,1912-07-27,na,1912-05-05,1912-07-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,2010 Winter Olympics,57,/editions/57,Winter,2010,Vancouver,/images/flags/CAN.png,CAN,2010-02-12,2010-02-28,na,2010-02-12,2010-02-28
58,2014 Winter Olympics,58,/editions/58,Winter,2014,Sochi,/images/flags/RUS.png,RUS,2014-02-07,2014-02-23,na,2014-02-06,2014-02-23
59,2018 Winter Olympics,60,/editions/60,Winter,2018,PyeongChang,/images/flags/KOR.png,KOR,2018-02-09,2018-02-25,na,2018-02-08,2018-02-25
60,2022 Winter Olympics,62,/editions/62,Winter,2022,Beijing,/images/flags/CHN.png,CHN,2022-02-04,2022-02-20,na,2022-02-02,2022-02-20


In [41]:

df[df['city']=='Paris']['edition_id'] = df[df['city']=='Paris']['edition_id'].replace(temp,temp+1)

In [42]:
df[df['city']=='Paris']['edition_id']

1      2
7      8
32    63
Name: edition_id, dtype: int64

## b) Top 12 world countries that have won more medial and hosted at least one Olympic game.

In [30]:
import numpy as np
pg =df4[df4['country'].isin(list(seasoncount['host country']))]
pg= pg.rename(columns={"country": 'host country'})
l = pg.merge(seasoncount, how='right').dropna()
GPD=gpd[gpd['Country Name'].isin(list(seasoncount['host country']))]
GPD = GPD.sort_values(by=['2021'],ascending=False)
pfd = pd.DataFrame({'host country': GPD['Country Name'], 'GDP': np.log10(GPD["2021"]),'GDP rank':[i for i in range(1,len(GPD['Country Name'])+1)]})

# coustomized plot:

In [31]:
def format_title(title, subtitle=None, subtitle_font_size=12):
    title = f'<b>{title}</b>'
    if not subtitle:
        return title
    subtitle = f'<span style="font-size: {subtitle_font_size}px;">{subtitle}</span>'
    return f'{title}<br>{subtitle}'

In [32]:

import pycountry

ll = l.merge(pfd, how='right').dropna()
ll['Number games hosted']=ll['Number games hosted'].values.astype(int)
ll =ll.sort_values(by=['Medal'],ascending=False)
lo = ll

iso3_to_iso2 = {c.alpha_3: c.alpha_2 for c in pycountry.countries}

df = px.data.gapminder().query("year==2007")
df["iso_alpha2"] = df["iso_alpha"].map(iso3_to_iso2)
ll = ll.rename(columns={"host country": 'country'})
df=ll.merge(df,how='right').dropna()

fig = px.scatter(
    df,y="Medal", x="GDP",text='country',
    hover_name="country",
   template=custom_template
)

for i, row in df.iterrows():
    country_iso = row["iso_alpha2"]
    fig.add_layout_image(
        dict(
            source=f"https://raw.githubusercontent.com/matahombres/CSS-Country-Flags-Rounded/master/flags/{country_iso}.png",
            xref="x",
            yref="y",
            xanchor="center",
            yanchor="middle",
            y=row["Medal"],
            x=row["GDP"],
            sizex = 300,
            sizey=350,
            sizing="contain",
            opacity=1,
            layer="above"
        )
    )
fig.update_traces(textposition='top left')
fig.update_xaxes(title_text='')
fig.update_layout(
    annotations=[
        dict(
            text="Data source: https://www.kaggle.com/datasets/josephcheng123456/olympic-historical-dataset-from-olympediaorg?select=Olympic_Athlete_Bio.csv",
            xref="paper",
            yref="paper",
            x=-0.01,
            y=-.16,
            showarrow=False
        )
    , dict(
            text="Number of medals won",
            xref="paper",
            yref="paper",
            x=-.05,
            y=1.05,
            showarrow=False
        ),
    dict(
            text=" Log10 GDP Scale in 2021 ( U.S. dollars)",
            xref="paper",
            yref="paper",
            y=-.09,
            x=1.01,
            showarrow=False
        )]
    
)
fig.update_layout( title=format_title("Japan and China's Olympic Results Contradict the Trend of GDP Positively Impacting Medal Count",
                                      "Data collected between 1896-2021")
)
fig.update_yaxes(title_text='')
# fig.update_xaxes(type="log")
fig.show()
pio.write_image(fig, 'correlation_world.pdf', width=1100, height=600, scale=10)

Japan's Olympic Underperformance: A Deviation from the positive Correlation between Medals and GDP" is a title that suggests that there is generally a strong positive correlation between the number of Olympic medals won and the GDP ranking of countries. In this case, Japan is experiencing "Olympic underperformance" compared to other top wealth countries, meaning that it has won fewer Olympic medals than expected given its high GDP ranking. This deviation from the expected relationship could potentially be a result of various factors such as the country's sporting culture, funding for athletes, or other variables. The title implies that Japan's lower Olympic medal count is unusual or unexpected given the negative correlation between medals and GDP ranking, and thus warrants further investigation or explanation.


We can also notice that China is not there. 
China has won a significant number of medals in the Olympic Games, but it has not always been the most successful country in terms of medal count. One reason for this may be that China only began participating in the Olympics in the 20th century and did not compete in the early games. Additionally, other countries such as the United States and Soviet Union have had a longer history of participation and investment in sports. Furthermore, China's focus on economic and industrial development may have been prioritized over sports development in the past, leading to less success in the Olympics.


In [33]:
df[['country','Medal','Number games hosted','GDP rank','continent','iso_alpha2']].to_csv('data/top_country_rank.csv')

 For example, you might use color to highlight specific columns for storytelling. Colors can also be used if they are meaningful for the categories posted (e.g. to match company or team colors).

## c) Top African countries that have won more medail 

**If the Olympic performance of the riches countries is affected by the place where the games were organised. Therefore what about the country that can not organise the Olympic games?**

In [34]:


import pycountry

# pga =gpd[gpd['Country Name'].isin(['Cameroon','Nigeria','Algeria','South Africa','Kenya','Ethiopia','Ghana'])]
pga =gpd[gpd['Country Name'].isin(list(list_africa['Country']))]
pga = pga.sort_values(by=['2021'],ascending=False)
fda = pd.DataFrame({'country':list(pga['Country Name']),'gdp':np.log10(pga['2021']), 'Africa GDP rank':[i for i in range(1,len(pga)+1)]})
tmp = newdf.groupby(['country'])['Medal number'].sum()
df6 = pd.DataFrame(data={'Medal': tmp.values}, index=tmp.index).reset_index()
df6 = df6.merge(fda, how='right').dropna()


ll= df6.sort_values(by=['Medal'],ascending=False)
ll=ll.head(14)
iso3_to_iso2 = {c.alpha_3: c.alpha_2 for c in pycountry.countries}

df = px.data.gapminder().query("year==2007")
df["iso_alpha2"] = df["iso_alpha"].map(iso3_to_iso2)
ll = ll.rename(columns={"host country": 'country'})
df=ll.merge(df,how='right').dropna()

fig = px.scatter(
    df,y="Medal", x="gdp",text='country',
    hover_name="country",
    hover_data=["Medal", "gdp"],template=custom_template
)
# fig.update_traces(marker_color="rgba(0,0,0,0)")

minDim = df[["Medal", "gdp"]].max().idxmax()
maxi = df[minDim].max()
for i, row in df.iterrows():
    country_iso = row["iso_alpha2"]
    fig.add_layout_image(
        dict(
            source=f"https://raw.githubusercontent.com/matahombres/CSS-Country-Flags-Rounded/master/flags/{country_iso}.png",
            xref="x",
            yref="y",
            xanchor="center",
            yanchor="middle",
            y=row["Medal"],
            x=row["gdp"],
            sizex = 10,
            sizey=10,
            sizing="contain",
            opacity=1,
            layer="above"
        )
    )
fig.update_traces(textposition='top right')
# fig.update_xaxes(title_text='',tickmode='linear')

fig.update_layout( title=format_title("African Countries' GDP is Positively Linked to their Olympic Medal Performance",
                                      "Ethiopia, Kenya, and South Africa present an exception to this correlation (Data collected between 1896-2021)")
)


fig.update_layout(
    annotations=[
        dict(
            text="Data source: https://www.kaggle.com/datasets/josephcheng123456/olympic-historical-dataset-from-olympediaorg?select=Olympic_Athlete_Bio.csv",
            xref="paper",
            yref="paper",
            x=0.01,
            y=-.16,
            showarrow=False
        ),
        dict(
            text="Number of medals won",
            xref="paper",
            yref="paper",
            x=-.08,
            y=1.03,
            showarrow=False
        )
        ,
        dict(
            text="Log10 GDP Scale in 2021 ( U.S. dollars)",
            xref="paper",
            yref="paper",
            y=-.08,
            x=1.01,
            showarrow=False
        )
    ]
)
fig.update_yaxes(title_text='')
fig.update_xaxes(title_text='')
fig.update_layout( xaxis_range=[10, 11.8])
fig.show()


pio.write_image(fig, 'african_nation.pdf', width=1100, height=600, scale=10)


This title means that there is a positive correlation between the GDP (Gross Domestic Product) of African countries and their performance in the Olympics in terms of the number of medals they win. This means that as the GDP of a country increases, the number of Olympic medals that country wins also tends to increase.

It is important to note that correlation does not imply causality, so this statement does not indicate that GDP is the cause of Olympic success, but rather that there is a relationship between the two factors.


GDP is a measure of the size of a country's economy and is often used as an indicator of a country's overall wealth and prosperity. Countries with higher GDPs tend to have more resources available to invest in sports and athlete development, which can lead to better results in international competitions like the Olympics.

On the other hand, countries with lower GDPs may have fewer resources available to invest in sports and athlete development, which can make it more difficult for them to compete at the highest levels. As a result, it is not surprising that there is a strong correlation between Olympic medals and GDP ranking for African countries ranked 10th and lower in terms of GDP.

In [35]:
df[['country','Medal','Africa GDP rank','continent','iso_alpha2']].to_csv('data/african_medals.csv')

**why kenya have won more medal in olympic games compare to nigeria?**

There are a few reasons why Kenya has won more medals in the Olympic Games compared to Nigeria. Some possible reasons include:

 1. Kenya has a strong tradition of success in athletics, especially in distance running events, which tend to be some of the most highly contested events at the Olympics.

 2. Kenya has a well-developed sports infrastructure and a national sports program that supports and trains athletes from a young age. This allows them to develop the skills and experience needed to compete at the highest levels.

 3. Kenya has a larger population than Nigeria, which means they have a greater pool of potential athletes to choose from. This increases their chances of finding talented athletes who can compete at the Olympics.

 4. Kenya has been participating in the Olympics for a longer period of time than Nigeria, which means they have had more opportunities to win medals.

Overall, Kenya's success in the Olympics is likely due to a combination of these factors, as well as the dedication and hard work of their athletes.