### Converting raw data to processed data

In [153]:
import pandas as pd 
import numpy as np 
import os

from utils import * 

def forecasting():

    # reading data
    df_2012_summer = pd.DataFrame(pd.read_csv(os.path.join("/kaggle", Y_2012, "summer.csv")))
    df_2012_winter = pd.DataFrame(pd.read_csv(os.path.join("/kaggle", Y_2012, "winter.csv")))
    df_2012 = pd.concat([df_2012_summer, df_2012_winter])
    df_2012 = df_2012.drop('Athlete', axis=1)
    df_2016 = pd.DataFrame(pd.read_csv(os.path.join("/kaggle", Y_2016, "athletes.csv")))
    df_2020 = pd.DataFrame(pd.read_csv(os.path.join("/kaggle", Y_2020, "medals.csv")))

    # 2020 data changes
    df_2020['gold_medal']=df_2020['medal_code'].apply(lambda x: 1 if x==1 else 0)
    df_2020['silver_medal']=df_2020['medal_code'].apply(lambda x: 1 if x==2 else 0)
    df_2020['bronze_medal']=df_2020['medal_code'].apply(lambda x: 1 if x==3 else 0)
    df_2020 = df_2020[['country_code', 'gold_medal', 'silver_medal', 'bronze_medal']]
    df_2020 = df_2020.groupby('country_code').agg(sum)
    df_2020 = df_2020.reset_index(drop=False)
    df_2020.insert(0, 'year', 2020)
    df_2020['total'] = df_2020['gold_medal'] + df_2020['silver_medal'] + df_2020['bronze_medal']
    print(f'df_2020:\n {df_2020.head()}')
    # 2016 data changes
    df_2016 = df_2016.rename({'nationality': 'country_code', 'gold': 'gold_medal', 'silver': 'silver_medal', 'bronze': 'bronze_medal'}, axis=1)
    df_2016 = df_2016[['country_code', 'gold_medal', 'silver_medal', 'bronze_medal']]
    df_2016 = df_2016.groupby('country_code').agg(sum)
    df_2016 = df_2016.reset_index(drop=False)
    df_2016.insert(0, 'year', 2016)
    df_2016['total'] = df_2016['gold_medal'] + df_2016['silver_medal'] + df_2016['bronze_medal']

    # 2012 data changes
    years = df_2012[df_2012.Year >= 1948].Year.unique()

    df_2012['gold_medal']=df_2012['Medal'].apply(lambda x: 1 if x=='Gold' else 0)
    df_2012['silver_medal']=df_2012['Medal'].apply(lambda x: 1 if x=='Silver' else 0)
    df_2012['bronze_medal']=df_2012['Medal'].apply(lambda x: 1 if x=='Bronze' else 0)
    df_2012 = df_2012.rename({'Country': 'country_code'}, axis=1)
    df_2012 = df_2012[['Year', 'country_code', 'gold_medal', 'silver_medal', 'bronze_medal']]

    df_temp= pd.DataFrame()
    for year in years:
        temp = df_2012[df_2012.Year == year][['country_code', 'gold_medal', 'silver_medal', 'bronze_medal']]
        temp = temp.groupby('country_code').agg(sum)
        temp = temp.reset_index(drop=False)
        temp.insert(0, 'year', year)
        temp['total'] = temp['gold_medal'] + temp['silver_medal'] + temp['bronze_medal']
        df_temp = pd.concat([df_temp, temp])

    # concatinating all the dataframes
    df = pd.concat([df_temp, df_2016, df_2020], axis=0)
    
    return df

if __name__ == "__main__":
    print(forecasting())

df_2020:
    year country_code  gold_medal  silver_medal  bronze_medal  total
0  2020          ARG           0            18            25     43
1  2020          ARM           0             2             2      4
2  2020          AUS          36            27            66    129
3  2020          AUT           1             1             5      7
4  2020          AZE           0             3             4      7
    year country_code  gold_medal  silver_medal  bronze_medal  total
0   1948          ARG           3             8             1     12
1   1948          AUS           2             9             5     16
2   1948          AUT           2             3             7     12
3   1948          BEL           6             6             8     20
4   1948          BRA           0             0            12     12
..   ...          ...         ...           ...           ...    ...
88  2020          UGA           2             1             1      4
89  2020          UKR         

## Top 150 countries will be considered

In [154]:
df = pd.DataFrame(forecasting())

temp = df.groupby('country_code').agg(sum)
temp = temp[['gold_medal', 'silver_medal', 'bronze_medal', 'total']]
temp.head()

temp = temp.sort_values(['gold_medal', 'silver_medal', 'bronze_medal'], ascending= False)
temp = temp.reset_index(drop=False)

countries_to_consider = temp['country_code'].head(100)
countries_to_consider =list(countries_to_consider)
countries_to_consider
df = df[df['country_code'].isin(countries_to_consider)]

df_2020:
    year country_code  gold_medal  silver_medal  bronze_medal  total
0  2020          ARG           0            18            25     43
1  2020          ARM           0             2             2      4
2  2020          AUS          36            27            66    129
3  2020          AUT           1             1             5      7
4  2020          AZE           0             3             4      7


## Visualization on world map

In [155]:
import folium
import geopandas as gpd
import pandas as pd
import branca

url = (
    "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"
)
country_shapes = f"{url}/world-countries.json"
global_polygon = gpd.read_file(country_shapes)

global_polygon.id =  global_polygon.id.apply(lambda x: 'DEN' if x=='DNK' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'IRI' if x=='IRN' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'NED' if x=='NLD' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'POR' if x=='PRT' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'RSA' if x=='ZAF' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'SUI' if x=='CHE' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'BUL' if x=='BGR' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'GER' if x=='DEU' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'GRE' if x=='GRC' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'MGL' if x=='MNG' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'SLO' if x=='HRV' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'LAT' if x=='LVA' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'NGR' if x=='NGA' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'INA' if x=='IDN' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'PUR' if x=='PRI' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'TTO' if x=='TRI' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'ALG' if x=='DZA' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'PHI' if x=='PHL' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'ZIM' if x=='ZWE' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'CRC' if x=='CRI' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'VIE' if x=='VNM' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'FIJ' if x=='FJI' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'KOS' if x=='-99' else x)
global_polygon.id =  global_polygon.id.apply(lambda x: 'CRO' if x=='SLO' else x)

df['id'] = df['country_code']
df = df.merge(global_polygon, on='id', how='inner')

In [156]:
df

Unnamed: 0,year,country_code,gold_medal,silver_medal,bronze_medal,total,id,name,geometry
0,1948,ARG,3,8,1,12,ARG,Argentina,"MULTIPOLYGON (((-65.50000 -55.20000, -66.45000..."
1,1952,ARG,2,2,2,6,ARG,Argentina,"MULTIPOLYGON (((-65.50000 -55.20000, -66.45000..."
2,1956,ARG,0,1,1,2,ARG,Argentina,"MULTIPOLYGON (((-65.50000 -55.20000, -66.45000..."
3,1960,ARG,0,3,1,4,ARG,Argentina,"MULTIPOLYGON (((-65.50000 -55.20000, -66.45000..."
4,1964,ARG,0,1,0,1,ARG,Argentina,"MULTIPOLYGON (((-65.50000 -55.20000, -66.45000..."
...,...,...,...,...,...,...,...,...,...
1086,2016,KOS,1,0,0,1,KOS,Somaliland,"POLYGON ((48.93813 9.45175, 48.48674 8.83763, ..."
1087,2020,KOS,2,0,0,2,KOS,Northern Cyprus,"POLYGON ((32.73178 35.14003, 32.80247 35.14550..."
1088,2020,KOS,2,0,0,2,KOS,Kosovo,"POLYGON ((20.76216 42.05186, 20.71731 41.84711..."
1089,2020,KOS,2,0,0,2,KOS,Western Sahara,"POLYGON ((-8.79488 27.12070, -8.81783 27.65643..."


In [157]:
# df.to_csv('df.csv')

In [214]:
def visualize(year="whole"):
    if year=="whole":
        temp = df[['gold_medal', 'silver_medal', 'bronze_medal', 'total', 'country_code']].groupby('country_code').agg(sum)
        temp = temp.reset_index(drop=False)
        temp['id'] = temp['country_code']
        temp = temp.merge(global_polygon, on='id', how='inner')
        Temp = gpd.GeoDataFrame(temp)

    elif(len(df[df.year==int(year)])==0):
        return "Olympics was not held at this year"
    
    else:
        Temp = gpd.GeoDataFrame(df[df.year==int(year)])

    

    def rd2(x):
        return round(x, 2)

    minimum, maximum = Temp["total"].quantile([0.05, 0.95])
    mean = round(Temp["total"].mean(), 2)


    colormap = branca.colormap.LinearColormap(
        # colors=["#f2f0f7", "#cbc9e2", "#9e9ac8", "#756bb1", "#54278f"],
        colors=["#b4ffe6", "#3fffbf", "#04ffab", "#00dc92", "#007a51"],
        index=Temp["total"].quantile([0.25, 0.5, 0.85, 0.95]),
        vmin=minimum,
        vmax=maximum,
    )
    colormap.caption = "Total medals"


    def style_function(x):
        return {
            "fillColor": colormap(x["properties"]["total"]),
            "color": "black",
            "weight": 2,
            "fillOpacity": 0.8,
        }

    m = folium.Map(location=[26.8206, 30.8025], zoom_start=2.2)

    Map_Layer = folium.GeoJson(
        Temp,
        name="Hosted",
        style_function=style_function,
        tooltip=folium.GeoJsonTooltip(
            fields=["name","total","gold_medal","silver_medal","bronze_medal"], aliases=["name","Total","Gold","Silver", "Bronze"], localize=True
        ),
    ).add_to(m)

    return m

visualize()

'Olympics was not held at this year'

In [176]:
# Segregation of data according to 2020 Olympics using quantile
df[df["year"]==2020]['total'].quantile([0.25, 0.5, 0.85, 0.95])

0.25      4.0
0.50      9.5
0.85     62.5
0.95    131.0
Name: total, dtype: float64

In [208]:
import gc

del df
gc.collect

<function gc.collect(generation=2)>