# Notebook for Creating the Final Medal Dataset
This notebook creates the final dataset for the medal-winners side of the data. It will contain all the information about the athletes, their NOC's information (e.g. number of athletes sent and medal rankings), and the information about that year's Olympic Games (e.g. number of athletes, location, and summer/winter).

In [1]:
import os.path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline
from bs4 import BeautifulSoup
import webbrowser
import urllib.request
from lxml import html
import zipfile
import re
import string
import sys, os
from IPython.display import Image

# Adding in the Athlete numbers from each country 
The final fields we want to add to this medal dataframe are the numbers of male and female athletes that each NOC sent to each Olympics.

In [2]:
# Ensure both input files exist before the cells below try to load them.
# The paths are built with os.path.join so the notebook also runs on
# non-Windows machines; on Windows the resulting strings are identical to
# the original backslash literals.
for input_csv in ("Games-900.csv", "Games-600.csv"):
    input_path = os.path.join("..", "..", "data", "prep", "Games", input_csv)
    if not os.path.exists(input_path):
        # Name the missing file so the reader knows which input to regenerate
        print("Missing dataset file:", input_path)

In [3]:
# Read the medal csv into a dataframe.
# os.path.join keeps the relative path valid on every OS (on Windows it
# produces exactly the original backslash-separated string).
currMedaldf = pd.read_csv(
    os.path.join("..", "..", "data", "prep", "Games", "Games-900.csv"),
    encoding="ISO-8859-1",
)

In [4]:
# Read the athlete number csv into a dataframe.
# os.path.join keeps the relative path valid on every OS (on Windows it
# produces exactly the original backslash-separated string).
athNumdf = pd.read_csv(
    os.path.join("..", "..", "data", "prep", "Games", "Games-600.csv"),
    encoding="ISO-8859-1",
)

In [5]:
# Display the column labels of the medal dataframe that was just loaded
currMedaldf.columns

Index(['Year', 'Host_City', 'Host_Country', 'Total_Males', 'Total_Females',
       'Total_Athletes', 'Summer', 'Winter', 'Discipline', 'Sport', 'Ath_Name',
       'Gender', 'NOC', 'Home_Adv', 'Gold', 'Silver', 'Bronze', 'Total_Medals',
       'NOC_Gold', 'NOC_Silver', 'NOC_Bronze', 'NOC_Total_Medals',
       'NOC_Rating', 'NOC_Rank', 'Ath_Rating', 'Ath_Rank'],
      dtype='object')

In [6]:
# Display the column labels of the athlete-number dataframe that was just loaded
athNumdf.columns

Index(['NOC', 'M', 'F', 'Total', 'Year', 'Host_Country', 'Host_City', 'Summer',
       'Winter'],
      dtype='object')

In [7]:
# Give the NOC athlete-count columns explicit "NOC_..._Sent" names so they
# are distinguishable from the other columns in the final data frame
athNumdf.rename(
    columns={
        'M': 'NOC_Males_Sent',
        'F': 'NOC_Females_Sent',
        'Total': 'NOC_Total_Sent',
    },
    inplace=True,
)

# Joining the Athlete Numbers to the Medal DataFrame
We want to add the number of athletes sent to each Olympics by each country, split into male, female and total. 
I'll set the index of the two tables to Year, Host_City, Host_Country and, most importantly, NOC, so that all the athlete numbers are matched with the right rows in the medal dataframe. 

In [8]:
# Put the same four key columns into the index of both tables so that they
# can be joined on that index.
# NOTE: after this cell the key columns live in the index, so running the
# cell a second time on the same kernel raises a KeyError.
join_keys = ['Year', 'Host_Country', 'Host_City', 'NOC']
athNumdf = athNumdf.set_index(join_keys)
currMedaldf = currMedaldf.set_index(join_keys)

In [9]:
# Both frames now share the same index, so an index join attaches each NOC's
# athlete counts to the matching medal rows.
# Every remaining medal column is selected except Summer and Winter: athNumdf
# carries columns with those same names, so they come from the athNumdf side.
medal_columns = [
    'Total_Males', 'Total_Females', 'Total_Athletes',
    'Discipline', 'Sport', 'Ath_Name', 'Gender', 'Home_Adv',
    'Gold', 'Silver', 'Bronze', 'Total_Medals',
    'NOC_Gold', 'NOC_Silver', 'NOC_Bronze', 'NOC_Total_Medals',
    'NOC_Rating', 'NOC_Rank',
    'Ath_Rating', 'Ath_Rank',
]
joined = currMedaldf[medal_columns].join(athNumdf)
# Turn the index levels back into ordinary columns
Medaldf = joined.reset_index()

In [10]:
# Reorder the columns: games fields first, then athlete fields, then NOC fields
final_column_order = [
    # games
    'Year', 'Host_Country', 'Host_City', 'Summer', 'Winter',
    'Total_Males', 'Total_Females', 'Total_Athletes',
    # athlete
    'Discipline', 'Sport', 'Ath_Name', 'Gender', 'Home_Adv',
    'Gold', 'Silver', 'Bronze', 'Total_Medals',
    'Ath_Rating', 'Ath_Rank',
    # NOC
    'NOC', 'NOC_Males_Sent', 'NOC_Females_Sent', 'NOC_Total_Sent',
    'NOC_Gold', 'NOC_Silver', 'NOC_Bronze', 'NOC_Total_Medals',
    'NOC_Rating', 'NOC_Rank',
]
Medaldf = Medaldf[final_column_order]

In [11]:
# Display the column labels of the final dataframe to confirm the new order
Medaldf.columns

Index(['Year', 'Host_Country', 'Host_City', 'Summer', 'Winter', 'Total_Males',
       'Total_Females', 'Total_Athletes', 'Discipline', 'Sport', 'Ath_Name',
       'Gender', 'Home_Adv', 'Gold', 'Silver', 'Bronze', 'Total_Medals',
       'Ath_Rating', 'Ath_Rank', 'NOC', 'NOC_Males_Sent', 'NOC_Females_Sent',
       'NOC_Total_Sent', 'NOC_Gold', 'NOC_Silver', 'NOC_Bronze',
       'NOC_Total_Medals', 'NOC_Rating', 'NOC_Rank'],
      dtype='object')

In [12]:
# Write the final medal dataset to disk without the positional row index.
# os.path.join keeps the relative path valid on every OS (on Windows it
# produces exactly the original backslash-separated string).
# NOTE(review): the inputs were read with encoding="ISO-8859-1" but no encoding
# is passed here, so pandas' default is used — confirm that whatever reads
# Games-950.csv uses a matching encoding.
Medaldf.to_csv(
    os.path.join("..", "..", "data", "prep", "Games", "Games-950.csv"),
    index=False,
)