# Lede Summer 2019 Project - Part 1
## Make a database of female and male Paralympic all-time medallists (summer)
* get years only, not start_year and end_year

In [1]:
import requests
import pandas as pd
import re
import numpy as np
import os

import itertools

from bs4 import BeautifulSoup
from selenium import webdriver

#### LINKS:
Summer:
https://www.paralympic.org/sdms4/hira/web/multiMedallists/type/summer

Winter:
https://www.paralympic.org/sdms4/hira/web/multiMedallists/type/winter

In [2]:
url = 'https://www.paralympic.org/sdms4/hira/web/multiMedallists/type/summer'
response = requests.get(url)
para_doc = BeautifulSoup(response.text)

### Get names of paralympic athletes

In [3]:
# Get athlete names (full name)
para_info = para_doc.find_all('tr')
full_name = []

for athlete in para_info:
    try:
        full_name.append(athlete.find_all('td')[1].string)
    except:
        pass

In [None]:
# full_name

In [4]:
# Get athlete LAST names
para_info = para_doc.find_all('tr')
last_names = []

for name in full_name:
    last_names.append(re.findall(r'([A-Z]+[ -]?[A-Z]{2,}).*', name)[0])

# last_names

In [5]:
# Get athlete FIRST names
para_info = para_doc.find_all('tr')
first_names = []

for athlete in para_info:
    try:
        first_names.append(athlete.find_all('td')[1].string.split(' ')[-1])
    except:
        pass
    
# first_names

### Get citizenship (country code) of paralympic athletes

In [6]:
# Get citizenships of paralympic athletes

para_info = para_doc.find_all('tr')
para_citizenship = []

for athlete in para_info:
    try:
        citizenship = athlete.find_all('td')[3]
        no_citizenship = len(citizenship.find_all('a'))    
        if no_citizenship == 1:
            para_citizenship.append(citizenship.text)

    ## Check for multinationals
        elif no_citizenship == 2:
            multi = []
            multi.append(citizenship.text[:3])
            multi.append(citizenship.text[3:])
            para_citizenship.append(multi)
        else:
            print('Not single or dual citizenship, or there is some other problem')
    except:
        pass
    
# para_citizenship

### Get the year/s during which the paralympic medallists competed

In [5]:
para_info = para_doc.find_all('tr')

years_dict = {}
years = []
for athlete in para_info:
    try:
        year = athlete.find_all('td')[4].string
        years.append(year)
    except:
        pass
# years

### Get total number of medals for paralympic athletes

In [8]:
para_info = para_doc.find_all('tr')
medals_total = []

for athlete in para_info:
    try:
        medals_total.append(athlete.find_all('td')[8].text)
    except:
        pass

# medals_total

### Gold medals for paralympic athletes 

In [9]:
para_info = para_doc.find_all('tr')
medals_gold = []

for athlete in para_info:
    try:
        medals_gold.append(athlete.find_all('td')[5].text)
    except:
        pass

# medals_gold

### Silver medals for paralympic athletes 

In [10]:
para_info = para_doc.find_all('tr')
medals_silver = []

for athlete in para_info:
    try:
        medals_silver.append(athlete.find_all('td')[6].text)
    except:
        pass

# medals_silver

### Bronze medals for paralympic athletes 

In [11]:
para_info = para_doc.find_all('tr')
medals_bronze = []

for athlete in para_info:
    try:
        medals_bronze.append(athlete.find_all('td')[7].text)
    except:
        pass

# medals_bronze

### Assign gender for para athletes

In [None]:
para_info = para_doc.find_all('tr')
gender = []

for athlete in para_info:
    try:
        gender.append(athlete.find_previous('h2').text.strip())
    except:
        pass

# gender

### Put all para info into one list of dictionaries, then into a dataframe

In [13]:
para_info = para_doc.find_all('tr')
rows = []

for athlete in para_info:
    row = {}
    try:
        full_name = athlete.find_all('td')[1].string.strip()
        row['full_name'] = full_name
        row['last_name'] = re.findall(r'([A-Z]+[ -]?[A-Z]{2,}).*', full_name)[0]
        row['first_name'] = athlete.find_all('td')[1].string.split(' ')[-1]
        
        row['medals_total'] = athlete.find_all('td')[8].text.strip()
        row['medals_gold'] = athlete.find_all('td')[5].text.strip()
        row['medals_silver'] = athlete.find_all('td')[6].text.strip()
        row['medals_bronze'] = athlete.find_all('td')[7].text.strip()
        
        row['gender'] = athlete.find_previous('h2').text.strip()
        row['other_info'] = ''
        row['alternate_name'] = ''
        
        years = athlete.find_all('td')[4].string
        row['years'] = years

        citizenship = athlete.find_all('td')[3]
        no_citizenship = len(citizenship.find_all('a'))    
        if no_citizenship == 1:
            row['citizenship'] = citizenship.text.strip()

    ## Check for multinationals
        elif no_citizenship == 2:
            if 'GER' in citizenship.text.strip():
                row['citizenship'] = citizenship.text[:3]
            else:
                multi = []
                multi.append(citizenship.text[:3])
                multi.append(citizenship.text[3:])
                row['citizenship'] = multi
        else:
            row['citizenship'] = 'Not single or dual citizenship, or there is some other problem'
    except:
        pass
    rows.append(row)

rows

[{},
 {'full_name': 'JAKOBSSON Jonas',
  'last_name': 'JAKOBSSON',
  'first_name': 'Jonas',
  'medals_total': '27',
  'medals_gold': '17',
  'medals_silver': '2',
  'medals_bronze': '8',
  'gender': 'Men',
  'other_info': '',
  'alternate_name': '',
  'years': '1980-2012',
  'citizenship': 'SWE'},
 {'full_name': 'MARSON Roberto',
  'last_name': 'MARSON',
  'first_name': 'Roberto',
  'medals_total': '26',
  'medals_gold': '16',
  'medals_silver': '7',
  'medals_bronze': '3',
  'gender': 'Men',
  'other_info': '',
  'alternate_name': '',
  'years': '1964-1976',
  'citizenship': 'ITA'},
 {'full_name': 'KENNY Mike',
  'last_name': 'KENNY',
  'first_name': 'Mike',
  'medals_total': '16',
  'medals_gold': '16',
  'medals_silver': '0',
  'medals_bronze': '0',
  'gender': 'Men',
  'other_info': '',
  'alternate_name': '',
  'years': '1976-1988',
  'citizenship': 'GBR'},
 {'full_name': 'DIAS Daniel',
  'last_name': 'DIAS',
  'first_name': 'Daniel',
  'medals_total': '24',
  'medals_gold': '14',

In [14]:
# Remove empty rows
rows = list(filter(None,rows))

In [15]:
df_summer = pd.DataFrame(rows)
df_summer

Unnamed: 0,alternate_name,citizenship,first_name,full_name,gender,last_name,medals_bronze,medals_gold,medals_silver,medals_total,other_info,years
0,,SWE,Jonas,JAKOBSSON Jonas,Men,JAKOBSSON,8,17,2,27,,1980-2012
1,,ITA,Roberto,MARSON Roberto,Men,MARSON,3,16,7,26,,1964-1976
2,,GBR,Mike,KENNY Mike,Men,KENNY,0,16,0,16,,1976-1988
3,,BRA,Daniel,DIAS Daniel,Men,DIAS,3,14,7,24,,2008-2016
4,,SUI,Heinz,FREI Heinz,Men,FREI,6,14,6,26,,1984-2012
5,,SUI,Franz,NIETLISPACH Franz,Men,NIETLISPACH,2,14,6,22,,1980-2004
6,,CAN,Michael,EDGSON Michael,Men,EDGSON,0,14,2,16,,1984-1992
7,,AUS,Matthew,COWDREY Matthew,Men,COWDREY,3,13,7,23,,2004-2012
8,,NOR,Erling,TRONDSEN Erling,Men,TRONDSEN,1,13,6,20,,1976-1992
9,,USA,Bart,DODSON Bart,Men,DODSON,4,13,3,20,,1984-2000


In [17]:
df_summer.to_csv('para_summer.csv', index=False)

## Manually edit the csv to create duplicate rows for HARRIMAN Margaret (double citizen)
* Make duplicate row for Margaret Harriman -- one with full_name as HARRIMAN Margaret (RHO) and the other as HARRIMAN Margaret (RSA) so that the dataframes merge properly when joining with the df of country codes
* Manually add the comment 'Also competed representing Rhodesia' or 'Also competed representing South Africa' in the other_info column
* Re-import the cleaned csv

In [20]:
df_summer_clean = pd.read_csv('para_summer_cleaned.csv')

In [21]:
df_summer_clean.shape

(41, 12)

## The sport event is missing from this website!
* Add events for every athlete in a new text file, making sure to name Harriman's duplicate rows as HARRIMAN Margaret (RHO) or HARRIMAN Margaret (RSA) so dataframes merge properly

In [22]:
event = pd.read_csv('para_summer_events.txt', sep=',')

In [23]:
event.shape

(41, 2)

In [24]:
merged = df_summer_clean.merge(event, how='inner', left_on='full_name', right_on='name')
merged = merged.drop(columns = 'name')

In [25]:
merged.shape

(41, 13)

In [26]:
merged

Unnamed: 0,alternate_name,citizenship,first_name,full_name,gender,last_name,medals_bronze,medals_gold,medals_silver,medals_total,other_info,years,event
0,,SWE,Jonas,JAKOBSSON Jonas,Men,JAKOBSSON,8,17,2,27,,1980-2012,Para shooting
1,,ITA,Roberto,MARSON Roberto,Men,MARSON,3,16,7,26,,1964-1976,Wheelchair fencing
2,,GBR,Mike,KENNY Mike,Men,KENNY,0,16,0,16,,1976-1988,Para swimming
3,,BRA,Daniel,DIAS Daniel,Men,DIAS,3,14,7,24,,2008-2016,Para swimming
4,,SUI,Heinz,FREI Heinz,Men,FREI,6,14,6,26,,1984-2012,Para athletics
5,,SUI,Franz,NIETLISPACH Franz,Men,NIETLISPACH,2,14,6,22,,1980-2004,Para athletics | handcycling
6,,CAN,Michael,EDGSON Michael,Men,EDGSON,0,14,2,16,,1984-1992,Para swimming
7,,AUS,Matthew,COWDREY Matthew,Men,COWDREY,3,13,7,23,,2004-2012,Para swimming
8,,NOR,Erling,TRONDSEN Erling,Men,TRONDSEN,1,13,6,20,,1976-1992,Para swimming
9,,USA,Bart,DODSON Bart,Men,DODSON,4,13,3,20,,1984-2000,Para athletics


In [27]:
merged['game_type'] = 'Paralympic'
merged['season'] = 'Summer'

## Clean full_name for Paralympic athletes so that it's formatted as first_name LAST_NAME


In [28]:
merged['full_name'] = merged.first_name + ' ' + merged.last_name
merged

Unnamed: 0,alternate_name,citizenship,first_name,full_name,gender,last_name,medals_bronze,medals_gold,medals_silver,medals_total,other_info,years,event,game_type,season
0,,SWE,Jonas,Jonas JAKOBSSON,Men,JAKOBSSON,8,17,2,27,,1980-2012,Para shooting,Paralympic,Summer
1,,ITA,Roberto,Roberto MARSON,Men,MARSON,3,16,7,26,,1964-1976,Wheelchair fencing,Paralympic,Summer
2,,GBR,Mike,Mike KENNY,Men,KENNY,0,16,0,16,,1976-1988,Para swimming,Paralympic,Summer
3,,BRA,Daniel,Daniel DIAS,Men,DIAS,3,14,7,24,,2008-2016,Para swimming,Paralympic,Summer
4,,SUI,Heinz,Heinz FREI,Men,FREI,6,14,6,26,,1984-2012,Para athletics,Paralympic,Summer
5,,SUI,Franz,Franz NIETLISPACH,Men,NIETLISPACH,2,14,6,22,,1980-2004,Para athletics | handcycling,Paralympic,Summer
6,,CAN,Michael,Michael EDGSON,Men,EDGSON,0,14,2,16,,1984-1992,Para swimming,Paralympic,Summer
7,,AUS,Matthew,Matthew COWDREY,Men,COWDREY,3,13,7,23,,2004-2012,Para swimming,Paralympic,Summer
8,,NOR,Erling,Erling TRONDSEN,Men,TRONDSEN,1,13,6,20,,1976-1992,Para swimming,Paralympic,Summer
9,,USA,Bart,Bart DODSON,Men,DODSON,4,13,3,20,,1984-2000,Para athletics,Paralympic,Summer


In [29]:
merged.to_csv('para_summer_complete.csv', index=False)