# Lede Summer 2019 Project - Part 2
## Make a database of female and male Paralympic all-time medallists (winter)
* get years only, not start_year and end_year

In [1]:
import requests
import pandas as pd
import re
import numpy as np
import os

import itertools

from bs4 import BeautifulSoup
from selenium import webdriver

#### LINKS:
Summer:
https://www.paralympic.org/sdms4/hira/web/multiMedallists/type/summer

Winter:
https://www.paralympic.org/sdms4/hira/web/multiMedallists/type/winter

In [2]:
url = 'https://www.paralympic.org/sdms4/hira/web/multiMedallists/type/winter'
response = requests.get(url)
para_doc = BeautifulSoup(response.text)

### Get names of paralympic athletes

In [4]:
# Get athlete names (full name)
para_info = para_doc.find_all('tr')
full_name = []

for athlete in para_info:
    try:
        full_name.append(athlete.find_all('td')[1].string)
    except:
        pass
# full_name

In [5]:
# Get athlete LAST names
para_info = para_doc.find_all('tr')
last_names = []

for name in full_name:
    last_names.append(re.findall(r'([A-Z]+[ -]?[A-Z]{2,}).*', name)[0])

# last_names

In [7]:
# Get athlete FIRST names
para_info = para_doc.find_all('tr')
first_names = []

for name in full_name:
    first_names.append(re.findall(r'[A-Z]+ (.*)', name)[0])
    
# first_names

### Get citizenship (country code) of paralympic athletes

In [8]:
# Get citizenships of paralympic athletes

para_info = para_doc.find_all('tr')
para_citizenship = []

for athlete in para_info:
    try:
        citizenship = athlete.find_all('td')[3]
        no_citizenship = len(citizenship.find_all('a'))    
        if no_citizenship == 1:
            para_citizenship.append(citizenship.text)

    ## Check for multinationals
        elif no_citizenship == 2:
            multi = []
            multi.append(citizenship.text[:3])
            multi.append(citizenship.text[3:])
            para_citizenship.append(multi)
        else:
            print('Not single or dual citizenship, or there is some other problem')
    except:
        pass
    
# para_citizenship

### Get the year/s during which the paralympic medallists competed

In [1]:
para_info = para_doc.find_all('tr')

years_dict = {}
years = []
for athlete in para_info:
    try:
        year = athlete.find_all('td')[4].string
        years.append(year)
    except:
        pass
# years

NameError: name 'para_doc' is not defined

### Get total number of medals for paralympic athletes

In [12]:
para_info = para_doc.find_all('tr')
medals_total = []

for athlete in para_info:
    try:
        medals_total.append(athlete.find_all('td')[8].text)
    except:
        pass

# medals_total

### Gold medals for paralympic athletes 

In [13]:
para_info = para_doc.find_all('tr')
medals_gold = []

for athlete in para_info:
    try:
        medals_gold.append(athlete.find_all('td')[5].text)
    except:
        pass

# medals_gold

### Silver medals for paralympic athletes 

In [14]:
para_info = para_doc.find_all('tr')
medals_silver = []

for athlete in para_info:
    try:
        medals_silver.append(athlete.find_all('td')[6].text)
    except:
        pass

# medals_silver

### Bronze medals for paralympic athletes 

In [15]:
para_info = para_doc.find_all('tr')
medals_bronze = []

for athlete in para_info:
    try:
        medals_bronze.append(athlete.find_all('td')[7].text)
    except:
        pass

# medals_bronze

### Assign gender for para athletes

In [16]:
para_info = para_doc.find_all('tr')
gender = []

for athlete in para_info:
    try:
        gender.append(athlete.find_previous('h2').text.strip())
    except:
        pass

# gender

### Put all para info into one list of dictionaries, then into a dataframe

In [17]:
para_info = para_doc.find_all('tr')
rows = []

for athlete in para_info:
    row = {}
    try:
        full_name = athlete.find_all('td')[1].string
        row['full_name'] = full_name
        row['last_name'] = re.findall(r'([A-Z]+[ -]?[A-Z]{2,}).*', full_name)[0]
        row['first_name'] = re.findall(r'[A-Z]+ (.*)', full_name)[0]
        
        row['medals_total'] = athlete.find_all('td')[8].text.strip()
        row['medals_gold'] = athlete.find_all('td')[5].text.strip()
        row['medals_silver'] = athlete.find_all('td')[6].text.strip()
        row['medals_bronze'] = athlete.find_all('td')[7].text.strip()
        
        row['gender'] = athlete.find_previous('h2').text.strip()
        row['other_info'] = ''
        row['alternate_name'] = ''
        
        years = athlete.find_all('td')[4].string
        row['years'] = years

        citizenship = athlete.find_all('td')[3]
        no_citizenship = len(citizenship.find_all('a'))    
        if no_citizenship == 1:
            row['citizenship'] = citizenship.text.strip()

    ## Check for multinationals
        elif no_citizenship == 2:
            if 'NPA' in citizenship.text.strip():
                row['citizenship'] = citizenship.text[:3]
            elif 'CZE' in citizenship.text.strip():
                row['citizenship'] = citizenship.text[:3]
            elif 'GER' in citizenship.text.strip():
                row['citizenship'] = citizenship.text[:3]
        else:
            row['citizenship'] = 'Not single or dual citizenship, or there is some other problem'
    except:
        pass
    rows.append(row)

rows

[{},
 {'full_name': 'SCHOENFELDER Gerd',
  'last_name': 'SCHOENFELDER',
  'first_name': 'Gerd',
  'medals_total': '22',
  'medals_gold': '16',
  'medals_silver': '4',
  'medals_bronze': '2',
  'gender': 'Men',
  'other_info': '',
  'alternate_name': '',
  'years': '1992-2010',
  'citizenship': 'GER'},
 {'full_name': 'LUNDSTROEM Knut',
  'last_name': 'LUNDSTROEM',
  'first_name': 'Knut',
  'medals_total': '21',
  'medals_gold': '14',
  'medals_silver': '5',
  'medals_bronze': '2',
  'gender': 'Men',
  'other_info': '',
  'alternate_name': '',
  'years': '1988-1998',
  'citizenship': 'NOR'},
 {'full_name': 'HOEFLE Frank',
  'last_name': 'HOEFLE',
  'first_name': 'Frank',
  'medals_total': '21',
  'medals_gold': '13',
  'medals_silver': '5',
  'medals_bronze': '3',
  'gender': 'Men',
  'other_info': '',
  'alternate_name': '',
  'years': '1988-2006',
  'citizenship': 'FRG'},
 {'full_name': 'MCKEEVER Brian',
  'last_name': 'MCKEEVER',
  'first_name': 'Brian',
  'medals_total': '17',
  'med

In [18]:
# Remove empty rows
rows = list(filter(None,rows))

In [19]:
df_winter = pd.DataFrame(rows)
df_winter

Unnamed: 0,alternate_name,citizenship,first_name,full_name,gender,last_name,medals_bronze,medals_gold,medals_silver,medals_total,other_info,years
0,,GER,Gerd,SCHOENFELDER Gerd,Men,SCHOENFELDER,2,16,4,22,,1992-2010
1,,NOR,Knut,LUNDSTROEM Knut,Men,LUNDSTROEM,2,14,5,21,,1988-1998
2,,FRG,Frank,HOEFLE Frank,Men,HOEFLE,3,13,5,21,,1988-2006
3,,CAN,Brian,MCKEEVER Brian,Men,MCKEEVER,2,13,2,17,,2002-2018
4,,SUI,Rolf,HEINZMANN Rolf,Men,HEINZMANN,0,12,2,14,,1980-2002
5,,FIN,Jouko,GRIP Jouko,Men,GRIP,0,10,5,15,,1980-1994
6,,NOR,Terje,LOEVAAS Terje,Men,LOEVAAS,0,10,3,13,,1980-1994
7,,GER,Martin,BRAXENTHALER Martin,Men,BRAXENTHALER,1,10,1,12,,1998-2010
8,,NOR,Hans Anton,AALIEN Hans Anton,Men,AALIEN,1,7,1,9,,1980-1988
9,,NOR,Cato Zahl,PEDERSEN Cato Zahl,Men,PEDERSEN,0,7,1,8,,1980-1994


In [20]:
df_winter.to_csv('para_winter.csv', index=False)

## The sport event is missing from this website!
* Add events for every athlete in a new text file

In [21]:
event = pd.read_csv('para_winter_events.txt', sep=',')
# event

In [22]:
merged = df_winter.merge(event, left_on='full_name', right_on='name')
merged = merged.drop(columns = 'name')

In [23]:
merged.tail(3)

Unnamed: 0,alternate_name,citizenship,first_name,full_name,gender,last_name,medals_bronze,medals_gold,medals_silver,medals_total,other_info,years,event
36,,CZE,Katerina,TEPLA Katerina,Women,TEPLA,0,5,4,9,,1992-2002,Para alpine skiing
37,,FRG,Annemie,SCHNEIDER Annemie,Women,SCHNEIDER,2,5,1,8,,1976-1994,Para alpine skiing
38,,AUT,Danja,HASLACHER Danja,Women,HASLACHER,1,5,0,6,,1998-2006,Para alpine skiing


In [24]:
merged['game_type'] = 'Paralympic'
merged['season'] = 'Winter'

## Clean full_name for Paralympic athletes so that it's formatted as first_name LAST_NAME

In [25]:
merged['full_name'] = merged.first_name + ' ' + merged.last_name
merged

Unnamed: 0,alternate_name,citizenship,first_name,full_name,gender,last_name,medals_bronze,medals_gold,medals_silver,medals_total,other_info,years,event,game_type,season
0,,GER,Gerd,Gerd SCHOENFELDER,Men,SCHOENFELDER,2,16,4,22,,1992-2010,Para alpine skiing,Paralympic,Winter
1,,NOR,Knut,Knut LUNDSTROEM,Men,LUNDSTROEM,2,14,5,21,,1988-1998,Para cross-country skiing | ice sledge speed r...,Paralympic,Winter
2,,FRG,Frank,Frank HOEFLE,Men,HOEFLE,3,13,5,21,,1988-2006,Para biathlon | cross-country skiing | cycling...,Paralympic,Winter
3,,CAN,Brian,Brian MCKEEVER,Men,MCKEEVER,2,13,2,17,,2002-2018,Para biathlon | cross-country skiing,Paralympic,Winter
4,,SUI,Rolf,Rolf HEINZMANN,Men,HEINZMANN,0,12,2,14,,1980-2002,Para alpine skiing,Paralympic,Winter
5,,FIN,Jouko,Jouko GRIP,Men,GRIP,0,10,5,15,,1980-1994,Para biathlon | cross-country skiing | athleti...,Paralympic,Winter
6,,NOR,Terje,Terje LOEVAAS,Men,LOEVAAS,0,10,3,13,,1980-1994,Para cross-country skiing,Paralympic,Winter
7,,GER,Martin,Martin BRAXENTHALER,Men,BRAXENTHALER,1,10,1,12,,1998-2010,Para alpine skiing,Paralympic,Winter
8,,NOR,Hans Anton,Hans Anton AALIEN,Men,AALIEN,1,7,1,9,,1980-1988,Para cross-country skiing | swimming (Summer O...,Paralympic,Winter
9,,NOR,Cato Zahl,Cato Zahl PEDERSEN,Men,PEDERSEN,0,7,1,8,,1980-1994,Para alpine skiing | cross-country skiing | at...,Paralympic,Winter


In [26]:
merged.to_csv('para_winter_complete.csv', index=False)