In [85]:
## Importing Necessary Modules 

import csv
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [72]:
## Getting the appropriate website parsed into HTML
### Defining a function which takes in a url and outputs soup
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds ``requests`` waits on the server before raising
        ``requests.exceptions.Timeout``. ``requests`` applies no timeout
        by default, so without one a stalled connection would block the
        caller forever.

    Returns
    -------
    bs4.BeautifulSoup
        The response content parsed with Python's built-in ``html.parser``.

    Notes
    -----
    The HTTP status code is not inspected: an error page is parsed and
    returned like any other page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup


# Build the results URL from the race, the year and (optionally) the stage
def format_url(race = 'tour-de-france', year = 2023, stage = None):
    """Return the procyclingstats.com results URL for a race edition or stage.

    With a truthy ``stage`` the URL ends in ``/stage-<stage>/result``;
    with a falsy one (``None``, ``0``, ``''``) it ends in ``/result``.
    """
    # Only the tail of the path depends on whether a stage was requested.
    tail = '/stage-{}/result'.format(stage) if stage else '/result'
    return 'https://www.procyclingstats.com/race/{}/{}{}'.format(race, year, tail)


def extract_result(url, max_rows=10):
    """Scrape the leading table rows of the page at ``url`` into a DataFrame.

    Every ``<tr>`` on the page is collected in document order. The first one
    supplies the column names (its ``<th>`` cells); the following rows supply
    the data (their ``<td>`` cells, whitespace-stripped).

    Parameters
    ----------
    url : str
        Page to download via ``get_soup``.
    max_rows : int or None, optional
        How many rows after the header row to consider (default 10).
        ``None`` considers all of them.

    Returns
    -------
    pandas.DataFrame
        One column per header, holding strings. Rows whose number of
        ``<td>`` cells differs from the number of headers are left out.
        An empty frame is returned when the page has no ``<tr>`` at all.
    """
    soup = get_soup(url)

    rows = soup.find_all('tr')
    if not rows:
        # Nothing to tabulate; avoids an IndexError on rows[0].
        return pd.DataFrame()

    headers = [header.text for header in rows[0].find_all('th')]
    columns = [[] for _ in headers]

    candidate_rows = rows[1:] if max_rows is None else rows[1:1 + max_rows]
    for data_row in candidate_rows:
        cells = data_row.find_all('td')
        # A row that does not line up with the header is skipped as a whole.
        if len(cells) != len(columns):
            continue
        for column, cell in zip(columns, cells):
            column.append(cell.text.strip())

    # If two headers share a name, the later column replaces the earlier one.
    return pd.DataFrame(dict(zip(headers, columns)))


def get_race_info(soup):
    """Read the ``<ul class="infolist">`` block of a page into a dict.

    Each ``<li>`` of the list is expected to hold at least two ``<div>``
    elements: the first is the label, the second the value.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed page.

    Returns
    -------
    dict
        Maps each label (stripped, with ':' characters removed) to the raw
        text of its value ``<div>``; the value is not stripped. Returns an
        empty dict when the page has no ``infolist``. List items with fewer
        than two ``<div>`` children are ignored.
    """
    info_list = soup.find('ul', class_='infolist')
    if info_list is None:
        # Page without the info block: report "no info" instead of failing
        # with an AttributeError on None.
        return {}

    results = {}
    for point in info_list.find_all('li'):
        divs = point.find_all('div')
        if len(divs) < 2:
            # No label/value pair in this item.
            continue

        key = divs[0].text.strip().replace(':', '')
        results[key] = divs[1].text

    return results

def get_time_gap(soup):
    """Return the time gap, in seconds, shown for the second-placed rider.

    The first ``<tr>`` of the page is treated as the header row and the
    third ``<tr>`` as the runner-up's row. The cell sitting under the
    'Time' header of that row is parsed.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed result page.

    Returns
    -------
    int, numpy.float32 or str
        * ``0`` when the cell contains a comma (the first two riders are
          shown with the same time);
        * the gap in seconds as ``np.float32`` when the cell contains a
          ``m:ss`` or ``h:mm:ss`` time;
        * the string ``'Error'`` when the page has fewer than three rows,
          has no 'Time' column, or the cell holds no recognisable time.
    """
    rows = soup.find_all('tr')
    if len(rows) < 3:
        return 'Error'

    # Index the header by its <th> cells so positions line up with the
    # <td> cells below; iterating the raw children of the <tr> would also
    # count any text nodes between the cells.
    header_cells = rows[0].find_all('th')
    runner_up_cells = rows[2].find_all('td')

    time_gap = None
    for i, header in enumerate(header_cells):
        if header.text.strip() == 'Time' and i < len(runner_up_cells):
            time_gap = runner_up_cells[i].text
    if time_gap is None:
        return 'Error'

    # If first and second finish together, return 0.
    if ',' in time_gap:
        return 0

    # Take the first [h:]m:ss found in the cell text.
    # NOTE(review): the previous code computed float(secs) / 10, which is
    # only right when the minutes are zero and the cell text is the gap
    # printed twice back-to-back (e.g. '0:050:05'); presumably that is what
    # the site emits -- confirm. Reading the first time also handles that
    # doubled text, and stays correct for gaps of a minute or more.
    found = re.search(r'(?:(\d+):)?(\d{1,2}):(\d{2})', time_gap)
    if found is None:
        return 'Error'

    hours, mins, secs = found.groups()
    total = int(hours or 0) * 3600 + int(mins) * 60 + int(secs)
    return np.float32(total)


In [75]:
def _info_as_float32(info, key, first_token=False):
    """Read ``info[key]`` as ``np.float32``; NaN when missing or not numeric.

    With ``first_token=True`` only the text before the first space is
    converted (e.g. a number followed by a unit).
    """
    text = info.get(key, '')
    if first_token:
        text = text.split(' ')[0]
    try:
        return np.float32(float(text))
    except ValueError:
        # Blank or non-numeric field: keep the row, mark the value missing.
        return np.float32(np.nan)


def get_data_instance(race = 'tour-de-france', year = 2023, stage = None, manual_url = None):
    """Scrape one result page and return its features as a tuple.

    Parameters
    ----------
    race, year, stage
        Identify the page via ``format_url``; they are also echoed back in
        the returned tuple. A ``stage`` of ``None`` is reported as the
        string ``'None'``.
    manual_url : str, optional
        When given, this URL is fetched instead of the one built from
        ``race``/``year``/``stage``.

    Returns
    -------
    tuple
        ``(race, year, stage, distance, vert_meters, profile_score,
        startlist_qual, time_gap)``. The four numeric fields come from the
        page's info list ('Distance', 'Vertical meters', 'ProfileScore',
        'Startlist quality score') as ``np.float32``; a field that is
        absent or not numeric is NaN rather than an exception, so one
        incomplete page does not abort a long scraping loop. ``time_gap``
        is whatever ``get_time_gap`` returns.
    """
    # Get the appropriate link.
    if manual_url is not None:
        url = manual_url
    else:
        url = format_url(race, year, stage)

    soup = get_soup(url)
    info = get_race_info(soup)

    stage = stage if stage is not None else 'None'

    # float32 for efficiency
    profile_score = _info_as_float32(info, 'ProfileScore')
    # Only the leading token of the distance is numeric.
    distance = _info_as_float32(info, 'Distance', first_token=True)
    vert_meters = _info_as_float32(info, 'Vertical meters')
    startlist_qual = _info_as_float32(info, 'Startlist quality score')

    time_gap = get_time_gap(soup)

    return (race, year, stage, distance, vert_meters, profile_score, startlist_qual, time_gap)
    


In [77]:
# Spot-check a single page: Tour de France 1986, stage 2.
get_data_instance('tour-de-france', 1986, 2)

('tour-de-france', 1986, 2, 56.0, 595.0, 26.0, 1872.0, 'Error')

In [81]:
'''
Tour de France 2000 - 2023 (this cell scrapes 2000 - 2022).
Data on stages from earlier years does exist and we may want to get it later.
It is a bit of a pain as there are sometimes more than 21 stages and some stages
are split into 'stage1a' and 'stage1b' etc.
'''

# Each entry pairs a span of years with the stage numbers requested for them.
scrape_plan = (
    (range(2000, 2013), range(1, 21)),
    (range(2013, 2023), range(1, 22)),
)

data1 = []
for plan_years, plan_stages in scrape_plan:
    for year in plan_years:
        for stage in plan_stages:
            # Progress echo: the year, then the stage, each on its own line.
            print(year)
            print(stage)
            data1.append(get_data_instance(race = 'tour-de-france', year = year, stage = stage))
        


2000
1
2000
2
2000
3
2000
4
2000
5
2000
6
2000
7
2000
8
2000
9
2000
10
2000
11
2000
12
2000
13
2000
14
2000
15
2000
16
2000
17
2000
18
2000
19
2000
20
2001
1
2001
2
2001
3
2001
4
2001
5
2001
6
2001
7
2001
8
2001
9
2001
10
2001
11
2001
12
2001
13
2001
14
2001
15
2001
16
2001
17
2001
18
2001
19
2001
20
2002
1
2002
2
2002
3
2002
4
2002
5
2002
6
2002
7
2002
8
2002
9
2002
10
2002
11
2002
12
2002
13
2002
14
2002
15
2002
16
2002
17
2002
18
2002
19
2002
20
2003
1
2003
2
2003
3
2003
4
2003
5
2003
6
2003
7
2003
8
2003
9
2003
10
2003
11
2003
12
2003
13
2003
14
2003
15
2003
16
2003
17
2003
18
2003
19
2003
20
2004
1
2004
2
2004
3
2004
4
2004
5
2004
6
2004
7
2004
8
2004
9
2004
10
2004
11
2004
12
2004
13
2004
14
2004
15
2004
16
2004
17
2004
18
2004
19
2004
20
2005
1
2005
2
2005
3
2005
4
2005
5
2005
6
2005
7
2005
8
2005
9
2005
10
2005
11
2005
12
2005
13
2005
14
2005
15
2005
16
2005
17
2005
18
2005
19
2005
20
2006
1
2006
2
2006
3
2006
4
2006
5
2006
6
2006
7
2006
8
2006
9
2006
10
2006
11
2006
12
2006
13

In [82]:
# Add stages 1-21 of the 2023 edition to the rows collected so far.
data1.extend(
    get_data_instance(race = 'tour-de-france', year = 2023, stage = stage_number)
    for stage_number in range(1, 22)
)

In [86]:
# Column names for the CSV, one per element of each tuple in data1.
headers = [
    "race",
    "year",
    "stage",
    "distance",
    "vertical meters",
    "profile score",
    "startlist quality",
    "time gap",
]

# newline='' leaves line-ending handling to the csv module.
with open('tour-de-france2000-2023.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    # Header line first, then one line per scraped stage.
    writer.writerows([headers] + data1)
