In [85]:
## Importing Necessary Modules 

from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
import csv

In [167]:
## Fetching the appropriate web page and parsing its HTML
### Defining a function which takes in a url and outputs soup
def get_soup(url, timeout = 30):
    """Download *url* and return its body parsed with ``html.parser``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without one, a stalled connection would block the
            caller indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response content.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (connection failures, timeouts, ...).

    The HTTP status code is deliberately not checked: error pages are parsed
    like any other page, and the downstream helpers report missing data
    through their own 'Error' values.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')


#format url based off the race 
def format_url(race = 'tour-de-france', year = 2023, stage = None):
    """Build the procyclingstats result URL for a race edition or one stage.

    Args:
        race: Race slug as it appears in the site's URLs.
        year: Edition of the race.
        stage: Stage identifier; when falsy (``None``, ``0``, ``''``) the
            URL of the overall result page is returned instead.

    Returns:
        The result-page URL as a string.
    """
    segments = ['https://www.procyclingstats.com/race', f'{race}', f'{year}']

    # The stage segment is only present for stage results.
    if stage:
        segments.append(f'stage-{stage}')

    segments.append('result')
    return '/'.join(segments)


'''
For looking at all time gaps; largely obsolete now.
'''
def extract_result(url, max_rows = 10):
    """Scrape the first rows of the results table at *url* into a DataFrame.

    The first ``<tr>`` on the page supplies the column names (the text of its
    ``<th>`` cells, unstripped); the following rows supply the data (the
    stripped text of their ``<td>`` cells).

    Args:
        url: Page to download and parse.
        max_rows: Maximum number of ``<tr>`` elements after the header row to
            inspect. ``None`` means "all of them". Defaults to 10.

    Returns:
        A ``pandas.DataFrame`` with one column per header cell. Rows whose
        number of ``<td>`` cells differs from the number of header cells are
        skipped. If the page contains no ``<tr>`` at all, an empty DataFrame
        is returned instead of raising ``IndexError``.
    """
    soup = get_soup(url)
    rows = soup.find_all('tr')

    # Pages without any table row (error pages, unexpected layouts) would
    # otherwise fail on rows[0].
    if not rows:
        return pd.DataFrame()

    headers = [th.text for th in rows[0].find_all('th')]
    columns = [[] for _ in headers]

    body_rows = rows[1:] if max_rows is None else rows[1:1 + max_rows]
    for body_row in body_rows:
        cells = body_row.find_all('td')

        # Only rows that line up with the header can be stored column-wise.
        if len(cells) != len(columns):
            continue

        for column, cell in zip(columns, cells):
            column.append(cell.text.strip())

    # NOTE: as in the original, duplicated header texts collapse into a
    # single column (the right-most one wins).
    return pd.DataFrame(dict(zip(headers, columns)))

def get_race_info(soup):
    """Collect the key/value facts from the page's ``ul.infolist`` element.

    Every ``<li>`` of the list is expected to hold at least two ``<div>``
    elements: the first one is the label, the second one the value.

    Args:
        soup: Parsed page (anything offering BeautifulSoup's ``find`` /
            ``find_all`` interface).

    Returns:
        A dict mapping each label (stripped, with ':' removed) to the raw
        text of its value. The four entries the rest of this file relies on
        ('ProfileScore', 'Distance', 'Vertical meters',
        'Startlist quality score') are always present; when the page does
        not provide one of them - or has no info list at all - its value is
        the string 'Error'.
    """
    expected_keys = (
        'ProfileScore',
        'Distance',
        'Vertical meters',
        'Startlist quality score',
    )

    results = {}

    info_list = soup.find('ul', class_='infolist')
    if info_list is not None:
        for item in info_list.find_all('li'):
            divs = item.find_all('div')

            # Skip entries that do not follow the label/value layout instead
            # of failing on divs[1].
            if len(divs) < 2:
                continue

            key = divs[0].text.strip().replace(':', '')
            results[key] = divs[1].text

    # Guarantee the keys callers index directly, using the same 'Error'
    # marker that is used when the whole list is missing.
    for key in expected_keys:
        results.setdefault(key, 'Error')

    return results

def get_time_gap(soup):
    """Return the time gap, in seconds, shown for the second-placed rider.

    The 'Time' column is located through the ``<th>`` cells of the first
    ``<tr>`` on the page; the value is read from the same position in the
    third ``<tr>`` (index 2).

    Args:
        soup: Parsed page (anything offering BeautifulSoup's ``find_all``).

    Returns:
        * ``0`` (int) when the cell text contains a comma, which this file
          treats as "first and second finished together";
        * ``np.float32`` seconds when the text looks like ``M:SS``;
        * the string ``'Error'`` when the page has fewer than three rows, no
          'Time' column, or a cell that cannot be parsed.

    NOTE(review): the original computed ``float(seconds_field) / 10``. That
    is only right if the seconds field is two digits followed by a '0' -
    presumably because the cell text holds the gap twice (e.g. '0:060:06');
    confirm against the live page. With such text any gap of a minute or
    more was wrong ('1:231:23' gave 83.1), and a plain '0:06' gave 0.6.
    Reading just the first two characters of the seconds field is correct
    for both shapes. Gaps written as H:MM:SS are still not supported.
    """
    time_gap = 'Error'

    rows = soup.find_all('tr')
    if len(rows) > 2:  # avoid pages without a usable results table
        # Index over the <th> cells (as extract_result does) rather than
        # over the raw children of the row, so that whitespace nodes cannot
        # shift the column position.
        header_cells = rows[0].find_all('th')
        second_place = rows[2].find_all('td')

        for i, header in enumerate(header_cells):
            if header.text == 'Time' and i < len(second_place):
                time_gap = second_place[i].text

    # A comma in the cell means first and second finished together.
    if ',' in time_gap:
        return 0

    fields = time_gap.strip().split(':')
    if len(fields) < 2:
        return 'Error'

    try:
        minutes = float(fields[0])
        seconds = float(fields[1].strip()[:2])
    except ValueError:
        # Non-numeric text: report it with the same marker as the other
        # failure cases instead of raising.
        return 'Error'

    return np.float32(minutes * 60 + seconds)

'''
The key function for data collection 
'''
def _parse_metric(info, key):
    """Return the leading number of ``info[key]`` as ``np.float32``, else 'Error'."""
    raw = info.get(key, 'Error')
    if raw == 'Error':
        return 'Error'

    # Keep only the first whitespace-separated token so that values carrying
    # a unit (e.g. '180 km') or surrounding whitespace still parse.
    tokens = str(raw).split()
    if not tokens:
        return 'Error'

    try:
        return np.float32(float(tokens[0]))  # float32 for efficiency
    except ValueError:
        return 'Error'


def get_data_instance(race = 'tour-de-france', year = 2023, stage = None, manual_url = None):
    """Scrape one result page and return its features as a flat tuple.

    Args:
        race: Race slug used to build the URL; returned unchanged.
        year: Edition used to build the URL; returned unchanged.
        stage: Stage identifier, or ``None`` for the overall result. ``None``
            is reported as the string 'None' in the returned tuple.
        manual_url: When given, this URL is fetched instead of the one built
            from *race*, *year* and *stage*.

    Returns:
        ``(race, year, stage, distance, vert_meters, profile_score,
        startlist_qual, time_gap)``. The four metrics are ``np.float32``
        values, or the string 'Error' when the page does not provide a
        parsable value (missing key, empty text, non-numeric text).
        ``time_gap`` is whatever ``get_time_gap`` returns for the page.
    """
    # get appropriate link
    if manual_url is not None:
        url = manual_url
    else:
        url = format_url(race, year, stage)

    soup = get_soup(url)
    info = get_race_info(soup)

    stage = stage if stage is not None else 'None'

    profile_score = _parse_metric(info, 'ProfileScore')
    distance = _parse_metric(info, 'Distance')
    vert_meters = _parse_metric(info, 'Vertical meters')
    startlist_qual = _parse_metric(info, 'Startlist quality score')

    time_gap = get_time_gap(soup)

    return (race, year, stage, distance, vert_meters, profile_score, startlist_qual, time_gap)
    
