In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests

In [2]:
import time
import datetime

# helper function for converting to raw seconds
def convert_time(t):
    """Convert a swim-time string into raw seconds.

    Parameters
    ----------
    t : str
        A time such as ``"1:25.46"`` (minutes:seconds) or ``"21.53"``
        (seconds only).

    Returns
    -------
    float or str
        Total seconds rounded to two decimals. Text without a colon that is
        not numeric is returned unchanged, as before.

    Raises
    ------
    ValueError
        If a colon-separated field is not numeric.
    """
    if ':' in t:
        # Fold left to right so every field is honoured; previously anything
        # after a second colon was silently ignored.
        seconds = 0.0
        for field in t.split(':'):
            seconds = seconds * 60 + float(field)
        return round(seconds, 2)

    # Seconds-only times used to come back as the original string, leaving a
    # mix of floats and strings in the resulting column.
    try:
        return round(float(t), 2)
    except ValueError:
        return t

In [3]:
# Site root; get_events prepends it to the event hrefs it scrapes.
base_url = "https://www.collegeswimming.com"

# Results-listing URL (page 1). Not referenced by the other cells visible
# here, which read saved copies of the listing pages from disk instead.
# NOTE(review): region=conference_12 is presumably the Ivy League - confirm.
meets_url = (
    base_url
    + "/results/?interval=all&meetType=120&orderBy=latest"
    + "&page=1&period=past&region=conference_12"
)

In [4]:
# Opening the downloaded HTML pages.
# Each file is parsed inside a `with` block so its handle is closed as soon
# as BeautifulSoup has read it, instead of being left open.
with open('IvyResultsPage1.html') as page_file:
    meets_page_1 = bs(page_file, 'html.parser')
with open('IvyResultsPage2.html') as page_file:
    meets_page_2 = bs(page_file, 'html.parser')

In [5]:
# Grabbing meet names and links to meets' results
def _collect_meets(grid):
    """Return one info dict per Ivy/EISL meet found in a results grid.

    Parameters
    ----------
    grid : bs4.element.Tag
        The ``<section class="c-list-grid">`` element of a listing page.

    Returns
    -------
    list of dict
        Dicts with keys ``name``, ``link``, ``year`` and ``location`` for
        every entry whose title contains "Ivy" or "EISL".
    """
    found = []
    for link in grid.find_all('a', {'class': "c-list-grid__item"}):
        lis = link.find_all('li')
        # The year is the third space-separated token of the second <li>.
        year = lis[1].text.split(' ')[2]
        location = lis[2].text.title()

        m_name = link.find('h3', {'class': 'c-list-grid__title'}).text
        if "Ivy" in m_name or "EISL" in m_name:
            found.append({'name': m_name.title(), 'link': link.get('href'),
                          'year': year, 'location': location})
    return found


meets1 = meets_page_1.find('section', {'class': 'c-list-grid'})
meets2 = meets_page_2.find('section', {'class': 'c-list-grid'})

# Page 1 entries first, then page 2, matching the original ordering.
meets_info = _collect_meets(meets1) + _collect_meets(meets2)

In [6]:
def get_events(meet_link, timeout=30):
    """Fetch a meet page and list its swimming events.

    Parameters
    ----------
    meet_link : str
        URL of the meet page.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the scrape indefinitely.

    Returns
    -------
    list of dict
        One ``{'event': <title>, 'link': <absolute URL>}`` per kept event.

    Raises
    ------
    requests.HTTPError
        If the meet page responds with an error status; previously an error
        page was parsed and silently produced an empty list.
    """
    resp = requests.get(meet_link, timeout=timeout)
    resp.raise_for_status()
    soup = bs(resp.text, 'html.parser')

    events = []
    for li in soup.find_all('li', {'class': 'js-event-item'}):
        # Only events numbered below 60 are kept; the original author's note
        # says higher numbers are time trials. Event 60 itself is dropped.
        number = li.find('div', {'class': 'o-media__img u-color-success u-width-30'}).text
        if int(number) >= 60:
            continue

        event = li.find('div', {'class': 'o-media__body u-text-truncate'}).text
        if 'Diving' in event:
            continue

        events.append({'event': event, 'link': base_url + li.a.get('href')})
    return events

In [7]:
def get_relay_results(e, timeout=30):
    """Scrape one relay event page into a DataFrame.

    Parameters
    ----------
    e : dict
        Event descriptor with keys ``'event'`` (title) and ``'link'`` (URL),
        as produced by ``get_events``.
    timeout : float, optional
        Seconds passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the first ``<tbody>``, with columns
        ``event, place, team, name, time, is_prelims``. ``name`` repeats the
        team and ``is_prelims`` is always False. Empty (no columns) when the
        table has no rows.

    Raises
    ------
    requests.HTTPError
        If the event page responds with an error status.
    """
    event = e['event']
    e_resp = requests.get(e['link'], timeout=timeout)
    e_resp.raise_for_status()
    e_soup = bs(e_resp.text, 'html.parser')
    table = e_soup.find('tbody')

    # Rows are collected in a list and the frame is built once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    records = []
    for row in table.find_all('tr', recursive=False):

        # Accounting for DQ's: a row lacking the time or place element gets
        # None for both. The exception is not bound to a name, so the
        # parameter `e` is no longer clobbered.
        try:
            swim_time = convert_time(row.find('abbr').text.strip())
            place = row.find('td', {'class': 'c-table-clean__col-fit u-text-center u-pr0'}).text.strip()
        except AttributeError:
            swim_time = None
            place = None
        team = row.find('span', {'class': 'hidden-xs'}).text

        records.append({'event': event, 'place': place,
                        'team': team, 'name': team,
                        "time": swim_time, 'is_prelims': False})

    return pd.DataFrame(records)

In [8]:
def get_individual_results(e, timeout=30):
    """Scrape one individual event page into a DataFrame.

    Parameters
    ----------
    e : dict
        Event descriptor with keys ``'event'`` (title) and ``'link'`` (URL),
        as produced by ``get_events``.
    timeout : float, optional
        Seconds passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        One row per result row across every ``<tbody>`` in the results card,
        with columns ``event, name, place, team, time, is_prelims``. Empty
        (no columns) when no rows are found.

    Raises
    ------
    requests.HTTPError
        If the event page responds with an error status.
    """
    event = e['event']
    e_resp = requests.get(e['link'], timeout=timeout)
    e_resp.raise_for_status()
    e_soup = bs(e_resp.text, 'html.parser')
    section = e_soup.find('div', {'class': 'c-card c-card--large'})
    tables = section.find_all('tbody')  # A-Final, B-Final, C-Final, Prelims

    # Rows are collected in a list and the frame is built once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    records = []
    last_index = len(tables) - 1
    for i, t in enumerate(tables):
        # The last table on the page is taken to be the prelims.
        # NOTE(review): a page with a single table is therefore flagged as
        # prelims - confirm that is intended for finals-only events.
        is_prelims = (i == last_index)

        for row in t.find_all('tr', recursive=False):
            # Accounting for DQ's: a row lacking the time or place element
            # gets None for both.
            try:
                swim_time = convert_time(row.find('abbr').text.strip())
                place = row.find('td', {'class': 'c-table-clean__col-fit u-text-center u-pr0'}).text.strip()
            except AttributeError:
                swim_time = None
                place = None
            name = row.find('a', {'class': 'bold'}).text
            team = row.find('span', {'class': 'hidden-xs'}).text

            records.append({'event': event, 'name': name,
                            'place': place, 'team': team,
                            "time": swim_time, 'is_prelims': is_prelims})

    return pd.DataFrame(records)

In [9]:
import time

# Loop through all meets and get results
def get_all_results():
    """Scrape every meet in ``meets_info`` and combine the results.

    For each meet, every event from ``get_events`` is fetched with
    ``get_relay_results`` (title contains "Relay") or
    ``get_individual_results`` (everything else). The meet's ``name``,
    ``year`` and ``location`` are attached as the columns ``meet``, ``year``
    and ``location``. One progress line is printed per meet.

    Returns
    -------
    pandas.DataFrame
        All meets stacked with a fresh 0..n-1 index; an empty frame when
        ``meets_info`` is empty.
    """
    meet_frames = []
    for meet in meets_info:

        start = time.time()

        # Frames are gathered in a list and concatenated once:
        # DataFrame.append was removed in pandas 2.0.
        event_frames = []
        for e in get_events(meet['link']):
            if "Relay" in e['event']:
                event_frames.append(get_relay_results(e))
            else:
                event_frames.append(get_individual_results(e))

        # pd.concat raises on an empty list, so a meet with no events falls
        # back to an empty frame as it did before.
        if event_frames:
            dfm = pd.concat(event_frames, ignore_index=True)
        else:
            dfm = pd.DataFrame()

        dfm['meet'] = meet['name']
        dfm['year'] = meet['year']
        dfm['location'] = meet['location']
        meet_frames.append(dfm)

        # updating the user
        print("Finished grabbing {} {} in {} seconds".format(meet['year'],
                                                             meet['name'],
                                                             round(time.time() - start, 2)))

    if not meet_frames:
        return pd.DataFrame()
    return pd.concat(meet_frames, ignore_index=True)

In [10]:
# Run the full scrape over every meet in meets_info; get_all_results prints
# one progress line per meet as it goes.
df = get_all_results()

Finished grabbing 2020 Ivy League Championships (M) in 9.34 seconds
Finished grabbing 2020 Ivy League Championships (W) in 9.74 seconds
Finished grabbing 2019 Ivy League Championships (M) in 9.68 seconds
Finished grabbing 2019 Ivy League Championships (W) in 9.75 seconds
Finished grabbing 2018 Ivy League Championships (M) in 9.11 seconds
Finished grabbing 2018 Ivy League Championships (W) in 10.11 seconds
Finished grabbing 2017 Ivy League Championships (M) in 9.88 seconds
Finished grabbing 2017 Ivy League Championships (W) in 19.65 seconds
Finished grabbing 2016 Ivy League Championships (M) in 11.64 seconds
Finished grabbing 2016 Ivy League Championships (W) in 11.27 seconds
Finished grabbing 2015 Ivy League Championships (M) in 11.39 seconds
Finished grabbing 2015 Ivy League Championships (W) in 11.65 seconds
Finished grabbing 2014 Ivy League Championships (M) in 11.59 seconds
Finished grabbing 2014 Ivy League Championships (W) in 11.7 seconds
Finished grabbing 2013 Ivy League Mens Ch

In [11]:
# (rows, columns) of the combined results frame.
df.shape

(20473, 9)

In [13]:
# First rows of the combined results frame.
df.head()

Unnamed: 0,event,place,team,name,time,is_prelims,meet,year,location
0,200 Medley Relay Men,1,Columbia,Columbia,85.46,False,Ivy League Championships (M),2020,"Boston, Ma"
1,200 Medley Relay Men,2,Harvard,Harvard,86.34,False,Ivy League Championships (M),2020,"Boston, Ma"
2,200 Medley Relay Men,3,Cornell,Cornell,86.64,False,Ivy League Championships (M),2020,"Boston, Ma"
3,200 Medley Relay Men,4,Penn,Penn,87.42,False,Ivy League Championships (M),2020,"Boston, Ma"
4,200 Medley Relay Men,5,Brown,Brown,87.53,False,Ivy League Championships (M),2020,"Boston, Ma"


In [14]:
# Last rows of the combined results frame.
df.tail()

Unnamed: 0,event,place,team,name,time,is_prelims,meet,year,location
20468,400 Free Relay Women,4,Harvard,Harvard,206.63,False,Ivy League Women'S Championships,2001,"Denunzio Pool, Princ, Nj"
20469,400 Free Relay Women,5,Columbia,Columbia,208.79,False,Ivy League Women'S Championships,2001,"Denunzio Pool, Princ, Nj"
20470,400 Free Relay Women,6,Penn,Penn,213.88,False,Ivy League Women'S Championships,2001,"Denunzio Pool, Princ, Nj"
20471,400 Free Relay Women,7,Dartmouth,Dartmouth,213.94,False,Ivy League Women'S Championships,2001,"Denunzio Pool, Princ, Nj"
20472,400 Free Relay Women,8,Cornell,Cornell,215.56,False,Ivy League Women'S Championships,2001,"Denunzio Pool, Princ, Nj"


In [12]:
# Write the combined results to IvyData.csv; index=False leaves the row
# index out of the file.
df.to_csv('IvyData.csv', index=False)

In [25]:
# Print every distinct value of the 'event' column, one per line.
distinct_events = df['event'].unique()
for event_name in distinct_events:
    print(event_name)

200 Medley Relay Men
800 Free Relay Men
500 Free Men
200 IM Men
50 Free Men
200 Free Relay Men
1000 Free Men
400 IM Men
100 Fly Men
200 Free Men
100 Breast Men
100 Back Men
400 Medley Relay Men
1650 Free Men
200 Back Men
100 Free Men
200 Breast Men
200 Fly Men
400 Free Relay Men
200 Medley Relay Women
800 Free Relay Women
500 Free Women
200 IM Women
50 Free Women
200 Free Relay Women
1000 Free Women
400 IM Women
100 Fly Women
200 Free Women
100 Breast Women
100 Back Women
400 Medley Relay Women
1650 Free Women
200 Back Women
100 Free Women
200 Breast Women
200 Fly Women
400 Free Relay Women
