In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests

In [2]:
import time
import datetime

def convert_time(t):
    if ':' in t:
        split = t.split(':')
        minutes = int(split[0])
        sec = float(split[1])
        return round(sec + (minutes*60), 2)
    else: 
        return t

In [3]:
base_url = "https://www.collegeswimming.com"
meets_url = "https://www.collegeswimming.com/results/?interval=all&meetType=120&orderBy=latest&page=1&period=past&region=conference_12"

In [4]:
# Openning the downloaded HTML pages
meets_page_1 = bs(open('IvyResultsPage1.html'), 'html.parser')
meets_page_2 = bs(open('IvyResultsPage2.html'), 'html.parser')

In [15]:
# Grabbing meet names and links to meets' results
meets1 = meets_page_1.find('section', {'class': 'c-list-grid'})
meets2 = meets_page_2.find('section', {'class': 'c-list-grid'})

meets_info = []
for link in meets1.findAll('a', {'class': "c-list-grid__item"}):
    
    lis = link.findAll('li')
    year = lis[1].text.split(' ')[2]
    location = lis[2].text.title()
    
    m_name = link.find('h3', {'class': 'c-list-grid__title'}).text
    if "Ivy" in m_name or "EISL" in m_name:
        meets_info.append({'name': m_name.title(), 'link': link.get('href'),
                           'year': year, 'location': location})
        
for link in meets2.findAll('a', {'class': "c-list-grid__item"}):
    
    lis = link.findAll('li')
    year = lis[1].text.split(' ')[2]
    location = lis[2].text.title()
    
    m_name = link.find('h3', {'class': 'c-list-grid__title'}).text
    if "Ivy" in m_name or "EISL" in m_name:
        meets_info.append({'name': m_name.title(), 'link': link.get('href'),
                           'year': year, 'location': location})
        print({'name': m_name.title(), 'link': link.get('href'),
                           'year': year, 'location': location})

{'name': 'Ivy league championships (w)', 'link': 'https://www.collegeswimming.com/results/22379', 'year': '2014', 'location': 'Providence, ri'}
{'name': 'Ivy league mens championships', 'link': 'https://www.collegeswimming.com/results/21469', 'year': '2013', 'location': ''}
{'name': 'Ivy league womens championships', 'link': 'https://www.collegeswimming.com/results/21455', 'year': '2013', 'location': 'Princeton, nj'}
{'name': 'Eisl championship', 'link': 'https://www.collegeswimming.com/results/5652', 'year': '2008', 'location': 'Blodgett pool, harva, ma'}
{'name': 'Eisl championships', 'link': 'https://www.collegeswimming.com/results/2310', 'year': '2007', 'location': 'Denunzio pool, princ, nj'}
{'name': 'Ivy league championships', 'link': 'https://www.collegeswimming.com/results/2241', 'year': '2007', 'location': 'Denunzio pool, princ, nj'}
{'name': "Women's ivy league champs", 'link': 'https://www.collegeswimming.com/results/97286', 'year': '2006', 'location': 'Blodgett pool, harva,

In [6]:
def get_events(meet_link):
    resp = requests.get(meet_link)
    soup = bs(resp.text, 'html.parser')
    events = []
    for li in soup.findAll('li', {'class': 'js-event-item'}):
        
        # Filtering out time trials (event # > 60)
        number = li.find('div', {'class': 'o-media__img u-color-success u-width-30'}).text
        if int(number) < 60:
            event = li.find('div', {'class':'o-media__body u-text-truncate'}).text
            if 'Diving' not in event:    
                events.append({'event': event, 'link': base_url + li.a.get('href')})
    return events

In [7]:
def get_relay_results(e):
    event = e['event']
    relay_link = e['link']
    e_resp = requests.get(relay_link)
    e_soup = bs(e_resp.text, 'html.parser')
    table = e_soup.find('tbody')
    is_df = False
    df = pd.DataFrame()
    for row in table.findAll('tr', recursive=False):
        
        # Accounting for DQ's
        try:
            time = convert_time(row.find('abbr').text.strip())
            place = row.find('td', {'class': 'c-table-clean__col-fit u-text-center u-pr0'}).text.strip()
        except AttributeError as e:
            time = None
            place = None
        team = row.find('span', {'class': 'hidden-xs'}).text
        
        if not is_df:
            r = {'event': [event], 'place': [place], 
                 'team': [team], 'name': [team], 
                 "time": [time], 'is_prelims': [False]}
            df = pd.DataFrame.from_dict(r)
            is_df = True
        else:
            r = {'event': event, 'place': place, 
                 'team': team, 'name': team, 
                 "time": time, 'is_prelims': False}
            df = df.append(r, ignore_index=True)

    return df

In [8]:
def get_individual_results(e):
    event = e['event']
    e_link = e['link']
    e_resp = requests.get(e_link)
    e_soup = bs(e_resp.text, 'html.parser')
    section = e_soup.find('div', {'class': 'c-card c-card--large'})
    tables = section.findAll('tbody') # A-Final, B-Final, C-Final, Prelims

    is_df = False
    df = pd.DataFrame()
    
    for i, t in enumerate(tables):
        is_prelims = (i+1 == len(tables)) # last table?
        
        for row in t.findAll('tr', recursive=False):
            # Accounting for DQ's
            try:
                time = convert_time(row.find('abbr').text.strip())
                place = row.find('td', {'class': 'c-table-clean__col-fit u-text-center u-pr0'}).text.strip()
            except AttributeError as e:
                time = None
                place = None
            name = row.find('a', {'class': 'bold'}).text
            team = row.find('span', {'class': 'hidden-xs'}).text
            
            if not is_df:
                r = {'event': [event], 'name': [name], 
                     'place': [place], 'team': [team], 
                     "time": [time], 'is_prelims': [is_prelims]}
                df = pd.DataFrame.from_dict(r)
                is_df = True
            else:
                r = {'event': event, 'name': name, 
                     'place': place, 'team': team, 
                     "time": time, 'is_prelims': is_prelims}
                df = df.append(r, ignore_index=True)
    return df

In [None]:
for meet in meets_info:
    for e in get_events(meet['link']):
        if "Relay" in e['event']:
            get_relay_results(e['link'])
        else:
            df = get_individual_results(e['link'])
            print(df.head())

In [11]:
es = get_events(meets_info[3]['link'])
get_individual_results(es[15]['link'])

Unnamed: 0,name,place,team,time,is_prelims
0,Isabella Hindley,1,Yale,47.85,False
1,Miki Dahlke,2,Harvard,48.51,False
2,Mei Colby,3,Harvard,49.21,False
3,Kennidy Quist,4,Harvard,49.56,False
4,Mary Ashby,5,Columbia,49.84,False
...,...,...,...,...,...
56,Maddy Redding,33,Cornell,52.47,True
57,Shoshana Swell,34,Cornell,52.81,True
58,Claire Lin,35,Columbia,52.95,True
59,Connie Zhang,36,Dartmouth,52.96,True


In [None]:
get_events(meet_links[2]['link'])

In [None]:
convert_time('1:23.45')

In [12]:
output_columns = list("name	event	year	pool	finish_place	is_prelims	team".split('\t'))
print(output_columns)

['name', 'event', 'year', 'pool', 'finish_place', 'is_prelims', 'team']
