In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
from ncaa_pbp import *
# Browser-style User-Agent header; it is passed as `headers=` to every
# requests.get call in the cells shown in this notebook.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}


In [2]:
# Utility functions

def toc_to_sec(row):
    """Convert a play's clock reading into seconds left in regulation.

    `row` must expose `time`, a colon-separated clock string whose first two
    fields are the minutes and seconds left in the current quarter, and
    `quarter`, the 1-based quarter number. The arithmetic assumes four
    600-second quarters (2400 seconds in total).
    """
    # Every field is converted, so a non-numeric field still raises ValueError.
    clock_fields = [int(field) for field in row.time.split(':')]
    minutes_left, seconds_left = clock_fields[0], clock_fields[1]

    elapsed_in_prior_quarters = (row.quarter - 1)*600
    elapsed_in_this_quarter = 600 - minutes_left*60 - seconds_left
    return 2400 - elapsed_in_prior_quarters - elapsed_in_this_quarter

def get_starters(box):
    """Return the first five player names listed in a box-score element.

    `box` is navigated as `box.contents[0].contents[1]` to reach the element
    whose `<tr>` rows are read. The first row and the last two rows are
    ignored (presumably a header and two summary rows -- confirm against the
    page markup); the name is the stripped string of each remaining row's
    second child.
    """
    table_rows = box.contents[0].contents[1].find_all('tr')

    # Every remaining row is converted before slicing, so a malformed row
    # further down the table still raises.
    player_rows = table_rows[1:-2]
    names = [player_row.contents[1].string.strip() for player_row in player_rows]
    return names[:5]

def getCurrentLineup(input_row, team):
    """Return ``team``'s lineup after applying the play in ``input_row``.

    Reads the notebook-level globals ``starters``, ``current_players`` and
    ``team_map``; none of them is modified.

    Args:
        input_row: Two-item sequence whose second item is the play row. The row
            needs a ``name`` and a ``'description'`` entry (presumably an
            ``(index, Series)`` pair from ``DataFrame.iterrows()`` -- confirm).
        team: Key into ``starters``, ``current_players`` and ``team_map``.

    Returns:
        ``starters[team]`` itself (not a copy) for the row named 0. Otherwise a
        copy of the latest entry in ``current_players[team]``, with the player
        named in the description removed or appended when the description is
        a substitution for this team.

    Raises:
        ValueError: if the player being subbed out is not in the lineup.
    """
    row = input_row[1]
    if row.name == 0:
        return starters[team]

    description = row['description']
    # Copy so the history stored in current_players is left untouched.
    lineup = current_players[team][-1].copy()

    if 'Sub' in description and team_map[team] in description:
        # Second word is compared with 'out'; anything else counts as a sub in.
        in_or_out = description.split()[1]
        # Player name is whatever follows the last '-', upper-cased.
        player = description.split('-')[-1].upper()
        if in_or_out == 'out':
            lineup.remove(player)
        else:
            lineup.append(player)

    return lineup

def scrape_pbp(pbp):
    """Flatten a parsed play-by-play page into one DataFrame.

    Each `div.play-by-play-period-table` under `pbp` is one period. Within a
    period, the `td` cells with classes `time`, `description` and `score`
    supply those columns (stripped text), the first word of each `img` alt
    text supplies `team`, and `quarter` is the 1-based period number repeated
    once per `score` cell.

    Returns a DataFrame with columns time, team, description, score, quarter.
    """
    def cell_text(table, css_class):
        # Stripped text of every <td> carrying the given class.
        return [cell.string.strip() for cell in table.find_all('td', class_=css_class)]

    times, teams, descriptions, scores, period_numbers = [], [], [], [], []

    period_divs = pbp.find_all('div',class_='play-by-play-period-table')
    for period_number, period_div in enumerate(period_divs, start=1):
        table = period_div.contents[0].contents[3]

        times += cell_text(table, 'time')
        teams += [logo.attrs['alt'].split()[0] for logo in table.find_all('img')]
        descriptions += cell_text(table, 'description')

        period_scores = cell_text(table, 'score')
        scores += period_scores
        # One period label per score cell found in this period.
        period_numbers += [period_number] * len(period_scores)

    return pd.DataFrame({
        'time': times,
        'team': teams,
        'description': descriptions,
        'score': scores,
        'quarter': period_numbers,
    })

class Game():
    """Placeholder for a single game identified by ``id``.

    Loading is not implemented: both accepted ``load_from`` values are no-ops,
    so the constructor only stores ``id`` and validates ``load_from``.

    NOTE(review): a later cell in this notebook defines another ``Game`` class,
    which replaces this one once that cell has run.
    """

    def __init__(self, id, load_from='web') -> None:
        """Store ``id`` and check ``load_from``.

        Args:
            id: Identifier kept on the instance as ``self.id``.
            load_from: ``'web'`` or ``'file'``.

        Raises:
            ValueError: if ``load_from`` is neither ``'web'`` nor ``'file'``.
        """
        self.id = id
        if load_from == 'web':
            pass  # TODO: loading from the web is not implemented
        elif load_from == 'file':
            pass  # TODO: loading from a file is not implemented
        else:
            raise ValueError(f"load_from must be 'web' or 'file', got {load_from!r}")

In [3]:
# 2022-23 D-I Women's Basketball NET Ranking thru games 04/02/2023
r = requests.get("https://stats.ncaa.org/selection_rankings/nitty_gritties/31608", headers=headers)
# Stop here on an HTTP error instead of parsing an error page and failing
# later with a confusing NoneType error.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
# First <tbody> on the page; a later cell walks its rows for names and links.
net_table = soup.find('tbody')

# Same page parsed by pandas; the first table it finds is kept.
net_df = pd.read_html(r.text)[0]


In [4]:
# Three parallel lists with one entry per <tr> of net_table.
names, ids, conferences = [], [], []

for row in net_table.find_all('tr'):
    # The row's second child holds the team name; the last child of that cell
    # is expected to carry the link to the team page.
    names.append(row.contents[1].text.strip())
    ids.append(row.contents[1].contents[-1].get('href'))  # stores the href value, not a bare number
    conferences.append(row.contents[3].text)  # not stripped, unlike the name


In [5]:
# Team page
# NOTE(review): 158 is a hard-coded position in `ids` (presumably FGCU, given the
# FGCU/ paths used further down -- confirm); it shifts if the ranking order changes.
team_r = requests.get(f"https://stats.ncaa.org{ids[158]}", headers=headers)
# Stop on an HTTP error rather than indexing into the tables of an error page.
team_r.raise_for_status()
# No parser is named here, so bs4 picks whichever parser is installed.
soup = BeautifulSoup(team_r.text)
# Fourth child of the second <table> on the page; its <tr> rows are read next.
games_table = soup.find_all('table')[1].contents[3]

In [6]:
# Get opponent and link to game for each game a team played
opponents = []
games = []

for row in games_table.find_all('tr'):
    if len(row.contents) > 3: # Skip buffer rows
        # Opponents without links: the second child is missing or is not a tag,
        # so fall back to the first child. Only those lookup failures are
        # caught; a bare except would also swallow KeyboardInterrupt.
        try:
            opponents.append(row.contents[3].contents[1].text.strip())
        except (AttributeError, IndexError):
            opponents.append(row.contents[3].contents[0].text.strip())

        # If the game was not played there is no link in the result cell
        # (or no result cell at all); record None to keep both lists aligned.
        try:
            games.append(row.contents[5].find('a').get('href'))
        except (AttributeError, IndexError):
            games.append(None)


In [10]:
# Play-by-play links, one per game that has a box-score page.
pbp_ids = []

for game in tqdm(games):
    if game is None:
        # Games that were not played are stored as None; there is no page to
        # fetch and no id to build a file name from.
        continue

    r = requests.get(f"https://stats.ncaa.org{game}", headers=headers)
    # Do not cache an error page to disk or search it for links.
    r.raise_for_status()
    soup = BeautifulSoup(r.text)
    pbp_ids.append(soup.find('a', href=re.compile('play_by_play'))['href'])

    # Save box score html to file (the FGCU/ directory must already exist)
    with open(f"FGCU/box_{game.split('/')[2]}.txt", "w") as file:
        file.write(r.text)

100%|██████████| 37/37 [00:33<00:00,  1.10it/s]


In [7]:
# Box-score page of the first entry in `games`.
r = requests.get(f"https://stats.ncaa.org{games[0]}", headers=headers)
# Stop on an HTTP error instead of parsing an error page in the next cell.
r.raise_for_status()
soup = BeautifulSoup(r.text)

In [8]:
# First five entries of the Player column in the first table whose text matches
# "FGM"; header=1 takes the column names from the table's second row.
pd.read_html(r.text,match='FGM',header=1)[0].Player.iloc[:5].tolist()

['Young, Amari',
 'Duckett, Jada',
 'McLaughlin, Jordan',
 'Clark, Kaye',
 'Dickens, Makayla']

In [24]:
# Save play-by-play ids to file, one per line.
# The with-block closes (and therefore flushes) the file even if a write fails;
# the loop variable is not called `id` so the builtin stays usable afterwards.
with open('pbp_ids_fgcu.txt', 'w') as f:
    for pbp_id in pbp_ids:
        f.write(pbp_id + '\n')

In [None]:
# Fetch the play-by-play page of the first game and cache it on disk.
game_r = requests.get(f"https://stats.ncaa.org{pbp_ids[0]}", headers=headers)
# Never write an error page into the cache file that a later cell reloads.
game_r.raise_for_status()
soup = BeautifulSoup(game_r.text)
with open("FGCU/game1.txt", "w") as file:
    file.write(game_r.text)

In [114]:
# Reload the cached play-by-play page; the raw text is kept in `txt` for
# pandas and the parsed tree in `soup`.
with open("FGCU/game1.txt", "r") as file:
    txt = file.read()

soup = BeautifulSoup(txt)

In [None]:
# First ten rows of the ninth table parsed from the cached page, using each
# table's first row as column names.
# NOTE(review): index 8 is hard-coded -- presumably one of the play-by-play
# period tables; confirm against the page layout.
pd.read_html(txt,header=0)[8].head(10)

In [3]:
class Game():
    """A single contest: both teams' box scores plus its play-by-play events.

    Constructing a ``Game`` performs two HTTP requests (box score, then
    play-by-play) using the notebook-level ``base_url`` and ``headers``.

    Attributes:
        id: Contest id used to build the box-score URL.
        teams: Team ids taken from the links in the first table of the
            box-score page. ``PbpItem`` attributes 'Away' events to index 0
            and 'Home' events to index 1.
        starters: ``{team_id: [names]}`` with the first five players of each
            box score, rewritten from "Last, First" to "First Last".
    """

    def __init__(self, id) -> None:
        self.id = id
        self.teams = None
        self.__pbp = None

        self.load_boxScore()
        self.load_pbp()

    @staticmethod
    def _copy_lineups(lineups):
        """Return a new ``{team: [players]}`` dict with freshly copied lists."""
        return {team: list(players) for team, players in lineups.items()}

    def load_boxScore(self):
        """Download the box-score page and fill ``teams``, ``boxScore`` and ``starters``.

        Also records the play-by-play link that ``load_pbp`` requests.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        r = requests.get(base_url + f'contests/{self.id}/box_score', headers=headers)
        # Fail fast on an HTTP error rather than parsing an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        # Get teams' IDs from the hyperlink on the team name from the quarter-by-quarter scoring table
        self.teams = [team['href'].split('/')[-1] for team in soup.find('table').find_all('a')]

        # Tables containing "FGM"; column names come from each table's second row.
        tables = pd.read_html(r.text, match='FGM', header=1)
        self.__boxScore = {self.teams[1]: tables[1], self.teams[0]: tables[0]}

        # "Last, First" -> "First Last" for the first five listed players.
        self.starters = {
            team: [" ".join(player.split(', ')[::-1])
                   for player in self.__boxScore[team].Player.iloc[:5].tolist()]
            for team in self.teams
        }

        self.__pbp_id = soup.find('a', href=re.compile('play_by_play'))['href']

    def load_pbp(self):
        """Download the play-by-play page and build the list of ``PbpItem`` objects.

        Every table with a 'Time' column is treated as one period. The second
        and fourth columns are renamed to 'Away' and 'Home', and missing cells
        become 0 so that they are falsy when ``PbpItem`` inspects them.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        r = requests.get(base_url + self.__pbp_id, headers=headers)
        r.raise_for_status()

        periods = [table for table in pd.read_html(r.text, header=0) if 'Time' in table.columns]
        # Each period table is indexed from 0, so renumber the stacked rows;
        # otherwise the event ids would repeat from one period to the next.
        pbp = pd.concat(periods, ignore_index=True)
        pbp = pbp.rename(columns={pbp.columns[1]: 'Away', pbp.columns[3]: 'Home'}).fillna(0)

        self.__pbp = []
        prev_lineups = self.starters
        for ix, row in pbp.iterrows():
            # Hand each event its own copy of the lineups: an event that edits
            # its lineup must not alter `starters` or any earlier event.
            item = PbpItem(ix, row.to_dict(), self.teams, self._copy_lineups(prev_lineups))
            self.__pbp.append(item)
            prev_lineups = item.current_players

    @property
    def boxScore(self):
        """``{team_id: DataFrame}`` holding the box-score table of each team."""
        return self.__boxScore

    @property
    def pbp(self):
        """List of ``PbpItem`` objects in page order (``None`` before loading)."""
        return self.__pbp

In [8]:
class PbpItem():
    """One play-by-play row plus the lineups on the floor after it.

    Attributes:
        id: Identifier supplied by the caller.
        time: ``event['Time']``.
        quarter: Always ``None``; nothing in this class fills it in.
        score: ``event['Score']``.
        team: ``team_ids[1]`` for a 'Home' event, ``team_ids[0]`` for an 'Away'
            event, ``None`` for an event that belongs to neither side.
        description: Text of the event.
        current_players: ``{team: [players]}`` after applying this event.
    """

    def __init__(self, id, event, team_ids, prev_lineups):
        """Build the item from one event row.

        Args:
            id: Identifier stored on the item.
            event: Mapping with 'Time', 'Score', 'Home' and 'Away' keys; an
                empty 'Home'/'Away' cell must be falsy.
            team_ids: Sequence whose first entry is the away team and whose
                second entry is the home team.
            prev_lineups: ``{team: [players]}`` before this event. It is
                copied, never modified.

        Raises:
            ValueError: if a player being subbed out is not in the lineup.
        """
        self.id = id
        self.time = event['Time']
        self.quarter = None
        self.score = event['Score']
        self.team = None

        if event['Home'] and not event['Away']:
            self.team = team_ids[1]
            self.description = event['Home']
        elif event['Away'] and not event['Home']:
            self.team = team_ids[0]
            self.description = event['Away']
        else: # if the event pertains to the game, not a specific team
            self.description = event['Score']

        # Current Players
        # Work on a copy: editing prev_lineups in place would rewrite the
        # lineups of every earlier item (and the caller's starters) as well.
        lineups = {team: list(players) for team, players in prev_lineups.items()}
        if 'substitution out' in self.description:
            lineups[self.team].remove(self.description.split(', substitution')[0])
        elif 'substitution in' in self.description:
            lineups[self.team].append(self.description.split(', substitution')[0])
        self.current_players = lineups

    @property
    def data(self):
        """The instance's attribute dictionary (the live ``__dict__``, not a copy)."""
        return self.__dict__

    @property
    def isPossessionEnding(self):
        """Not implemented yet; always ``None``."""
        return None

    @property
    def isFGA(self):
        """``True`` when the description mentions a '2pt' or '3pt' shot."""
        return any(shotType in self.description for shotType in ['2pt', '3pt'])

    @property
    def eventType(self):
        """Not implemented yet; always ``None``.

        The two lists below only record the vocabulary seen in descriptions
        (an event word followed by its possible qualifiers) for a future
        classifier.
        """
        types = ['rebound (defensive,offensive,team,offensivedeadball)', 'turnover (offensive,badpass,travel,outofbounds,lostball,other,shotclock,team)','assist', \
                 'steal','jumpball (won,lost,heldball)','substitution (in,out)','foul (personal,offensive,shooting,2freethrow,1freethrow)','foulon','freethrow (#of#,made,missed)', \
                 'timeout (short,full,commercial)']
        shotAttributes = ['made','missed','layup','jumpshot','hookshot','turnaroundjumpshot','drivinglayup','stepbackjumpshot','pullupjumpshot','floatingjumpshot','blocked','fromturnover','2ndchance','pointsinthepaint','fastbreak']
        return None

In [21]:
line = {}  # not referenced by any other cell shown in this notebook
# Constructing the Game issues both network requests (box score, then play-by-play).
g = Game(2330047)

<Response [200]>


In [22]:
# Collect every distinct token that follows the player name in the event
# descriptions, in first-seen order.
desc = []
for event in g.pbp:
    description = event.description
    if ',' not in description:
        continue

    # Keep the text from the first comma onward and drop its first
    # whitespace-separated piece (the comma itself when a space follows it).
    trailing_words = description[description.index(','):].split()[1:]
    for word in trailing_words:
        # Pieces such as 'blocked;' split into 'blocked' and '' here.
        for token in word.split(';'):
            if token not in desc:
                desc.append(token)
desc

['jumpball',
 'won',
 'turnover',
 'other',
 'lost',
 'badpass',
 'assist',
 '2pt',
 'layup',
 'fromturnover',
 'pointsinthepaint',
 '',
 'made',
 'drivinglayup',
 'missed',
 'rebound',
 'defensive',
 '3pt',
 'jumpshot',
 'fastbreak',
 'offensive',
 '2ndchance',
 'lostball',
 'steal',
 'foul',
 'personal',
 'shooting',
 '1freethrow',
 'foulon',
 'substitution',
 'out',
 'in',
 'freethrow',
 '1of1',
 'pullupjumpshot',
 'turnaroundjumpshot',
 'shotclock',
 'team',
 'offensivedeadball',
 '2freethrow',
 '2of2',
 '1of2',
 'hookshot',
 'blocked',
 'block',
 'travel',
 'stepbackjumpshot',
 'timeout',
 'short',
 'heldball',
 'floatingjumpshot',
 'full']

In [63]:
# PbpItem defines no __repr__, so this displays only the default object repr.
g.pbp[3]

<__main__.PbpItem at 0x1c405481b80>

In [118]:
# Stack every table parsed from the cached page that has a 'Time' column.
df = pd.concat([quarter for quarter in pd.read_html(txt, header=0) if 'Time' in quarter.columns])
df['Lineups'] = 'a'  # same placeholder value written to every row
# NOTE(review): fillna returns a new frame that is only displayed here; `df`
# itself keeps its NaN cells -- confirm that this is intended.
df.fillna(0)


Unnamed: 0,Time,Old Dominion,Score,FGCU,Lineups
0,10:00:00,game start,game start,game start,a
1,10:00:00,jumpball startperiod,jumpball startperiod,jumpball startperiod,a
2,10:00:00,period start,period start,period start,a
3,09:58:00,"Amari Young, jumpball won",0-0,0,a
4,09:58:00,0,0-0,"Sophia Stiles, jumpball lost",a
...,...,...,...,...,...
158,00:04:40,"Althea Kara Angeles, 2pt jumpshot blocked; missed",62-81,0,a
159,00:04:40,0,62-81,"Jordan Campbell, block",a
160,00:04:40,0,62-81,"Jordan Campbell, rebound defensive",a
161,00:00:00,period end confirmed;,period end confirmed;,period end confirmed;,a
