In [1]:
from ncaa_pbp import Game, Team
import requests
from bs4 import BeautifulSoup
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
base_url = "https://stats.ncaa.org/"
import pandas as pd
import re
from io import StringIO
from copy import deepcopy

In [None]:
# Look up team 561062 and display its roster.
# NOTE(review): the global name `a` is rebound to a Game in a later cell, and
# PbpItem.currentLineups reads `a.starters` — re-running this cell afterwards
# changes what that property sees.
a = Team('561062')
a.roster

In [9]:
# Fetch the individual-stats (box score) page for contest 3970485 and parse it.
# The inner literal uses double quotes: reusing the outer quote character inside
# an f-string replacement field is a SyntaxError on Python < 3.12.
r = requests.get(base_url + f'contests/{"3970485"}/individual_stats', headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')


In [279]:
class Game():
    """One stats.ncaa.org contest: box score, starters, play-by-play, shot chart.

    The box score and play-by-play are downloaded when the object is created;
    the shot chart is only downloaded when ``loadShotChart`` is true because it
    needs a headless browser.

    Attributes:
        id: Contest id used to build the stats.ncaa.org URLs.
        teams: List of the two team ids, in the order they appear in the
            first table of the individual-stats page.
    """

    def __init__(self, id, loadShotChart=False) -> None:
        """Download the contest data.

        Args:
            id: Contest id as it appears in the stats.ncaa.org URL.
            loadShotChart: When true, also load the shot chart via Selenium.
        """
        self.id = id
        self.teams = None
        self.__pbp = None
        self.__shotChart = None

        # The box score must be loaded first: load_pbp() labels its columns
        # with self.teams.
        self.load_boxScore()
        self.load_pbp()

        if loadShotChart:
            self.load_shotChart()

    def load_boxScore(self):
        """Fetch the individual-stats page and populate teams, box score and starters.

        Raises:
            requests.HTTPError: If the page request returns an error status.
        """
        r = requests.get(base_url + f'contests/{self.id}/individual_stats', headers=headers)
        # Fail fast on HTTP errors instead of trying to parse an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        # Get teams' IDs from the hyperlink on the team name from the quarter-by-quarter scoring table
        self.teams = [team['href'].split('/')[-1] for team in soup.find('table').find_all('a', {'class':'skipMask'}) if team.text]

        # Only the tables containing 'FGM' are the per-team stat tables.
        tables = pd.read_html(StringIO(r.text), match='FGM', header=0)
        self.__boxScore = {self.teams[1]: tables[1], self.teams[0]: tables[0]}

        # Properly format starters' names for each team ("Last, First" -> "First Last").
        self.__starters = {}
        for team in self.teams:
            box = self.__boxScore[team]
            # The last GS == 1 row is dropped; presumably it is a totals row — TODO confirm.
            started = box.loc[box.GS == 1, 'Name'].tolist()[:-1]
            self.__starters[team] = [" ".join(player.split(', ')[::-1]) for player in started]

    ##########################

    def load_shotChart(self):
        """Render the box-score page in headless Firefox and parse the shot chart.

        The result is a DataFrame with one row per shot, sorted by period and
        then by time in descending order, stored for the ``shotChart`` property.

        NOTE(review): ``Options`` and ``webdriver`` are not imported in the
        visible notebook cells; they are presumably Selenium's Firefox options
        class and ``selenium.webdriver`` and must be in scope before calling
        this method — confirm.
        """
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Firefox(options=options)
        try:
            driver.get(base_url + f'contests/{self.id}/box_score')
            soup = BeautifulSoup(driver.page_source, "html.parser")
        finally:
            # Always shut the browser down, even if the page load fails.
            driver.quit()
        # The first 3 and last 4 <circle> elements are skipped; presumably they
        # are not shots (court drawing / legend) — TODO confirm.
        shotsList = soup.find_all('circle')[3:-4]

        shots = pd.DataFrame([shot.attrs for shot in shotsList])
        # Each circle's class list carries five values, unpacked into these columns.
        shots[['Period','player_id','team_id','shot','ShotMade']] = pd.DataFrame(shots['class'].to_list())

        shots['text']   = [shot.text for shot in shotsList]
        shots['Team']   = shots['text'].apply(lambda string: re.findall('\\((.+)\\)',string)[0])
        shots['Player'] = shots['text'].apply(lambda string: re.findall('by (.+)\\(',string)[0])
        shots['Time']   = shots['text'].apply(lambda string: re.findall('... (.*) :',string)[0])

        shots.drop(columns=['r','style','class','shot','text'], inplace=True)

        # Sort the events based on period and time remaining in quarter; this is mainly due to overtime periods being out of order
        shots.sort_values(['Period','Time'],ascending=[True,False],inplace=True)
        shots.reset_index(drop=True,inplace=True)

        self.__shotChart = shots

    @property
    def boxScore(self):
        """Dict mapping team id to that team's box-score DataFrame."""
        return self.__boxScore

    @property
    def starters(self):
        """Dict mapping team id to the list of starter names ("First Last")."""
        return self.__starters

    @property
    def pbp(self):
        """Play-by-play DataFrame with columns Quarter, Time, <team 0>, Score, <team 1>."""
        return self.__pbp

    @property
    def shotChart(self):
        """Shot-chart DataFrame, or None when the shot chart was not loaded."""
        return self.__shotChart

    @staticmethod
    def assign_event_type(text):
        """Map a play-by-play description to an integer event code.

        Declared as a static method so it can be called both as
        ``Game.assign_event_type(text)`` and on an instance.

        Codes: 0 start, 1 jump ball won/lost, 2 missed field goal, 3 made field
        goal, 4 assist, 5 block, 6 rebound, 7 foul, 8 held ball, 9 steal,
        10 turnover, 11 timeout, 12 substitution, 13 free throw, 14 end.
        The checks are ordered; the first match wins.

        Args:
            text: Event description string.

        Returns:
            The event code, or None when no keyword matches.
        """
        # Any text containing 'start' (including 'jumpball startperiod') is a start marker.
        if 'start' in text:
            return 0
        if 'jumpball' in text and ('won' in text or 'lost' in text):
            return 1

        is_field_goal = '2pt' in text or '3pt' in text
        if is_field_goal and 'missed' in text:
            return 2
        if is_field_goal and 'made' in text:
            return 3

        # ' turnover' keeps its leading space so that the 'fromturnover'
        # qualifier on other events is not mistaken for a turnover.
        keyword_codes = (
            ('assist', 4),
            ('block', 5),
            ('rebound', 6),
            ('foul', 7),
            ('heldball', 8),
            ('steal', 9),
            (' turnover', 10),
            ('timeout', 11),
            ('substitution', 12),
            ('freethrow', 13),
            ('end', 14),
        )
        for keyword, code in keyword_codes:
            if keyword in text:
                return code
        return None

    def load_pbp(self):
        """Fetch the play-by-play page and store it as a single DataFrame.

        Requires ``self.teams`` to be populated (see ``load_boxScore``) because
        the two description columns are named after the team ids.

        Raises:
            requests.HTTPError: If the page request returns an error status.
        """
        r = requests.get(base_url + f'contests/{self.id}/play_by_play', headers=headers)
        r.raise_for_status()
        # Keep only the per-period tables, identified by their 'Time' column.
        pbp = [quarter for quarter in pd.read_html(StringIO(r.text), header=0) if 'Time' in quarter.columns]
        # Add quarter to pbp
        for q, table in enumerate(pbp):
            table.insert(0,'Quarter',q+1)

        pbp = pd.concat(pbp).reset_index(drop=True)
        pbp.columns = ['Quarter','Time',self.teams[0],'Score',self.teams[1]]

        self.__pbp = pbp

In [280]:
r = requests.get(base_url + f'contests/{"3978635"}/play_by_play', headers=headers)
# Keep only the per-period tables, identified by their 'Time' column.
pbp = [quarter for quarter in pd.read_html(StringIO(r.text), header=0) if 'Time' in quarter.columns]
# Add quarter to pbp
for q, table in enumerate(pbp):
    table.insert(0,'Quarter',q+1)
    
pbp = pd.concat(pbp).reset_index(drop=True)
pbp.columns = ['Quarter','Time','561062','Score','560762']

# Pull each team's non-empty descriptions into a (text, team) frame keyed by the original row index.
team1 = pbp.loc[~pbp.iloc[:,-1].isna(),pbp.columns[-1]].to_frame('text')
team1['team'] = pbp.columns[-1]
team2 = pbp.loc[~pbp.iloc[:,2].isna(),pbp.columns[2]].to_frame('text')
team2['team'] = pbp.columns[2]

# Rows with text in BOTH team columns (e.g. 'game start') appear twice after the concat:
# clear the team on the first copy, then keep only that first copy.
combinedDesc = pd.concat([team1,team2]).sort_index()
combinedDesc.loc[combinedDesc.index.duplicated(keep='last'),'team'] = None
combinedDesc = combinedDesc.loc[~combinedDesc.index.duplicated()]

pbp[['text','team']] = combinedDesc
# Delete placeholder offensive rebounds between free throws
pbp.drop(index=pbp[pbp.text.str.contains('offensivedeadball')].index, inplace=True)

pbp['eventType'] = pbp.text.apply(assign_event_type)

# Order chronologically: Time is sorted descending within each quarter,
# with the event code breaking ties between rows that share a timestamp.
df = pbp.sort_values(by=['Quarter','Time','eventType'],ascending=[True,False,True]).reset_index(drop=True)

# PbpItem.__init__ reads the global `pbplist` to find its previous event, so the
# list must be grown one item at a time (a list comprehension would not work).
pbplist = []
for i, row in df.iterrows():
    pbplist.append(PbpItem(row))

print(df.loc[30,'text'])
pbplist[30].description

Peyton Kennedy, 2pt layup pointsinthepaint; made


'Peyton Kennedy, 2pt layup pointsinthepaint; made'

In [None]:
# Print the description of every event in order.
# PbpItem exposes the text as `description`; there is no `des` attribute.
for item in pbplist:
    print(item.description)

In [258]:
# Display the sorted play-by-play frame (pandas truncates the middle rows).
df

Unnamed: 0,Quarter,Time,561062,Score,560762,text,team,eventType
0,1,10:00:00,game start,game start,game start,game start,,0
1,1,10:00:00,period start,period start,period start,period start,,0
2,1,10:00:00,jumpball startperiod,jumpball startperiod,jumpball startperiod,jumpball startperiod,,0
3,1,09:57:00,"Marcavia Shavers, jumpball lost",0-0,,"Marcavia Shavers, jumpball lost",561062,1
4,1,09:57:00,,0-0,"Jazmyn Doster, jumpball won","Jazmyn Doster, jumpball won",560762,1
...,...,...,...,...,...,...,...,...
557,4,00:24:60,,58-91,"Jada Brown, 2pt jumpshot fromturnover; missed","Jada Brown, 2pt jumpshot fromturnover; missed",560762,2
558,4,00:22:10,"Bri Johns, rebound defensive",58-91,,"Bri Johns, rebound defensive",561062,6
559,4,00:04:70,"Camreé Clegg, 3pt jumpshot made",61-91,,"Camreé Clegg, 3pt jumpshot made",561062,3
560,4,00:00:00,period end confirmed;,period end confirmed;,period end confirmed;,period end confirmed;,,14


In [233]:
# Load contest 3978635 and grab its play-by-play frame.
# NOTE: PbpItem.currentLineups reads the global `a` (via `a.starters`), so this
# cell must have run — with `a` bound to a Game — before lineups are queried.
a = Game("3978635")
z = a.pbp

<Response [200]>


In [48]:
def assign_event_type(text):
    """Map a play-by-play description to an integer event code.

    Codes: 0 start, 1 jump ball won/lost, 2 missed field goal, 3 made field
    goal, 4 assist, 5 block, 6 rebound, 7 foul, 8 held ball, 9 steal,
    10 turnover, 11 timeout, 12 substitution, 13 free throw, 14 end.
    The checks are ordered; the first match wins.

    Args:
        text: Event description string.

    Returns:
        The event code, or None when no keyword matches.
    """
    # Any text containing 'start' (including 'jumpball startperiod') is a start marker.
    if 'start' in text:
        return 0
    if 'jumpball' in text and ('won' in text or 'lost' in text):
        return 1

    is_field_goal = '2pt' in text or '3pt' in text
    if is_field_goal and 'missed' in text:
        return 2
    if is_field_goal and 'made' in text:
        return 3

    # ' turnover' keeps its leading space so that the 'fromturnover' qualifier
    # on other events (e.g. a free throw) is not mistaken for a turnover.
    keyword_codes = (
        ('assist', 4),
        ('block', 5),
        ('rebound', 6),
        ('foul', 7),
        ('heldball', 8),
        ('steal', 9),
        (' turnover', 10),
        ('timeout', 11),
        ('substitution', 12),
        ('freethrow', 13),
        ('end', 14),
    )
    for keyword, code in keyword_codes:
        if keyword in text:
            return code
    return None

In [246]:
# Rebuild the event list from the sorted frame. Each PbpItem looks at the
# global `pbplist` to find its predecessor, so items are appended one by one.
pbplist = []
for _row_label, event_row in df.iterrows():
    pbplist.append(PbpItem(event_row))

In [281]:
class PbpItem():
    """A single play-by-play event linked to the event that preceded it.

    NOTE: this class depends on two notebook globals: ``pbplist`` (the list of
    items built so far, read in ``__init__``) and ``a`` (an object exposing
    ``starters``, read in ``currentLineups``).
    """

    def __init__(self, row):
        """Build the item from a play-by-play row.

        Args:
            row: Object exposing ``text``, ``team`` and ``eventType``.
        """
        self.description = row.text
        self.team = row.team
        self.eventType = row.eventType
        # The predecessor is whatever was appended to the global list last.
        if pbplist:
            self.__previousEvent = pbplist[-1]
        else:
            self.__previousEvent = None

    @property
    def previousEvent(self):
        """The preceding PbpItem, or None for the first event."""
        return self.__previousEvent

    @property
    def currentLineups(self):
        """Lineups per team after this event.

        Starts from ``a.starters`` and replays every substitution (event type
        12) up to and including this event. The walk is iterative so that long
        games cannot exhaust the recursion limit. The very first event always
        reports the starters. When no substitution has been replayed the
        ``a.starters`` object itself is returned, not a copy.
        """
        # Collect this event and its predecessors, newest first, excluding the
        # very first event.
        chain = []
        event = self
        while event.previousEvent:
            chain.append(event)
            event = event.previousEvent

        lineups = a.starters
        for event in reversed(chain):
            if event.eventType != 12: # substitution
                continue
            lineups = deepcopy(lineups) # deepcopy for dict to not alter previous events
            player = event.description.split(',')[0]
            if 'substitution out' in event.description:
                lineups[event.team].remove(player)
            if 'substitution in' in event.description:
                lineups[event.team].append(player)
        return lineups

    @property
    def data(self):
        """The instance attribute dict (the live ``__dict__``, not a copy)."""
        return self.__dict__

    @property
    def isPossessionEnding(self):
        """True for a defensive rebound, a turnover, a made field goal or a made last free throw."""
        # ' turnover' keeps its leading space so that the 'fromturnover'
        # qualifier on a shot is not mistaken for a turnover event.
        if any(playType in self.description for playType in ['rebound defensive', ' turnover']):
            return True
        return (self.isFGA or self.isLastFTA) and 'made' in self.description

    @property
    def isFGA(self):
        """True when the event is a field-goal attempt (event type 2 or 3)."""
        return self.eventType == 2 or self.eventType == 3

    @property
    def isLastFTA(self):
        """True when the description marks the last free throw of a trip."""
        return any(ftType in self.description for ftType in ['2of2','1of1','3of3'])

    @property
    def eventType1(self):
        """Unfinished placeholder: lists the known description vocabulary and returns None."""
        types = ['rebound (defensive,offensive,team,offensivedeadball)', 'turnover (offensive,badpass,travel,outofbounds,lostball,other,shotclock,team)','assist', \
                 'steal','jumpball (won,lost,heldball)','substitution (in,out)','foul (personal,offesnsive,shooting,2freethrow,1freethrow)','foulon','freethrow (#of#,made,missed)', \
                 'timeout (short,full,commercial)']
        shotAttributes = ['made','missed','layup','jumpshot','hookshot','turnaroundjumpshot','drivinglayup','stepbackjumpshot','pullupjumpshot','floatingjumpshot','blocked','fromturnover','2ndchance','pointsinthepaint','fastbreak']

In [282]:
def isPossessionEnding(row):
    """Tell whether a play-by-play row ends the current possession.

    A possession ends on a defensive rebound, a turnover, a made field goal
    (event type 2 or 3 whose text contains 'made') or a made final free throw.

    Args:
        row: Object exposing ``text`` (description) and ``eventType``.

    Returns:
        True when the row ends the possession, otherwise False.
    """
    # The leading space in ' turnover' avoids matching the 'fromturnover' qualifier.
    for marker in ('rebound defensive', ' turnover'):
        if marker in row.text:
            return True

    is_field_goal = row.eventType in (2, 3)
    was_made = 'made' in row.text
    if is_field_goal and was_made:
        return True

    is_last_free_throw = any(tag in row.text for tag in ('2of2', '1of1', '3of3'))
    return is_last_free_throw and was_made

In [285]:
# Flag each event that ends a possession (adds an 'ending' column to df in place),
# then show the first 40 rows to spot-check the flags.
df['ending'] = df.apply(isPossessionEnding,axis=1)
df.head(40)

Unnamed: 0,Quarter,Time,561062,Score,560762,text,team,eventType,ending
0,1,10:00:00,game start,game start,game start,game start,,0,False
1,1,10:00:00,period start,period start,period start,period start,,0,False
2,1,10:00:00,jumpball startperiod,jumpball startperiod,jumpball startperiod,jumpball startperiod,,0,False
3,1,09:57:00,"Marcavia Shavers, jumpball lost",0-0,,"Marcavia Shavers, jumpball lost",561062.0,1,False
4,1,09:57:00,,0-0,"Jazmyn Doster, jumpball won","Jazmyn Doster, jumpball won",560762.0,1,False
5,1,09:50:00,,0-2,"Paula Suarez, 2pt layup pointsinthepaint; made","Paula Suarez, 2pt layup pointsinthepaint; made",560762.0,3,True
6,1,09:16:00,"Team, turnover shotclock team;",0-2,,"Team, turnover shotclock team;",561062.0,10,True
7,1,08:56:00,,0-2,"Jazmyn Doster, 2pt jumpshot fromturnover;point...","Jazmyn Doster, 2pt jumpshot fromturnover;point...",560762.0,2,False
8,1,08:53:00,"Julia Martinez, rebound defensive",0-2,,"Julia Martinez, rebound defensive",561062.0,6,True
9,1,08:50:00,"Julia Martinez, 2pt jumpshot pointsinthepaint;...",0-2,,"Julia Martinez, 2pt jumpshot pointsinthepaint;...",561062.0,2,False


In [278]:
# Row 557 is a missed shot whose text carries the 'fromturnover' qualifier.
# The bare 'turnover' substring matches that qualifier, which is why this check
# evaluates to True for a shot that is not a turnover.
row = df.iloc[557]
any(playType in row.text for playType in ['rebound defensive','turnover'])

True