In [1]:
import re
from os import makedirs
from os import mkdir
from os.path import dirname
from os.path import exists
from time import strftime,localtime

import pandas as pd

In [2]:
class Web_scraper:
    """Bundle of the two ESPN scrapers used by this notebook.

    Attributes:
        teams: a ``Web_scraper.Teams`` instance (league standings tables).
        players: a ``Web_scraper.Players`` instance (per-team player tables).
    """

    def __init__(self):
        self.teams = self.Teams()
        self.players = self.Players()

    @staticmethod
    def _team_pattern(abbreviations):
        """Turn team abbreviations into a regex alternation.

        The abbreviations are upper-cased, regex-escaped and ordered
        longest-first. Regex alternation is ordered, so without the ordering a
        short abbreviation that is a prefix of a longer one (``SA`` vs ``SAC``)
        would win and leave the extra letter glued to the team name.

        Args:
            abbreviations: iterable of abbreviation strings (missing values
                are skipped).

        Returns:
            str: the abbreviations joined with ``|``.
        """
        upper = [str(a).upper() for a in abbreviations if pd.notna(a)]
        # Stable sort: equal-length abbreviations keep their original order.
        upper.sort(key=lambda a: -len(a))
        return "|".join(re.escape(a) for a in upper)

    @staticmethod
    def _load_team_pattern(csv_path="team_names.csv"):
        """Read the ``prefix_1`` column of ``csv_path`` and build the pattern."""
        team_abbrvs = pd.read_csv(csv_path, usecols=["prefix_1"])
        return Web_scraper._team_pattern(team_abbrvs["prefix_1"])

    class Teams:
        """Builds standings URLs and parses the standings tables."""

        def __init__(self):
            self.root = "https://www.espn.com/nba/standings/_"
            self.tail = "/group/league"
            # Regex alternation of upper-cased team abbreviations, used by
            # preprocess() to split "<ABBR><Team name>" strings.
            self.team_names = Web_scraper._load_team_pattern()

        def build_url(self, year_n, current=True, preseason=False):
            """Return the standings URL for the requested season.

            Args:
                year_n: season year (str or int); only used when ``current``
                    is false.
                current: when true the year segment is omitted from the URL.
                preseason: when true the ``/seasontype/pre`` segment is added.

            Returns:
                str: ``root`` + optional segments + ``tail``.
            """
            segments = []
            if preseason:
                segments.append("/seasontype/pre")
            if not current:
                segments.append("/season/" + str(year_n))
            return self.root + "".join(segments) + self.tail

        def correct_teams(self, df):
            """Move the header of a one-column team table back into the data.

            The team table comes back with its first data row parsed as the
            column header (empty table header on the ESPN site), so the header
            text is re-inserted as row 0 and the column is renamed ``Team``.

            Args:
                df: DataFrame whose first column label is really a team entry.

            Returns:
                DataFrame with a fresh 0..n index and a ``Team`` column.
            """
            first = df.columns[0]
            row_0 = pd.DataFrame([first], columns=["Team"])
            renamed = df.rename(columns={first: "Team"})
            return pd.concat([row_0, renamed], ignore_index=True)

        def preprocess(self, df, complete_league=True):
            """Split the raw ``Team`` text into ``ID`` and ``Team`` columns.

            Args:
                df: table with a ``Team`` column of the form
                    ``[<marker>--]<ABBR><Team name>``. It is not modified.
                complete_league: when true, everything up to and including the
                    last ``--`` is stripped first. Rows without ``--`` are
                    kept as they are instead of being turned into NaN.

            Returns:
                A new DataFrame with ``ID`` inserted as the first column and
                ``Team`` reduced to the team name. Rows whose text does not
                contain a known abbreviation get NaN in both columns.
            """
            df = df.copy()  # never mutate the caller's frame

            if complete_league:
                stripped = df["Team"].str.extract(r'(.+--)(.+)', expand=True)[1]
                df["Team"] = stripped.fillna(df["Team"])

            processed = df["Team"].str.extract(fr'({self.team_names})(.+)', expand=True)

            df.insert(0, "ID", processed[0])
            df["Team"] = processed[1]

            return df

        def build_table(self, url, complete_league=True):
            """Download ``url`` and return the cleaned standings table.

            The first matching HTML table is treated as the team-name column
            and the second as the statistics; they are joined on row index.

            Args:
                url: standings page URL (see ``build_url``).
                complete_league: forwarded to ``preprocess``.

            Returns:
                DataFrame as produced by ``preprocess``.
            """
            dfs = pd.read_html(url, match=".+ | \n")
            stats = dfs[1]
            teams = self.correct_teams(dfs[0])  # Needed to correct empty table header on espn site
            table = teams.join(stats)

            return self.preprocess(table, complete_league)

    class Players:
        """Builds team-stats URLs and parses the player tables."""

        def __init__(self):
            self.REG = "2"
            self.POST = "3"
            self.root = "https://www.espn.com/nba/team/stats/_/name"  # [teamname - prefix_1]
            self.mid = "/season"  # [year] YYYY
            self.tail = "/seasontype"  # [2|3] 2 = regular season, 3 = postseason
            self.team_names = Web_scraper._load_team_pattern()

        def build_url(self, year, team, postseason=False):
            """Return the team-stats URL for ``team`` in ``year``.

            Args:
                year: season year (str or int).
                team: team abbreviation used in the URL path.
                postseason: select season type ``POST`` instead of ``REG``.

            Returns:
                str: ``root/<team>/season/<year>/seasontype/<2|3>``.
            """
            season_type = self.POST if postseason else self.REG
            return "/".join(
                [self.root, team + self.mid, str(year) + self.tail, season_type]
            )

        def preprocess(self, df, shooting=True):
            """Split ``Name`` into player name and position; drop the last row.

            Args:
                df: table with a ``Name`` column of the form
                    ``<player name> <POS>`` and a default 0..n-1 index. The
                    last row is dropped (the regex suggests it is a "Total"
                    row - TODO confirm against the page). ``df`` is not
                    modified.
                shooting: when true a ``POS`` column is inserted at position 1.

            Returns:
                A new DataFrame without the last row and with ``Name``
                stripped of the position suffix.
            """
            df = df.copy()  # never mutate the caller's frame

            processed = df["Name"].str.extract(r'(.+ | Total)([A-Z]+\**)', expand=True)
            processed = processed.drop([len(processed) - 1])

            if shooting:
                df.insert(1, "POS", processed[1])

            df["Name"] = processed[0].str.strip()

            # drop() is label based, so this relies on the default RangeIndex.
            return df.drop([len(df) - 1])

        def build_table(self, url, team):
            """Download ``url`` and return one merged table of player stats.

            Tables 0+1 and 2+3 of the page are each glued together
            column-wise, cleaned with ``preprocess`` and then joined on
            ``Name``.

            Args:
                url: team stats page URL (see ``build_url``).
                team: team abbreviation; stored upper-cased in ``Team``.

            Returns:
                DataFrame with the shooting columns, the general stat columns
                and a ``Team`` column.
            """
            dfs = pd.read_html(url)

            players = pd.concat([dfs[0], dfs[1]], axis=1)
            shooting = pd.concat([dfs[2], dfs[3]], axis=1)

            players = self.preprocess(players, False)
            shooting = self.preprocess(shooting)

            table = shooting.join(players.set_index("Name"), on="Name")
            table["Team"] = team.upper()

            return table

In [3]:
class Team_standings:
    """Downloads league standings and stores them as CSV files.

    Files are written as ``<reg_fp|pre_fp><year>_<label>.csv``. The output
    directories are created on demand.

    Args:
        teams: object providing ``build_url(year, current, preseason)`` and
            ``build_table(url, complete_league)`` (e.g. ``Web_scraper.Teams``).
    """

    def __init__(self, teams):
        # Indexes into self.labels.
        self.REG = 0
        self.PRE = 1
        self.labels = ("regular_season", "pre_season")
        self.teams = teams
        self.reg_fp = "./team_standings/reg_season/"
        self.pre_fp = "./team_standings/pre_season/"

    def gen_key(self, year, season_type):
        """Return the file stem ``<year>_<season_type>``."""
        return str(year) + "_" + season_type

    def _write_table(self, table, filepath):
        """Write ``table`` to ``filepath``, creating its directory if needed."""
        folder = dirname(filepath)
        if folder:
            makedirs(folder, exist_ok=True)
        table.to_csv(filepath, index=False)

    def save_standings(self, filepath, year, preseason=False):
        """Download one past season unless ``filepath`` already exists.

        Args:
            filepath: destination CSV path.
            year: season year as expected by ``teams.build_url``.
            preseason: fetch the pre-season table instead of the regular one.
                Pre-season tables are parsed with ``complete_league=False``,
                regular-season ones with ``complete_league=True``.
        """
        if exists(filepath):
            return

        url = self.teams.build_url(year, False, preseason)
        table = self.teams.build_table(url, not preseason)
        self._write_table(table, filepath)

    def update_team_standings(self):
        """Re-download and overwrite the current calendar year's two tables.

        Both tables are parsed with ``complete_league=False``.
        """
        this_year = strftime("%Y", localtime())

        targets = (
            (self.REG, self.reg_fp, False),
            (self.PRE, self.pre_fp, True),
        )
        for label_idx, folder, preseason in targets:
            filename = self.gen_key(this_year, self.labels[label_idx]) + ".csv"
            url = self.teams.build_url(this_year, preseason=preseason)
            table = self.teams.build_table(url, False)
            self._write_table(table, folder + filename)

    def get_team_standings(self, last_n_years):
        """Ensure CSVs exist for the ``last_n_years`` years before this one.

        Covers calendar years ``this_year - 1`` down to
        ``this_year - last_n_years``; files that already exist are left alone.
        """
        year = int(strftime("%Y", localtime()))

        for season in range(year - 1, year - last_n_years - 1, -1):
            season = str(season)

            filename = self.gen_key(season, self.labels[self.REG]) + ".csv"
            self.save_standings(self.reg_fp + filename, season)

            filename = self.gen_key(season, self.labels[self.PRE]) + ".csv"
            self.save_standings(self.pre_fp + filename, season, True)

In [4]:
class Player_stats:
    """Downloads per-team player statistics into one CSV per year and season type.

    Output layout: ``<root>/<year>/regular_season.csv`` and
    ``<root>/<year>/post_season.csv``, each holding the rows of every team.

    Args:
        players: object providing ``build_url(year, team, postseason)`` and
            ``build_table(url, team)`` (e.g. ``Web_scraper.Players``).
    """

    def __init__(self, players):
        self.players = players
        team_abb = pd.read_csv("team_names.csv")
        # Lower-cased abbreviations, as used in the stats URLs.
        self.teams = tuple(team_abb["prefix_1"].str.lower())
        self.root = "./player_stats"
        # Indexes into self.labels.
        self.REG = 0
        self.POST = 1
        self.labels = ("/regular_season.csv", "/post_season.csv")

    def make_dir(self, path):
        """Create ``path`` (including missing parents) if absent; return it."""
        if not exists(path):
            makedirs(path, exist_ok=True)
        return path

    def get_player_stats(self, last_n_years):
        """Download player stats for the ``last_n_years`` years before this one.

        Covers calendar years ``this_year - 1`` down to
        ``this_year - last_n_years``. A year is skipped only when both of its
        CSV files already exist; the check is made on the files, not on the
        directory, because the directory itself is created by this method.

        Args:
            last_n_years: number of past years to cover.
        """
        year = int(strftime("%Y", localtime()))

        for y in range(year - 1, year - last_n_years - 1, -1):
            yr = str(y)
            file_dir = self.root + "/" + yr
            reg_fp = file_dir + self.labels[self.REG]
            post_fp = file_dir + self.labels[self.POST]

            if exists(reg_fp) and exists(post_fp):
                continue  # there already is data on this year, skip

            self.make_dir(file_dir)

            for idx, t in enumerate(self.teams):
                first = idx == 0
                # The first team truncates the file (clearing leftovers of an
                # interrupted run) and writes the header; the rest append.
                mode = "w" if first else "a"

                url = self.players.build_url(yr, t)
                print(url)
                table = self.players.build_table(url, t)
                table.to_csv(reg_fp, index=False, header=first, mode=mode)

                # NOTE(review): build_table errors (e.g. a page without the
                # expected tables) propagate and abort the whole run.
                url = self.players.build_url(yr, t, True)
                table = self.players.build_table(url, t)
                table.to_csv(post_fp, index=False, header=first, mode=mode)

        return

In [5]:
# Build the scraper helpers and hand the standings helper (wbs.teams) to the
# standings downloader.
wbs = Web_scraper()
obj = Team_standings(wbs.teams)

In [6]:
# Player-stat downloader driven by the scraper's player helper (wbs.players).
ps = Player_stats(wbs.players)

In [7]:
# Request player stats for one past year (last_n_years=1).
ps.get_player_stats(1)

In [20]:
# Back-fill standings CSVs for five past years, then refresh the current
# year's regular-season and pre-season files.
obj.get_team_standings(5)
obj.update_team_standings()

AttributeError: 'Player_stats' object has no attribute 'columns'

In [11]:
# Load a saved regular-season standings CSV (path follows the
# Team_standings reg_fp + "<year>_regular_season.csv" layout).
df = pd.read_csv("./team_standings/reg_season/2022_regular_season.csv")

In [19]:
# Show the standings row(s) whose Team column equals "Utah Jazz".
df.loc[df["Team"] == "Utah Jazz"]

Unnamed: 0,ID,Team,W,L,PCT,GB,HOME,AWAY,DIV,CONF,PPG,OPP PPG,DIFF,STRK,L10
2,UTAH,Utah Jazz,28,10,0.737,2,14-7,14-3,9-0,18-4,116.0,106.2,9.8,W2,8-2


In [None]:
# Scratch: fetch every HTML table from one team's stats page (network call).
# The URL has the same shape as the ones Web_scraper.Players.build_url builds.
stats = pd.read_html("https://www.espn.com/nba/team/stats/_/name/utah/season/2021/seasontype/2")

In [42]:
# Load the combined regular-season player stats written under ./player_stats/2021.
p = pd.read_csv("./player_stats/2021/regular_season.csv")

In [46]:
# Derived metric: field goals made weighted by shooting accuracy.
# FG% is divided by 100 to turn the percentage into a fraction.
# Adds the column to `p` in place.
p["FGEfficacy"] = p["FGM"] *  (p["FG%"]/100)

In [50]:
# Rank players by FGEfficacy, ties broken by FG%, both descending.
# The sorted frame is only displayed; `p` itself keeps its original order.
p.sort_values(["FGEfficacy","FG%"], ascending=False)

Unnamed: 0,Name,POS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,...,AST,STL,BLK,TO,PF,AST/TO,PER,Team,FGConfidence,FGEfficacy
390,Zion Williamson,PF,10.4,17.0,61.1,0.2,0.6,29.4,6.0,8.7,...,3.7,0.9,0.6,2.7,2.2,1.4,27.17,NO,10.3870,6.3544
301,Giannis Antetokounmpo,PF,10.3,18.0,56.9,1.1,3.6,30.3,6.5,9.5,...,5.9,1.2,1.2,3.4,2.8,1.7,29.24,MIL,10.2420,5.8607
534,Nikola Jokic,C,10.2,18.0,56.6,1.3,3.3,38.8,4.8,5.5,...,8.3,1.3,0.7,3.1,2.7,2.7,31.36,DEN,10.1880,5.7732
513,Bradley Beal,SG,11.2,23.0,48.5,2.2,6.2,34.9,6.8,7.7,...,4.4,1.2,0.4,3.1,2.3,1.4,22.81,WSH,11.1550,5.4320
22,Kyrie Irving,PG,10.2,20.1,50.6,2.8,7.0,40.2,3.7,4.0,...,6.0,1.4,0.7,2.4,2.6,2.5,24.51,BKN,10.1706,5.1612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,Will Magnay,PF,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,-35.04,NO,0.0000,0.0000
533,Anzejs Pasecniks,C,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,5.0,2.0,0.2,-40.47,WSH,0.0000,0.0000
554,Greg Whittington,SF,0.0,0.8,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-10.13,DEN,0.0000,0.0000
555,Gary Clark,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.89,DEN,0.0000,0.0000


In [None]:
# Drop the row labelled len(players)-1 (the last row under a default index).
# NOTE(review): no visible cell assigns a top-level `players`, so this relies
# on leftover kernel state; re-running the cell removes another row each time.
players = players.drop([len(players)-1])

In [None]:
# Display the whole frame for inspection (players.head() would keep the output short).
players

In [None]:
# Scratch: glue tables 2 and 3 of the scraped page together on the row index.
shooting = stats[2].join(stats[3])

In [None]:
# Drop the row labelled len(shooting)-1 (the last row under a default index).
# Not idempotent: re-running the cell removes another row.
shooting = shooting.drop([len(shooting)-1])

In [None]:
# Display the whole frame for inspection.
shooting

In [None]:
# Split each Name entry into two capture groups: column 0 is the text up to
# the last space before a run of capitals (the name), column 1 is that run of
# capitals plus any trailing '*' (used as POS further down).
processed = players["Name"].str.extract(r'(.+ | Total)([A-Z]+\**)'
                                         ,expand = True)

In [None]:
# Inspect the two extracted columns.
processed

In [None]:
# POS is intentionally not added to `players`; only the Name column is
# replaced by the extracted name (trailing space not stripped here).
# players.insert(1,"POS",processed[1])
players["Name"] = processed[0]

In [None]:
# Display the frame after the Name clean-up.
players

In [None]:
# Same Name/POS split for the shooting table; here POS is kept as column 1.
# Not re-runnable as is: a second insert of "POS" raises because the column
# already exists.
processed = shooting["Name"].str.extract(r'(.+ | Total)([A-Z]+\**)'
                                         ,expand = True)
shooting["Name"] = processed[0]
shooting.insert(1,"POS", processed[1])

In [None]:
# Display the shooting frame after the Name/POS split.
shooting

In [None]:
# Attach the general stat columns to the shooting rows by matching on Name.
s = shooting.join(players.set_index("Name"),on = "Name")

In [None]:
# Display the merged table.
s

In [None]:
# Scratch: parse tables from a locally saved copy of a page.
# NOTE(review): hard-coded absolute path to one user's Downloads folder; this
# cell will not run on another machine.
f = pd.read_html("/Users/school/Downloads/utah.html")

In [10]:
# Player_stats.teams is built as a tuple, so this item assignment raises
# TypeError (see the traceback below); kept only as a demonstration.
ps.teams[0] = "Nul"

TypeError: 'tuple' object does not support item assignment