In [25]:
# Import packages

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import time
import numpy as np
import pandas as pd
import urllib
import urllib.request
import requests
import re
from bs4 import BeautifulSoup
from html.parser import HTMLParser
from datetime import datetime
from datetime import date, timedelta
import random

In [43]:
# Inclusive date range for which daily stats pages are requested.
start_date = date(year=2024, month=10, day=22)
end_date = date(year=2025, month=2, day=13)

In [27]:
def list_dates_between(start_date, end_date):
    """
    Build every calendar day from start_date through end_date, both ends included.

    Args:
        start_date (date): First day of the range.
        end_date (date): Last day of the range.

    Returns:
        list: Consecutive dates in ascending order; empty when start_date
        is later than end_date.
    """
    # Whole days separating the two ends; negative when the range is reversed,
    # which makes the range() below empty.
    span_days = (end_date - start_date).days
    return [start_date + timedelta(days=offset) for offset in range(span_days + 1)]

def daily_stats_url_list(dates=None, agents=None, delay_range=(20, 30), timeout=30):
    """
    Build the Basketball Reference daily-leaders URL for each date and keep
    only the URLs whose page contains a stats table.

    Args:
        dates (iterable of date, optional): Days to build URLs for. Defaults
            to the notebook-level ``date_list``.
        agents (list of str, optional): User-Agent strings; one is chosen at
            random for each request. Defaults to the notebook-level
            ``useragents``.
        delay_range (tuple of int): Inclusive (min, max) number of seconds to
            sleep between consecutive requests.
        timeout (float): Seconds to wait for each HTTP response.

    Returns:
        list: URLs (str) whose page has a table with class
        'sortable stats_table', in the same order as ``dates``.
    """
    if dates is None:
        dates = date_list
    if agents is None:
        agents = useragents

    url_list = [
        "https://www.basketball-reference.com/friv/dailyleaders.fcgi?month="
        + str(dt.month) + "&day=" + str(dt.day) + "&year=" + str(dt.year) + "&type=all"
        for dt in dates
    ]

    valid_url = []
    last_index = len(url_list) - 1
    for index, item in enumerate(url_list):
        try:
            # Without a timeout requests can wait on a stalled server forever.
            r = requests.get(
                item,
                headers={'User-Agent': random.choice(agents)},
                timeout=timeout,
            )
        except requests.RequestException as exc:
            # A failed request is reported and treated like a page without a table.
            print('Error: request failed for ' + item + ' (' + str(exc) + ')')
        else:
            soup_new = BeautifulSoup(r.text, 'html.parser')
            if soup_new.find('table', class_='sortable stats_table') is not None:
                valid_url.append(item)
            else:
                print('Error: no stats table found at ' + item)
        # Pause between requests; nothing follows the last one, so skip it there.
        if index != last_index:
            time.sleep(random.randint(*delay_range))
    return valid_url

def url_date_substring(url_string):
    """
    Pull out every substring of the URL that sits between an '=' and the next '&'.

    For a daily-stats URL ending in '?month=10&day=22&year=2024&type=all' this
    yields ['10', '22', '2024']; the trailing 'type' value is not followed by
    '&' and is therefore not captured.

    Args:
        url_string (str): URL to scan.

    Returns:
        list: Captured substrings (str) in order of appearance.
    """
    return re.findall(r"\=(.*?)\&", url_string)

def url_date_substring_loop(urls=None):
    """
    Extract the date parts from every daily-stats URL.

    Args:
        urls (list of str, optional): URLs to parse. Defaults to the
            notebook-level ``daily_stats_url_val``.

    Returns:
        list: One entry per URL, each being the list that url_date_substring
        returns for that URL, in the same order as ``urls``.
    """
    if urls is None:
        urls = daily_stats_url_val
    return [url_date_substring(url_string=url) for url in urls]

def bs4(urldata, timeout=30):
    """
    Download a page and return its stats table.

    Args:
        urldata (str): URL of the page to fetch.
        timeout (float): Seconds to wait for the HTTP response; without it
            requests can block indefinitely on a stalled server.

    Returns:
        The first <table> element with class 'sortable stats_table' found by
        BeautifulSoup, or None when the page has no such table.
    """
    r = requests.get(
        urldata,
        headers={'User-Agent': random.choice(useragents)},
        timeout=timeout,
    )
    soup_new = BeautifulSoup(r.text, 'html.parser')
    return soup_new.find('table', class_='sortable stats_table')

def Source_check(urldata):
    """
    Confirm website is active by opening the URL and printing the outcome.

    Prints "Result code: 200" when the response status is 200, and
    "Error, cannot parse results" for any other status or when the request
    itself fails.

    Args:
        urldata (str): URL to probe.
    """
    from urllib.error import URLError  # local import: only needed here

    try:
        # The context manager closes the connection once the status is read.
        with urllib.request.urlopen(urldata) as response:
            status = response.getcode()
    except URLError:
        # urlopen raises instead of returning for HTTP error statuses and
        # unreachable targets (HTTPError is a subclass of URLError), so the
        # failure message has to be produced from here.
        status = None

    if status == 200:
        print("Result code: " + str(status))
    else:
        print("Error, cannot parse results")
        
def Header_row(league_table):
    """
    Find the header row of the table and build the list of column names for
    the final dataframe.

    Column names are taken from every label="..." attribute found in the
    text of the table's <thead>; the 'Rk' entry is dropped when present.

    Args:
        league_table: Parsed table object exposing ``find('thead')``.

    Returns:
        list: Column names (str) in header order, without 'Rk'.
    """
    head = league_table.find('thead')
    column_name = re.findall('label="(.+?)"', str(head))
    # Guarded so a header without an 'Rk' label does not raise ValueError.
    if 'Rk' in column_name:
        column_name.remove('Rk')
    return column_name

def nba_player_data(league_table, max_rows=700):
    """
    Parse through the table to collect the player data cells.

    Walks every <tbody> of the table and gathers the <td> cells of its <tr>
    rows into one flat list. Rows without <td> cells still count toward the
    row limit.

    Args:
        league_table: Parsed table object exposing ``find_all('tbody')``.
        max_rows (int): Maximum number of <tr> rows to read across all
            <tbody> sections.

    Returns:
        list: The <td> elements of the rows read, in document order.
    """
    cells = []
    rows_seen = 0
    for body in league_table.find_all('tbody'):
        for row in body.find_all('tr'):
            if rows_seen >= max_rows:
                # Row budget used up; nothing further would be collected.
                return cells
            rows_seen += 1
            cells.extend(row.find_all('td'))
    return cells

def actual_data(league_table):
    """
    Parse through the table cells to get the statistics as a flat list of strings.

    Cells are consumed in groups as wide as the header returned by Header_row.
    The first cell of each group contributes the value of its csk="..."
    attribute; every other cell contributes the text found between '">' and
    the next '<' in its markup, or '0.0' when no such text exists.

    Args:
        league_table: Parsed table object accepted by nba_player_data and
            Header_row.

    Returns:
        list: One string per cell, in the order the cells were collected.
    """
    cells = nba_player_data(league_table)
    if not cells:
        return []

    # The header does not change while iterating, so measure it once instead
    # of re-parsing it for every cell.
    row_width = len(Header_row(league_table))

    values = []
    for position, cell in enumerate(cells):
        cell_html = str(cell)
        if position == 0 or position % row_width == 0:
            name = re.search('csk="(.+?)"', cell_html)
            # Always emit a value: skipping this cell would shift every later
            # value into the wrong column when the list is cut into rows.
            values.append(name.group(1) if name else '')
        else:
            actual_value = re.search('">(.+?)<', cell_html)
            values.append(actual_value.group(1) if actual_value is not None else '0.0')
    return values

def list_partition(league_table):
    """
    Create a dataframe with the full list of stats, one row per group of cells.

    The flat list from actual_data is cut into consecutive slices as wide as
    the header from Header_row; each slice becomes one dataframe row.

    Args:
        league_table: Parsed table object accepted by actual_data and
            Header_row.

    Returns:
        pandas.DataFrame: Columns are the header names; an empty frame with
        those columns when there is no data.

    Raises:
        ValueError: If data exists but the header is empty, or the number of
            values is not a whole multiple of the number of columns.
    """
    values = actual_data(league_table)
    columns = Header_row(league_table)
    width = len(columns)

    if not values:
        return pd.DataFrame(columns=columns)
    if width == 0:
        raise ValueError("cannot partition stats: table header has no columns")
    if len(values) % width != 0:
        # A ragged final row means cells and header are out of step; fail
        # loudly rather than pad the row with missing values.
        raise ValueError("cannot partition stats: value count does not match column count")

    # Build all rows first and create the frame once, instead of growing it
    # one row at a time.
    rows = [values[start:start + width] for start in range(0, len(values), width)]
    return pd.DataFrame(rows, columns=columns)

In [28]:
# Pool of browser User-Agent strings; one is picked at random for each request.
useragents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 YaBrowser/24.12.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 OPR/116.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 OPR/115.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 GLS/100.10.9939.100',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 OPR/114.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0',
    'Mozilla/5.0 (Windows NT 10.0; rv:128.0) Gecko/20100101 Firefox/128.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0',
]

# Date stamp of this run, formatted as day_month-abbreviation_year.
now = datetime.now()
# str.replace returns a new string, so its result must be assigned to take effect.
file_date = now.strftime("%d_%b_%Y").replace("/", ".")
file_name = "NBAPlayerDailyStats.csv"  # name of the CSV file written at the end

# State used by the scraping loop: collected per-day dataframes and the index
# into the list of parsed URL dates.
dfs = []
x = 0

In [29]:
#Execute functions
date_list = list_dates_between(start_date, end_date)

daily_stats_url_val = daily_stats_url_list()

url_date_substring_loop_val = url_date_substring_loop()

# Start from an empty list so re-running this cell does not append a second
# copy of every day's dataframe.
dfs = []

#Loop through all url's in given date range
# Each URL is paired with its own parsed [month, day, year] strings, so no
# separate running index has to survive between cell executions.
# NOTE(review): these requests are sent back to back, while the validation
# pass sleeps 20-30 s between requests - confirm this is acceptable for the site.
for item, dateval in zip(daily_stats_url_val, url_date_substring_loop_val):
    url = bs4(item)  # despite the name, this holds the parsed stats table
    if url is None:
        # The page was fetched again here and may no longer contain the table.
        print('Error: no stats table found at ' + item)
        continue
    df = list_partition(url)
    df['Player'] = df['Player'].str.replace('-1', '')  # Strips the '-1' substring from player names.
    df['Tm'] = df['Tm'].str.slice(-3)  # Keeps only the last 3 characters (drops the link markup before the team code).
    df['Opp'] = df['Opp'].str.slice(-3)  # Keeps only the last 3 characters (drops the link markup before the team code).
    df = df.replace('<strong>', '', regex=True)  # Removes the '<strong>' tag text from every cell.
    game_date = date(int(dateval[2]), int(dateval[0]), int(dateval[1]))
    df['Game Date'] = game_date
    dfs.append(df)

#Concatenate all dataframes
final_df = pd.concat(dfs, ignore_index=True)

Error
Error
Error
Error


In [30]:
# Write the combined stats to the CSV file, leaving out the dataframe index.
final_df.to_csv(path_or_buf=file_name, index=False)