# Get Raw data from www.champsorchump.us

In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def process_game_row(tr, team):
    date = tr.find('td', class_='text-center').get_text(strip=True)
    opponent_td = tr.find_all('td', class_='text-center')[1]
    opponent_raw = opponent_td.get_text(strip=True)
    score = tr.find_all('td', class_='text-center')[2].get_text(strip=True)
    is_playoff = opponent_td.find('span', class_='fa fa-bolt') is not None
    type = "Playoff" if is_playoff else "Regular"
    win_loss = "Win" if 'W' in score else "Loss"
    team1 = team.replace('-', '')
    opponent = opponent_raw.replace('@ ', '').replace('vs ', '').replace('-', '')
    return {'Date': date, 'Score': score, 'Win/Loss': win_loss, 'Type': type, 'Team1': team1, 'Team2': opponent}

def scrape_data_for_year(team, year):
    url = f"https://champsorchumps.us/team/nba/{team}/{year}"
    response = requests.get(url)
    print(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        target_div = soup.find('div', class_='col-md-12 col-lg-8')
        if target_div:
            game_rows = target_div.find_all('tr', id=lambda x: x and x.startswith('game_'))
            return [process_game_row(tr, team) for tr in game_rows if tr.find('td', class_='text-center')]
        else:
            print(f"Element not found for year {year}.")
            return []
    else:
        print(f"Failed to retrieve the web page for year {year}. Status code: {response.status_code}")
        return []

def scrape_data_for_team(team, start_year, end_year):
    all_games_data = []
    for year in range(start_year, end_year + 1):
        yearly_data = scrape_data_for_year(team, year)
        all_games_data.extend(yearly_data)
        time.sleep(1) # delay in seconds 
    return pd.DataFrame(all_games_data)

team_selection = input("Enter 'all' for all teams or specify a team (e.g., 'toronto-raptors'): ").strip()
start_year = int(input("Enter your start year: "))
end_year = int(input("Enter your end year: "))

master_df = pd.DataFrame()

if team_selection.lower() == 'all':
    for team in nba_teams:
        df = scrape_data_for_team(team, start_year, end_year)
        master_df = pd.concat([master_df, df], ignore_index=True)
else:
    master_df = scrape_data_for_team(team_selection, start_year, end_year)

# Rename 'Win/Loss' column to 'Win'
master_df.rename(columns={'Win/Loss': 'Win'}, inplace=True)

# Convert 'Type' column: 'Playoff' to 1, 'Regular' to 0
master_df['Type'] = master_df['Type'].apply(lambda x: 1 if x == 'Playoff' else 0)

master_df['Win'] = master_df['Win'].apply(lambda x: 1 if x == 'Win' else 0)

# Check and remove rows where 'Score' column is empty
master_df = master_df[master_df['Score'].str.strip().ne('')]

csv_file_name = f"nba_data_{start_year}_to_{end_year}.csv"
master_df.to_csv(csv_file_name, index=False)
print(f"CSV file generated: {csv_file_name}")


Enter 'all' for all teams or specify a team (e.g., 'toronto-raptors'):  all
Enter your start year:  2023
Enter your end year:  2023


NameError: name 'nba_teams' is not defined

In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Define the NBA teams
nba_teams = [
    'atlanta-hawks', 'boston-celtics', 'brooklyn-nets', 'charlotte-hornets',
    'chicago-bulls', 'cleveland-cavaliers', 'dallas-mavericks', 'denver-nuggets',
    'detroit-pistons', 'golden-state-warriors', 'houston-rockets', 'indiana-pacers',
    'los-angeles-clippers', 'los-angeles-lakers', 'memphis-grizzlies', 'miami-heat',
    'milwaukee-bucks', 'minnesota-timberwolves', 'new-orleans-pelicans', 'new-york-knicks',
    'oklahoma-city-thunder', 'orlando-magic', 'philadelphia-76ers', 'phoenix-suns',
    'portland-trail-blazers', 'sacramento-kings', 'san-antonio-spurs', 'toronto-raptors',
    'utah-jazz', 'washington-wizards'
]

def process_game_row(tr, team):
    date = tr.find('td', class_='text-center').get_text(strip=True)
    opponent_td = tr.find_all('td', class_='text-center')[1]
    opponent_raw = opponent_td.get_text(strip=True)
    score = tr.find_all('td', class_='text-center')[2].get_text(strip=True)
    is_playoff = opponent_td.find('span', class_='fa fa-bolt') is not None
    type = "Playoff" if is_playoff else "Regular"
    win_loss = "Win" if 'W' in score else "Loss"
    team1 = team.replace('-', ' ')
    opponent = opponent_raw.replace('@ ', '').replace('vs ', '').replace('-', ' ')
    
    # Check for OT by looking for a <td> with specific classes and text starting with "OT"
    ot_indicator = tr.find('td', class_='expanding text-center team-lost  pl-1 pr-1')
    OT = 1 if ot_indicator and ot_indicator.get_text(strip=True).startswith("OT") else 0
    
    return {'Date': date, 'Score': score, 'Win': win_loss, 'Type': type, 'Team1': team1, 'Team2': opponent, 'OT': OT}

def scrape_data_for_year(team, year):
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = f"https://champsorchumps.us/team/nba/{team}/{year}"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        target_div = soup.find('div', class_='col-md-12 col-lg-8')
        if target_div:
            game_rows = target_div.find_all('tr', id=lambda x: x and x.startswith('game_'))
            return [process_game_row(tr, team) for tr in game_rows if tr.find('td', class_='text-center')]
    else:
        print(f"Failed to retrieve the web page for year {year}. Status code: {response.status_code}")
        return []

def scrape_data_for_team(team, start_year, end_year):
    all_games_data = []
    for year in range(start_year, end_year + 1):
        yearly_data = scrape_data_for_year(team, year)
        all_games_data.extend(yearly_data)
        time.sleep(1)
    return pd.DataFrame(all_games_data)

# Main script to handle user inputs and aggregate data
team_selection = input("Enter 'all' for all teams or specify a team (e.g., 'toronto-raptors'): ").strip().lower().replace(' ', '-')
start_year = int(input("Enter your start year: "))
end_year = int(input("Enter your end year: "))

master_df = pd.DataFrame()

if team_selection == 'all':
    for team in nba_teams:
        print(f"Scraping data for {team}...")
        df = scrape_data_for_team(team, start_year, end_year)
        master_df = pd.concat([master_df, df], ignore_index=True)
else:
    master_df = scrape_data_for_team(team_selection, start_year, end_year)

# Data post-processing
master_df['Win'] = master_df['Win'].apply(lambda x: 1 if x == 'Win' else 0)
master_df['Type'] = master_df['Type'].apply(lambda x: 1 if x == 'Playoff' else 0)

print(master_df.head())

# Export to CSV
csv_file_name = f"nba_data_{start_year}_to_{end_year}.csv"
master_df.to_csv(csv_file_name, index=False)
print(f"CSV file generated: {csv_file_name}")


Enter 'all' for all teams or specify a team (e.g., 'toronto-raptors'):  all
Enter your start year:  2023
Enter your end year:  2023


Scraping data for atlanta-hawks...
Scraping data for boston-celtics...
Scraping data for brooklyn-nets...
Scraping data for charlotte-hornets...
Scraping data for chicago-bulls...
Scraping data for cleveland-cavaliers...
Scraping data for dallas-mavericks...
Scraping data for denver-nuggets...
Scraping data for detroit-pistons...
Scraping data for golden-state-warriors...
Scraping data for houston-rockets...
Scraping data for indiana-pacers...
Scraping data for los-angeles-clippers...
Scraping data for los-angeles-lakers...
Scraping data for memphis-grizzlies...
Scraping data for miami-heat...
Scraping data for milwaukee-bucks...
Scraping data for minnesota-timberwolves...
Scraping data for new-orleans-pelicans...
Scraping data for new-york-knicks...
Scraping data for oklahoma-city-thunder...
Scraping data for orlando-magic...
Scraping data for philadelphia-76ers...
Scraping data for phoenix-suns...
Scraping data for portland-trail-blazers...
Scraping data for sacramento-kings...
Scrap

In [None]:
# GET OVERTIME 

In [24]:
import requests
from bs4 import BeautifulSoup

def download_and_parse_html(url):
    # Use a common user-agent to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        return response.text
    else:
        print(f"Error fetching the page: Status code {response.status_code}")
        return None

def find_ot_games(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    ot_games = []  # Initialize an empty list to hold games that went into overtime

    # Find all rows in the schedule table
    for game_row in soup.find_all('tr'):
        date_cell = game_row.find('td', class_='text-center')  # Find the cell with the game date
        expanding_td = game_row.find('td', class_='expanding')  # Look for the expanding <td> which might indicate more details
        
        # Check if this row represents an OT game by looking for "OT" in the expanding <td>
        if expanding_td and 'OT' in expanding_td.text:
            if date_cell:
                date_text = date_cell.text.strip()  # Extract the game date
                ot_games.append(date_text)  # Add the date of the OT game to the list

    return ot_games

# URL of the Los Angeles Clippers' 2023 schedule
url = "https://champsorchumps.us/team/nba/los-angeles-clippers/2023"

# Download and parse the HTML content
html_content = download_and_parse_html(url)

if html_content:
    # Find and list all OT games with their dates
    ot_games = find_ot_games(html_content)
    if ot_games:
        print("OT Games Found with Dates:")
        for game in ot_games:
            print(game)
    else:
        print("No OT games found.")
else:
    print("Failed to download or parse the page.")


OT Games Found with Dates:
Wed, Dec  7, 2022
Mon, Dec 26, 2022
Sat, Feb  4, 2023
Fri, Feb 24, 2023
Sun, Feb 26, 2023


In [36]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Define the NBA teams
nba_teams = [
    'atlanta-hawks', 'boston-celtics', 'brooklyn-nets', 'charlotte-hornets',
    'chicago-bulls', 'cleveland-cavaliers', 'dallas-mavericks', 'denver-nuggets',
    'detroit-pistons', 'golden-state-warriors', 'houston-rockets', 'indiana-pacers',
    'los-angeles-clippers', 'los-angeles-lakers', 'memphis-grizzlies', 'miami-heat',
    'milwaukee-bucks', 'minnesota-timberwolves', 'new-orleans-pelicans', 'new-york-knicks',
    'oklahoma-city-thunder', 'orlando-magic', 'philadelphia-76ers', 'phoenix-suns',
    'portland-trail-blazers', 'sacramento-kings', 'san-antonio-spurs', 'toronto-raptors',
    'utah-jazz', 'washington-wizards'
]

def process_game_row(tr, team):
    date = tr.find('td', class_='text-center').get_text(strip=True)
    opponent_td = tr.find_all('td', class_='text-center')[1]
    opponent_raw = opponent_td.get_text(strip=True)
    score = tr.find_all('td', class_='text-center')[2].get_text(strip=True)
    is_playoff = opponent_td.find('span', class_='fa fa-bolt') is not None
    type = "Playoff" if is_playoff else "Regular"
    win_loss = "Win" if 'W' in score else "Loss"
    team1 = team.replace('-', ' ')
    opponent = opponent_raw.replace('@ ', '').replace('vs ', '').replace('-', ' ')
    
    # Implement OT detection based on the provided method
    ot_game = 0  # Default to no OT
    expanding_td = tr.find('td', class_='expanding')
    if expanding_td and 'OT' in expanding_td.text:
        ot_game = 1  # Mark as OT game

    return {
        'Date': date, 'Score': score, 'Win': win_loss, 'Type': type,
        'Team1': team1, 'Team2': opponent, 'OT': ot_game
    }

def scrape_data_for_year(team, year):
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = f"https://champsorchumps.us/team/nba/{team}/{year}"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        target_div = soup.find('div', class_='col-md-12 col-lg-8')
        if target_div:
            game_rows = target_div.find_all('tr', id=lambda x: x and x.startswith('game_'))
            return [process_game_row(tr, team) for tr in game_rows if tr.find('td', class_='text-center')]
    else:
        print(f"Failed to retrieve the web page for year {year}. Status code: {response.status_code}")
        return []

def scrape_data_for_team(team, start_year, end_year):
    all_games_data = []
    for year in range(start_year, end_year + 1):
        print(f"Scraping data for {team}, year: {year}")
        yearly_data = scrape_data_for_year(team, year)
        all_games_data.extend(yearly_data)
        time.sleep(1)  # Respectful delay to avoid server overload
    return pd.DataFrame(all_games_data)

team_selection = input("Enter 'all' for all teams or specify a team (e.g., 'los-angeles-clippers'): ").strip().lower().replace(' ', '-')
start_year = int(input("Enter your start year: "))
end_year = int(input("Enter your end year: "))

master_df = pd.DataFrame()

if team_selection == 'all':
    for team in nba_teams:
        df = scrape_data_for_team(team, start_year, end_year)
        master_df = pd.concat([master_df, df], ignore_index=True)
else:
    if team_selection in nba_teams:
        master_df = scrape_data_for_team(team_selection, start_year, end_year)
    else:
        print("Team not found. Please ensure you've entered the team name correctly.")

# Data post-processing
master_df['Win'] = master_df['Win'].apply(lambda x: 1 if x == 'Win' else 0)
master_df['Type'] = master_df['Type'].apply(lambda x: 1 if x == 'Playoff' else 0)

# Export to CSV
csv_file_name = f"nba_data_{start_year}_to_{end_year}.csv"
master_df.to_csv(csv_file_name, index=False)
print(f"CSV file generated: {csv_file_name}")


Enter 'all' for all teams or specify a team (e.g., 'los-angeles-clippers'):  toronto-raptors
Enter your start year:  2024
Enter your end year:  2024


Scraping data for toronto-raptors, year: 2024
CSV file generated: nba_data_2024_to_2024.csv


# WORK! Use this for scraping 

In [66]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Define the NBA teams
nba_teams = [
    'atlanta-hawks', 'boston-celtics', 'brooklyn-nets', 'charlotte-hornets',
    'chicago-bulls', 'cleveland-cavaliers', 'dallas-mavericks', 'denver-nuggets',
    'detroit-pistons', 'golden-state-warriors', 'houston-rockets', 'indiana-pacers',
    'los-angeles-clippers', 'los-angeles-lakers', 'memphis-grizzlies', 'miami-heat',
    'milwaukee-bucks', 'minnesota-timberwolves', 'new-orleans-pelicans', 'new-york-knicks',
    'oklahoma-city-thunder', 'orlando-magic', 'philadelphia-76ers', 'phoenix-suns',
    'portland-trail-blazers', 'sacramento-kings', 'san-antonio-spurs', 'toronto-raptors',
    'utah-jazz', 'washington-wizards'
]

def process_game_row(tr, team):
    date = tr.find('td', class_='text-center').get_text(strip=True)
    opponent_td = tr.find_all('td', class_='text-center')[1]
    opponent_raw = opponent_td.get_text(strip=True)
    score = tr.find_all('td', class_='text-center')[2].get_text(strip=True)
    is_playoff = opponent_td.find('span', class_='fa fa-bolt') is not None
    type = "Playoff" if is_playoff else "Regular"
    win_loss = "Win" if 'W' in score else "Loss"
    team1 = team.replace('-', ' ')
    opponent = opponent_raw.replace('@ ', '').replace('vs ', '').replace('-', ' ')
    
    ot_game = 0  # Default to no OT
    expanding_td = tr.find('td', class_='expanding')
    if expanding_td and 'OT' in expanding_td.text:
        ot_game = 1  # Mark as OT game

    return {
        'Date': date, 'Score': score, 'Win': win_loss, 'Type': type,
        'Team1': team1, 'Team2': opponent, 'OT': ot_game
    }

def scrape_data_for_year(team, year):
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = f"https://champsorchumps.us/team/nba/{team}/{year}"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        target_div = soup.find('div', class_='col-md-12 col-lg-8')
        if target_div:
            game_rows = target_div.find_all('tr', id=lambda x: x and x.startswith('game_'))
            print(f"Scraping data for {team}. Year {year}.")
            return [process_game_row(tr, team) for tr in game_rows if tr.find('td', class_='text-center')]
    else:
        print(f"Failed to retrieve the web page for year {year}. Status code: {response.status_code}")
        return []

def scrape_data_for_team(team, start_year, end_year):
    all_games_data = []
    for year in range(start_year, end_year + 1):
        yearly_data = scrape_data_for_year(team, year)
        all_games_data.extend(yearly_data)
        time.sleep(1)  # Delay
    return pd.DataFrame(all_games_data)

team_selection = input("Enter 'all' for all teams or specify a team (e.g., 'los-angeles-clippers'): ").strip().lower().replace(' ', '-')
start_year = int(input("Enter your start year: "))
end_year = int(input("Enter your end year: "))

master_df = pd.DataFrame()

if team_selection == 'all':
    for team in nba_teams:
        df = scrape_data_for_team(team, start_year, end_year)
        master_df = pd.concat([master_df, df], ignore_index=True)
else:
    if team_selection in nba_teams:
        master_df = scrape_data_for_team(team_selection, start_year, end_year)
    else:
        print("Team not found. Please ensure you've entered the team name correctly.")

# Remove rows with an empty 'Score' column
master_df = master_df[master_df['Score'].str.strip() != '']

# Data post-processing
master_df['Win'] = master_df['Win'].apply(lambda x: 1 if x == 'Win' else 0)
master_df['Type'] = master_df['Type'].apply(lambda x: 1 if x == 'Playoff' else 0)

# Export to CSV
csv_file_name = f"nba_data_{start_year}_to_{end_year}.csv"
master_df.to_csv(csv_file_name, index=False)
print(f"CSV file generated: {csv_file_name}")

Enter 'all' for all teams or specify a team (e.g., 'los-angeles-clippers'):  all
Enter your start year:  2023
Enter your end year:  2024


Scraping data for atlanta-hawks. Year 2023.
Scraping data for atlanta-hawks. Year 2024.
Scraping data for boston-celtics. Year 2023.
Scraping data for boston-celtics. Year 2024.
Scraping data for brooklyn-nets. Year 2023.
Scraping data for brooklyn-nets. Year 2024.
Scraping data for charlotte-hornets. Year 2023.
Scraping data for charlotte-hornets. Year 2024.
Scraping data for chicago-bulls. Year 2023.
Scraping data for chicago-bulls. Year 2024.
Scraping data for cleveland-cavaliers. Year 2023.
Scraping data for cleveland-cavaliers. Year 2024.
Scraping data for dallas-mavericks. Year 2023.
Scraping data for dallas-mavericks. Year 2024.
Scraping data for denver-nuggets. Year 2023.
Scraping data for denver-nuggets. Year 2024.
Scraping data for detroit-pistons. Year 2023.
Scraping data for detroit-pistons. Year 2024.
Scraping data for golden-state-warriors. Year 2023.
Scraping data for golden-state-warriors. Year 2024.
Scraping data for houston-rockets. Year 2023.
Scraping data for housto

In [None]:
# TEST ADDITIONAL FEATURES  DO NOT USE!!

In [6]:
import pandas as pd

# Sample DataFrame creation
data = {
    'Date': [1414454400, 1414454400, 1414454400, 1414454400, 1414454400],
    'Win': [1, 0, 0, 1, 1],
    'Type': [1, 1, 1, 0, 0],
    'Team1': ['SanAntonioSpurs', 'DallasMavericks', 'OrlandoMagic', 'NewOrleansPelicans', 'HoustonRockets'],
    'Team2': ['DallasMavericks', 'SanAntonioSpurs', 'NewOrleansPelicans', 'OrlandoMagic', 'LosAngelesLakers'],
    'Team1_Score': [101.0, 100.0, 84.0, 101.0, 108.0],
    'Team2_Score': [100.0, 101.0, 101.0, 84.0, 90.0],
    'Avg_Score': [108.470443, 108.038272, 104.425997, 109.980964, 110.674224],
    'Win_Ratio': [0.529557, 0.492593, 0.377091, 0.463198, 0.516706]
}

df = pd.DataFrame(data)

# Convert the date from UNIX timestamp to datetime for sorting
df['Date'] = pd.to_datetime(df['Date'], unit='s')

# Create a unique identifier for each game ignoring who is home/away
df['Matchup'] = df.apply(lambda x: '_'.join(sorted([x['Team1'], x['Team2']])), axis=1)

# Create a dictionary to store head-to-head wins
h2h_wins = {}

# Iterate through each game to populate the dictionary
for i, row in df.iterrows():
    matchup = row['Matchup']
    winner = row['Team1'] if row['Win'] == 1 else row['Team2']
    if matchup not in h2h_wins:
        h2h_wins[matchup] = {'Team1': 0, 'Team2': 0}
    # Increment the win count for the winning team in this matchup
    if winner == row['Team1']:
        h2h_wins[matchup]['Team1'] += 1
    else:
        h2h_wins[matchup]['Team2'] += 1

# Apply the accumulated head-to-head stats to the DataFrame
for i, row in df.iterrows():
    matchup = row['Matchup']
    df.at[i, 'Team1_H2H_Wins'] = h2h_wins[matchup]['Team1']
    df.at[i, 'Team2_H2H_Wins'] = h2h_wins[matchup]['Team2']
    df.at[i, 'H2H_Matchups'] = h2h_wins[matchup]['Team1'] + h2h_wins[matchup]['Team2']
    if df.at[i, 'H2H_Matchups'] > 0:
        # Calculate the win ratio based on which team is listed as Team1 in the current row
        if row['Team1'] in h2h_wins[matchup]:
            df.at[i, 'H2H_Win_Ratio'] = h2h_wins[matchup]['Team1'] / df.at[i, 'H2H_Matchups']
        else:
            df.at[i, 'H2H_Win_Ratio'] = h2h_wins[matchup]['Team2'] / df.at[i, 'H2H_Matchups']

# Now display the adjusted DataFrame head to verify the calculation
print(df.head())

# Ask user to save the file
save_file = input("Do you want to save the modified DataFrame to a CSV file? (yes/no): ").strip().lower()
if save_file == 'yes':
    df.to_csv('nba_games_with_corrected_h2h_features.csv', index=False)
    print("DataFrame saved as 'nba_games_with_corrected_h2h_features.csv'.")
else:
    print("DataFrame not saved.")














        Date  Win  Type               Team1               Team2  Team1_Score  \
0 2014-10-28    1     1     SanAntonioSpurs     DallasMavericks        101.0   
1 2014-10-28    0     1     DallasMavericks     SanAntonioSpurs        100.0   
2 2014-10-28    0     1        OrlandoMagic  NewOrleansPelicans         84.0   
3 2014-10-28    1     0  NewOrleansPelicans        OrlandoMagic        101.0   
4 2014-10-28    1     0      HoustonRockets    LosAngelesLakers        108.0   

   Team2_Score   Avg_Score  Win_Ratio                          Matchup  \
0        100.0  108.470443   0.529557  DallasMavericks_SanAntonioSpurs   
1        101.0  108.038272   0.492593  DallasMavericks_SanAntonioSpurs   
2        101.0  104.425997   0.377091  NewOrleansPelicans_OrlandoMagic   
3         84.0  109.980964   0.463198  NewOrleansPelicans_OrlandoMagic   
4         90.0  110.674224   0.516706  HoustonRockets_LosAngelesLakers   

   Team1_H2H_Wins  Team2_H2H_Wins  H2H_Matchups  H2H_Win_Ratio  
0        

Do you want to save the modified DataFrame to a CSV file? (yes/no):  YES


DataFrame saved as 'nba_games_with_corrected_h2h_features.csv'.


In [42]:
import requests
from bs4 import BeautifulSoup
import csv
import os
import tempfile

def download_html(url):
    """Download HTML content of a page and save it to a temporary file."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        temp_file = tempfile.NamedTemporaryFile(delete=False, mode='w+', encoding='utf-8', suffix='.html')
        temp_file.write(response.text)
        temp_file.close()
        return temp_file.name
    else:
        print("Failed to retrieve the page.")
        return None

def parse_html_and_save_to_csv(html_file_path, csv_filename="nba_scores.csv"):
    with open(html_file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    team1 = "Atlanta Hawks"
    csv_data = [["Date", "Team1", "Q1_Team1", "Q2_Team1", "Q3_Team1", "Q4_Team1"]]

    game_rows = soup.find_all('tr', id=lambda x: x and x.startswith('game_'))

    for tr in game_rows:
        date = tr.find('td', class_='text-center').get_text(strip=True)

        # Initialize quarter scores with placeholders
        q_scores = [""] * 4

        # Locate the corresponding detailed score row by ID reference
        detailed_score_id = "score_" + tr['id'].split('_')[2]
        detailed_score_row = soup.find('tr', id=detailed_score_id)
        
        if detailed_score_row:
            detailed_scores = detailed_score_row.find('table')
            if detailed_scores:
                score_rows = detailed_scores.find_all('tr')[1:3]  # Skip header row
                
                for index, row in enumerate(score_rows):
                    tds = row.find_all('td')[1:5]  # Focus on quarter scores
                    for i, td in enumerate(tds):
                        if index == 0:  # Assuming the first row is for team1
                            q_scores[i] = td.get_text(strip=True)

        csv_data.append([date, team1] + q_scores)

    with open(csv_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(csv_data)
    print(f"Data saved to {csv_filename}.")

if __name__ == "__main__":
    url = 'https://champsorchumps.us/team/nba/atlanta-hawks/2023'
    html_file_path = download_html(url)

    if html_file_path:
        parse_html_and_save_to_csv(html_file_path)

        os.remove(html_file_path)
        print(f"Temporary HTML file deleted: {html_file_path}")


Data saved to nba_scores.csv.
Temporary HTML file deleted: C:\Users\Tommy\AppData\Local\Temp\tmphvt_hoa4.html


In [22]:
import requests
from bs4 import BeautifulSoup
import csv



# URL of the page you want to scrape
url = 'https://champsorchumps.us/team/nba/atlanta-hawks/2023'

# Headers to mimic a real user visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Use requests to retrieve data from a given URL
response = requests.get(url, headers=headers)

# Parse the content of the request with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all 'table' tags with a specific class
tables = soup.find_all('table', class_='table table-sm pb-0 mb-0 text-center lines-table')

# Initialize a list to hold processed data for CSV
csv_data = [["Q1_Team1", "Q1_Team2", "Q2_Team1", "Q2_Team2", "Q3_Team1", "Q3_Team2", "Q4_Team1", "Q4_Team2"]]

# Process each table
for table in tables:
    rows = table.find_all('tr')[1:3]  # Focus on the two team rows
    if len(rows) != 2:
        continue  # Skip tables that don't have exactly two team rows
    scores = [[td.text.strip() for td in row.find_all('td')[1:5]] for row in rows]  # Extract scores for quarters 1-4
    scores_flattened = [item for sublist in zip(*scores) for item in sublist]  # Flatten and alternate scores
    csv_data.append(scores_flattened)

    # For demonstration, print the first two tables in the console
    if len(csv_data) <= 3:  # Includes header row, so limit to 3 for two tables
        print("Table:")
        print(", ".join(csv_data[0]))  # Print header
        print(", ".join(scores_flattened))
        print("\n---\n")

# Function to save data to CSV
def save_to_csv(data, filename):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(data)

# Ask user if they want to save the tables to a CSV file
user_input = input("Do you want to save the tables to a CSV file? Enter 'yes' or 'no': ").strip().lower()
if user_input == 'yes':
    save_to_csv(csv_data, "nba_scores.csv")
    print("Tables saved to nba_scores.csv.")
else:
    print("Tables were not saved.")


Table:
Q1_Team1, Q1_Team2, Q2_Team1, Q2_Team2, Q3_Team1, Q3_Team2, Q4_Team1, Q4_Team2
20, 26, 30, 33, 30, 25, 27, 33

---

Table:
Q1_Team1, Q1_Team2, Q2_Team1, Q2_Team2, Q3_Team1, Q3_Team2, Q4_Team1, Q4_Team2
27, 19, 27, 31, 27, 30, 17, 28

---



Do you want to save the tables to a CSV file? Enter 'yes' or 'no':  yes


Tables saved to nba_scores.csv.


In [None]:
# Should work

In [64]:
import requests
from bs4 import BeautifulSoup
import csv

# List of NBA teams for user to choose from
nba_teams = [
    'atlanta-hawks', 'boston-celtics', 'brooklyn-nets', 'charlotte-hornets',
    'chicago-bulls', 'cleveland-cavaliers', 'dallas-mavericks', 'denver-nuggets',
    'detroit-pistons', 'golden-state-warriors', 'houston-rockets', 'indiana-pacers',
    'los-angeles-clippers', 'los-angeles-lakers', 'memphis-grizzlies', 'miami-heat',
    'milwaukee-bucks', 'minnesota-timberwolves', 'new-orleans-pelicans', 'new-york-knicks',
    'oklahoma-city-thunder', 'orlando-magic', 'philadelphia-76ers', 'phoenix-suns',
    'portland-trail-blazers', 'sacramento-kings', 'san-antonio-spurs', 'toronto-raptors',
    'utah-jazz', 'washington-wizards'
]

# Ask user for team, start year, and end year
team = input("Enter a team from the NBA list: ").strip().lower().replace(" ", "-")
if team not in nba_teams:
    print("Team not found in the list. Please enter a valid NBA team.")
    exit()
start_year = int(input("Enter start year: "))
end_year = int(input("Enter end year: "))

# Headers to mimic a real user visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Initialize a list to hold processed data for CSV
csv_data = [["Year", "Q1_Team1", "Q1_Team2", "Q2_Team1", "Q2_Team2", "Q3_Team1", "Q3_Team2", "Q4_Team1", "Q4_Team2"]]

# Loop through each year and scrape data
for year in range(start_year, end_year + 1):
    url = f"https://champsorchumps.us/team/nba/{team}/{year}"
    
    # Use requests to retrieve data from a given URL
    response = requests.get(url, headers=headers)
    
    # Parse the content of the request with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all 'table' tags with a specific class
    tables = soup.find_all('table', class_='table table-sm pb-0 mb-0 text-center lines-table')
    
    # Process each table
    for table in tables:
        rows = table.find_all('tr')[1:3]  # Focus on the two team rows
        if len(rows) != 2:
            continue  # Skip tables that don't have exactly two team rows
        scores = [[td.text.strip() for td in row.find_all('td')[1:5]] for row in rows]  # Extract scores for quarters 1-4
        scores_flattened = [year] + [item for sublist in zip(*scores) for item in sublist]  # Flatten and alternate scores, prepend year
        csv_data.append(scores_flattened)

# Function to save data to CSV
def save_to_csv(data, filename):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(data)

# Show table head (first few rows for preview)
print("Preview of data to be saved:")
for row in csv_data[:3]:  # Show header and first two rows of actual data
    print(", ".join(map(str, row)))

# Ask user if they want to save the tables to a CSV file
user_input = input("Do you want to save the tables to a CSV file? Enter 'yes' or 'no': ").strip().lower()
if user_input == 'yes':
    filename = f"{team}_nba_scores_{start_year}_to_{end_year}.csv"
    save_to_csv(csv_data, filename)
    print(f"Tables saved to {filename}.")
else:
    print("Tables were not saved.")


Enter a team from the NBA list:  atlanta-hawks
Enter start year:  2024
Enter end year:  2024


Preview of data to be saved:
Year, Q1_Team1, Q1_Team2, Q2_Team1, Q2_Team2, Q3_Team1, Q3_Team2, Q4_Team1, Q4_Team2
2024, 29, 25, 23, 26, 21, 27, 37, 38
2024, 35, 31, 34, 30, 31, 31, 26, 28


Do you want to save the tables to a CSV file? Enter 'yes' or 'no':  yes


Tables saved to atlanta-hawks_nba_scores_2024_to_2024.csv.


In [None]:
# 4 feb work, get Q scores

In [43]:
from bs4 import BeautifulSoup
import csv
import os
import tempfile
import requests

def download_and_parse_html(url, csv_filename="nba_scores.csv"):
    # Download HTML content
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print("Failed to retrieve the page.")
        return

    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    team1 = "Atlanta Hawks"
    csv_data = [["Date", "Team1", "Q1_Team1", "Q2_Team1", "Q3_Team1", "Q4_Team1"]]

    # Find all game rows and their corresponding detailed score rows
    game_rows = soup.find_all('tr', id=lambda x: x and x.startswith('game_'))

    for tr in game_rows:
        date = tr.find('td', class_='text-center').get_text(strip=True)

        # Locate the corresponding detailed score row by ID reference
        detailed_score_id = "score_" + tr['id'].split('_')[2]
        detailed_score_row = soup.find('tr', id=detailed_score_id, style="display: none;")

        q_scores = [""] * 4
        if detailed_score_row:
            score_table = detailed_score_row.find('table')
            if score_table:
                hawks_row = score_table.find_all('tr', class_='font-weight-bold')
                if hawks_row:
                    q_scores = [td.get_text(strip=True) for td in hawks_row[0].find_all('td')[1:5]]

        csv_data.append([date, team1] + q_scores)

    # Save the extracted data to a CSV file
    with open(csv_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(csv_data)
    print(f"Data saved to {csv_filename}.")

if __name__ == "__main__":
    url = 'https://champsorchumps.us/team/nba/atlanta-hawks/2023'
    download_and_parse_html(url)


Data saved to nba_scores.csv.


In [9]:
import pandas as pd

# Assuming you have file paths for CSV 1 and CSV 2
csv1_file_path = 'path_to_your_first_csv_file.csv'
csv2_file_path = 'path_to_your_second_csv_file.csv'

# Load CSVs into DataFrames
df_csv1 = pd.read_csv(csv1_file_path)
df_csv2 = pd.read_csv(csv2_file_path)

# Merge the DataFrames by adding the columns from CSV 2 to CSV 1
df_merged = pd.concat([df_csv1, df_csv2], axis=1)

# If there are any missing values in the merged DataFrame, fill them with 0
df_merged.fillna(0, inplace=True)

# Now, save the merged DataFrame to a new CSV file without altering the original data
output_file_path = 'merged_csv_output.csv'
df_merged.to_csv(output_file_path, index=False)

print(f"Merged CSV saved as '{output_file_path}'")

FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_first_csv_file.csv'

In [89]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Define the NBA teams
nba_teams = [
    'atlanta-hawks', 'boston-celtics', 'brooklyn-nets', 'charlotte-hornets',
    'chicago-bulls', 'cleveland-cavaliers', 'dallas-mavericks', 'denver-nuggets',
    'detroit-pistons', 'golden-state-warriors', 'houston-rockets', 'indiana-pacers',
    'los-angeles-clippers', 'los-angeles-lakers', 'memphis-grizzlies', 'miami-heat',
    'milwaukee-bucks', 'minnesota-timberwolves', 'new-orleans-pelicans', 'new-york-knicks',
    'oklahoma-city-thunder', 'orlando-magic', 'philadelphia-76ers', 'phoenix-suns',
    'portland-trail-blazers', 'sacramento-kings', 'san-antonio-spurs', 'toronto-raptors',
    'utah-jazz', 'washington-wizards'
]

def process_game_row(tr, team):
    date = tr.find('td', class_='text-center').get_text(strip=True)
    opponent_td = tr.find_all('td', class_='text-center')[1]
    opponent_raw = opponent_td.get_text(strip=True)
    score = tr.find_all('td', class_='text-center')[2].get_text(strip=True)
    is_playoff = opponent_td.find('span', class_='fa fa-bolt') is not None
    type = "1" if is_playoff else "0"
    win_loss = "Win" if 'W' in score else "Loss"
    team1 = team.replace('-', ' ')
    opponent = opponent_raw.replace('@ ', '').replace('vs ', '').replace('-', ' ')
    
    ot_game = 0  # Default to no OT
    expanding_td = tr.find('td', class_='expanding')
    if expanding_td and 'OT' in expanding_td.text:
        ot_game = 1  # Mark as OT game

    return {
        'Date': date, 'Score': score, 'Win': win_loss, 'Type': type,
        'Team1': team1, 'Team2': opponent, 'OT': ot_game
    }

def scrape_data_for_year(team, year):
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = f"https://champsorchumps.us/team/nba/{team}/{year}"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        target_div = soup.find('div', class_='col-md-12 col-lg-8')
        if target_div:
            game_rows = target_div.find_all('tr', id=lambda x: x and x.startswith('game_'))
            print(f"Scraping data for {team}. Year {year}.")
            return [process_game_row(tr, team) for tr in game_rows if tr.find('td', class_='text-center')]
    else:
        print(f"Failed to retrieve the web page for year {year}. Status code: {response.status_code}")
        return []

def scrape_data_for_team(team, start_year, end_year):
    all_games_data = []
    for year in range(start_year, end_year + 1):
        url = f"https://champsorchumps.us/team/nba/{team}/{year}"
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            game_rows = soup.find_all('tr', id=lambda x: x and x.startswith('game_'))





            
            for tr in game_rows:
                # Extract actual game details
                date = tr.find('td', class_='text-center').get_text(strip=True)  # Adjust class as per actual HTML
                opponent = tr.find('td', class_='text-center').get_text(strip=True)  # Adjust class
                score = tr.find('td', class_='text-center').get_text(strip=True)  # Adjust class
                is_playoff = "Playoff" if tr.find('span', class_='fa fa-bolt') else "Regular"
                win_loss = "Win" if 'W' in score else "Loss"
                ot_game = 1 if "OT" in score else 0
                
                # Extract quarter scores
                quarter_scores = tr.find_all('td', class_='quarter-score-class')  # Adjust class
                q_scores = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]  # Default quarter scores
                for i, q_score in enumerate(quarter_scores[:8]):
                    q_scores[i] = float(q_score.get_text(strip=True))
                
                game_data = [date, score, win_loss, is_playoff, team.replace('-', ' ').title(), opponent, ot_game] + q_scores
                all_games_data.append(game_data)
                
            time.sleep(1)  # Respectful delay
        else:
            print(f"Failed to retrieve data for {team} in {year}. Status code: {response.status_code}")
    
    columns = ["Date", "Score", "Win", "Type", "Team1", "Team2", "OT", "Q1_Team1", "Q1_Team2", "Q2_Team1", "Q2_Team2", "Q3_Team1", "Q3_Team2", "Q4_Team1", "Q4_Team2"]
    return pd.DataFrame(all_games_data, columns=columns)


team_selection = input("Enter 'all' for all teams or specify a team (e.g., 'los-angeles-clippers'): ").strip().lower().replace(' ', '-')
start_year = int(input("Enter your start year: "))
end_year = int(input("Enter your end year: "))

master_df = pd.DataFrame()

if team_selection == 'all':
    for team in nba_teams:
        df = scrape_data_for_team(team, start_year, end_year)
        master_df = pd.concat([master_df, df], ignore_index=True)
else:
    if team_selection in nba_teams:
        master_df = scrape_data_for_team(team_selection, start_year, end_year)
    else:
        print("Team not found. Please ensure you've entered the team name correctly.")

# Remove rows with an empty 'Score' column
master_df = master_df[master_df['Score'].str.strip() != '']

# Data post-processing
master_df['Win'] = master_df['Win'].apply(lambda x: 1 if x == 'Win' else 0)
master_df['Type'] = master_df['Type'].apply(lambda x: 1 if x == 'Playoff' else 0)

# Export to CSV
csv_file_name = f"nba_data_{start_year}_to_{end_year}.csv"
master_df.to_csv(csv_file_name, index=False)
print(f"CSV file generated: {csv_file_name}")

Enter 'all' for all teams or specify a team (e.g., 'los-angeles-clippers'):  ALL
Enter your start year:  2023
Enter your end year:  2024


KeyboardInterrupt: 

# Try this

In [2]:
import requests
from bs4 import BeautifulSoup
import csv

# List of NBA teams for user to choose from
nba_teams = [
    'atlanta-hawks', 'boston-celtics', 'brooklyn-nets', 'charlotte-hornets',
    'chicago-bulls', 'cleveland-cavaliers', 'dallas-mavericks', 'denver-nuggets',
    'detroit-pistons', 'golden-state-warriors', 'houston-rockets', 'indiana-pacers',
    'los-angeles-clippers', 'los-angeles-lakers', 'memphis-grizzlies', 'miami-heat',
    'milwaukee-bucks', 'minnesota-timberwolves', 'new-orleans-pelicans', 'new-york-knicks',
    'oklahoma-city-thunder', 'orlando-magic', 'philadelphia-76ers', 'phoenix-suns',
    'portland-trail-blazers', 'sacramento-kings', 'san-antonio-spurs', 'toronto-raptors',
    'utah-jazz', 'washington-wizards'
]

# Ask user for team, start year, and end year
team = input("Enter a team from the NBA list: ").strip().lower().replace(" ", "-")
if team not in nba_teams:
    print("Team not found in the list. Please enter a valid NBA team.")
    exit()
start_year = int(input("Enter start year: "))
end_year = int(input("Enter end year: "))

# Headers to mimic a real user visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Initialize a list to hold processed data for CSV
csv_data = [["Year", "Team1", "Q1_Team1", "Q2_Team1", "Q3_Team1", "Q4_Team1", "Team2", "Q1_Team2", "Q2_Team2", "Q3_Team2", "Q4_Team2"]]

# Loop through each year and scrape data
for year in range(start_year, end_year + 1):
    url = f"https://champsorchumps.us/team/nba/{team}/{year}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', class_='table table-sm pb-0 mb-0 text-center lines-table')
    
    for table in tables:
        rows = table.find_all('tr')[1:3]  # Focus on the two team rows
        if len(rows) != 2:
            continue  # Skip tables that don't have exactly two team rows
        
        # Extract team names and scores
        team_names = [row.find('td', class_='text-left').text.strip() for row in rows]
        scores = [[td.text.strip() for td in row.find_all('td')[1:5]] for row in rows]
        
        # Determine unique team names and assign to Team1 and Team2
        unique_team_names = list(set(team_names))
        if len(unique_team_names) != 2:
            continue  # Skip if we don't have exactly two unique team names
        
        # Flatten and alternate scores, prepend year and team names
        scores_flattened = [year] + [unique_team_names[0]] + scores[0] + [unique_team_names[1]] + scores[1]
        csv_data.append(scores_flattened)

# Function to save data to CSV
def save_to_csv(data, filename):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(data)

# Show table head (first few rows for preview)
print("Preview of data to be saved:")
for row in csv_data[:3]:  # Show header and first two rows of actual data
    print(", ".join(map(str, row)))

# Ask user if they want to save the tables to a CSV file
user_input = input("Do you want to save the tables to a CSV file? Enter 'yes' or 'no': ").strip().lower()
if user_input == 'yes':
    filename = f"{team}_nba_scores_{start_year}_to_{end_year}.csv"
    save_to_csv(csv_data, filename)
    print(f"Tables saved to {filename}.")
else:
    print("Tables were not saved.")


Enter a team from the NBA list:  atlanta-hawks
Enter start year:  2023
Enter end year:  2024


Preview of data to be saved:
Year, Team1, Q1_Team1, Q2_Team1, Q3_Team1, Q4_Team1, Team2, Q1_Team2, Q2_Team2, Q3_Team2, Q4_Team2
2023, Rockets, 20, 30, 30, 27, Hawks, 26, 33, 25, 33
2023, Magic, 27, 27, 27, 17, Hawks, 19, 31, 30, 28


Do you want to save the tables to a CSV file? Enter 'yes' or 'no':  yes


Tables saved to atlanta-hawks_nba_scores_2023_to_2024.csv.


In [1]:
import requests
from bs4 import BeautifulSoup
import csv

# List of NBA teams for user to choose from
nba_teams = [
    'atlanta-hawks', 'boston-celtics', 'brooklyn-nets', 'charlotte-hornets',
    'chicago-bulls', 'cleveland-cavaliers', 'dallas-mavericks', 'denver-nuggets',
    'detroit-pistons', 'golden-state-warriors', 'houston-rockets', 'indiana-pacers',
    'los-angeles-clippers', 'los-angeles-lakers', 'memphis-grizzlies', 'miami-heat',
    'milwaukee-bucks', 'minnesota-timberwolves', 'new-orleans-pelicans', 'new-york-knicks',
    'oklahoma-city-thunder', 'orlando-magic', 'philadelphia-76ers', 'phoenix-suns',
    'portland-trail-blazers', 'sacramento-kings', 'san-antonio-spurs', 'toronto-raptors',
    'utah-jazz', 'washington-wizards'
]

team = input("Enter a team from the NBA list: ").strip().lower().replace(" ", "-")
if team not in nba_teams:
    print("Team not found in the list. Please enter a valid NBA team.")
    exit()
start_year = int(input("Enter start year: "))
end_year = int(input("Enter end year: "))

# Headers for the HTTP request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Initialize CSV data list
csv_data = [["Year", "Team1", "Q1_Team1", "Q2_Team1", "Q3_Team1", "Q4_Team1", "Total_Team1", "Team2", "Q1_Team2", "Q2_Team2", "Q3_Team2", "Q4_Team2", "Total_Team2"]]

# Function to process and append game data correctly
def process_and_append_game_data(year, game_data):
    # Ensure the game data aligns with the expected structure before appending
    if len(game_data) == 13:  # Expected number of columns based on the header
        csv_data.append(game_data)
    else:
        print(f"Skipping inconsistent game data for year {year}: {game_data}")

# Loop through each year to scrape and process data
for year in range(start_year, end_year + 1):
    url = f"https://champsorchumps.us/team/nba/{team}/{year}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', class_='table table-sm pb-0 mb-0 text-center lines-table')
    
    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            # Find the team names and scores within this row
            cells = row.find_all('td')
            if len(cells) > 5:  # Ensuring this row contains score data
                team_name = cells[0].text.strip()
                scores = [cell.text.strip() for cell in cells[1:6]]  # Scores for Q1-Q4 and Total
                if team_name:  # Check if the team name was found
                    # Determine if this is Team1 or Team2 based on presence in csv_data
                    if len(csv_data[-1]) < 7:  # If the last entry is still filling Team1 data
                        game_data = csv_data[-1] + [team_name] + scores
                    else:  # Start a new game entry
                        game_data = [year, team_name] + scores
                    process_and_append_game_data(year, game_data)

# Prompt for saving the data to CSV
def save_to_csv(filename):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(csv_data)
    print(f"Data saved to {filename}.")

user_input = input("Do you want to save the tables to a CSV file? Enter 'yes' or 'no': ").strip().lower()
if user_input == 'yes':
    filename = f"{team}_nba_scores_{start_year}_to_{end_year}.csv"
    save_to_csv(filename)
else:
    print("Data saving cancelled.")

Enter a team from the NBA list:  atlanta-hawks
Enter start year:  2023
Enter end year:  2024


Skipping inconsistent game data for year 2023: [2023, 'Rockets', '20', '30', '30', '27', '107']
Skipping inconsistent game data for year 2023: [2023, 'Hawks', '26', '33', '25', '33', '117']
Skipping inconsistent game data for year 2023: [2023, 'Magic', '27', '27', '27', '17', '98']
Skipping inconsistent game data for year 2023: [2023, 'Hawks', '19', '31', '30', '28', '108']
Skipping inconsistent game data for year 2023: [2023, 'Hornets', '22', '37', '45', '22', '126']
Skipping inconsistent game data for year 2023: [2023, 'Hawks', '30', '25', '29', '25', '109']
Skipping inconsistent game data for year 2023: [2023, 'Hawks', '36', '26', '27', '29', '118']
Skipping inconsistent game data for year 2023: [2023, 'Pistons', '32', '29', '28', '24', '113']
Skipping inconsistent game data for year 2023: [2023, 'Hawks', '35', '34', '37', '30', '136']
Skipping inconsistent game data for year 2023: [2023, 'Pistons', '29', '38', '27', '18', '112']
Skipping inconsistent game data for year 2023: [2023,

Do you want to save the tables to a CSV file? Enter 'yes' or 'no':  yes


Data saved to atlanta-hawks_nba_scores_2023_to_2024.csv.


# Adjust format

# USE THIS TO ADJUST

In [2]:
import pandas as pd

# Dictionary for mapping team names to their IDs
teams_id = {
    'atlantahawks': 1610612737, 'bostonceltics': 1610612738, 'clevelandcavaliers': 1610612739,
    'neworleanspelicans': 1610612740, 'chicagobulls': 1610612741, 'dallasmavericks': 1610612742,
    'denvernuggets': 1610612743, 'goldenstatewarriors': 1610612744, 'houstonrockets': 1610612745,
    'losangelesclippers': 1610612746, 'losangeleslakers': 1610612747, 'miamiheat': 1610612748,
    'milwaukeebucks': 1610612749, 'minnesotatimberwolves': 1610612750, 'brooklynnets': 1610612751,
    'newyorkknicks': 1610612752, 'orlandomagic': 1610612753, 'indianapacers': 1610612754,
    'philadelphia76ers': 1610612755, 'phoenixsuns': 1610612756, 'portlandtrailblazers': 1610612757,
    'sacramentokings': 1610612758, 'sanantoniospurs': 1610612759, 'oklahomacitythunder': 1610612760,
    'torontoraptors': 1610612761, 'utahjazz': 1610612762, 'memphisgrizzlies': 1610612763,
    'washingtonwizards': 1610612764, 'detroitpistons': 1610612765, 'charlottehornets': 1610612766,
}

# Assuming 'nba_data_2015_to_2024.csv' is your CSV file
df = pd.read_csv('nba_data_2023_to_2024.csv')


# Function to normalize team names
def normalize_team_name(name):
    # Remove '@' or 'vs' at the beginning of the string
    name = name.lstrip('@').lstrip('vs').strip()
    # Convert to lower case and remove spaces for matching with dictionary keys
    return name.replace(" ", "").lower()

# Normalize and map 'Team1' and 'Team2' names
#df['Team1'] = df['Team1'].apply(normalize_team_name).map(teams_id)

df['Team2'] = df['Team2'].str.replace(' ', '', regex=True).apply(normalize_team_name).map(teams_id)

# Before mapping 'Team2', remove '@' or 'vs' from the beginning
df['Team2'] = df['Team2'].str.replace('^(?:@|vs)\s*', '', regex=True).apply(normalize_team_name).map(teams_id)
# Correct date conversion, assuming the date is in a recognizable format

df['Date'] = pd.to_datetime(df['Date'])  # Adjust this line based on the actual format of your 'Date' column, if necessary


# Extract score values with varying formats
score_pattern = r'(\d{2,3})\s*[-–]\s*(\d{2,3})'
scores = df['Score'].str.extract(score_pattern)



# Create new columns 'Team1_Score' and 'Team2_Score' and assign extracted values
df['Team1_Score'] = scores.iloc[:, 0].astype(float)
df['Team2_Score'] = scores.iloc[:, 1].astype(float)

# Remove 'Win' or 'Loss' from the "Score" column
df['Score'] = df['Score'].str.extract('(\d{2,3}\s*[-–]\s*\d{2,3})')[0]

# Calculate average score for each team in "Team1"
average_scores = df.groupby('Team1')['Team1_Score'].mean().rename('Avg_Score')





# Calculate win ratio for each team in "Team1"
win_ratios = df.groupby('Team1')['Win'].mean().rename('Avg_Win_Ratio')


# Initialize dictionaries to hold win/loss counts
regular_games = {}
playoff_games = {}

# Iterate through each row to count wins/losses
for index, row in df.iterrows():
    team_id = row['Team1']
    game_type = 'playoff' if row['Type'] == 1 else 'regular'
    win = row['Win'] == 1
    
    # Initialize team record in dictionaries if not already present
    if team_id not in regular_games:
        regular_games[team_id] = {'wins': 0, 'losses': 0}
    if team_id not in playoff_games:
        playoff_games[team_id] = {'wins': 0, 'losses': 0}
    
    # Increment win/loss count based on game type
    if game_type == 'regular':
        if win:
            regular_games[team_id]['wins'] += 1
        else:
            regular_games[team_id]['losses'] += 1
    elif game_type == 'playoff':
        if win:
            playoff_games[team_id]['wins'] += 1
        else:
            playoff_games[team_id]['losses'] += 1

# Calculate win ratios for regular and playoff games
regular_win_ratios = {team_id: record['wins'] / (record['wins'] + record['losses']) for team_id, record in regular_games.items() if (record['wins'] + record['losses']) > 0}
playoff_win_ratios = {team_id: record['wins'] / (record['wins'] + record['losses']) for team_id, record in playoff_games.items() if (record['wins'] + record['losses']) > 0}

# Convert win ratio dictionaries to DataFrames for easy merging
regular_win_ratios_df = pd.DataFrame(list(regular_win_ratios.items()), columns=['Team1', 'Regular_Win_Ratio'])
playoff_win_ratios_df = pd.DataFrame(list(playoff_win_ratios.items()), columns=['Team1', 'Playoff_Win_Ratio'])

# Merge win ratios back to the original DataFrame
df = pd.merge(df, regular_win_ratios_df, on='Team1', how='left')
df = pd.merge(df, playoff_win_ratios_df, on='Team1', how='left')











# Step 1: Calculate the absolute score difference
df['Score_Difference'] = abs(df['Team1_Score'] - df['Team2_Score'])

# Step 2: Normalize the score difference to a similarity score (0 to 1)
# Assuming the maximum score difference observed is a useful normalization factor
max_diff = df['Score_Difference'].max()
df['Similarity_Score'] = 1 - (df['Score_Difference'] / max_diff)

# Note: This is a simple normalization that assumes the max score difference is a good denominator.
# This might need to be adjusted based on your specific criteria for similarity.

# Drop the 'Score_Difference' column if it's no longer needed
df.drop('Score_Difference', axis=1, inplace=True)










# Remove the "Score" column
df.drop('Score', axis=1, inplace=True)

# Join the average score and win ratio back to the main DataFrame
df = df.join(average_scores, on='Team1')
df = df.join(win_ratios, on='Team1')

# Calculate Point Differential
df['Point_Differential'] = df['Team2_Score'] - df['Team1_Score']

# Display the head of the DataFrame to show the new columns
print(df.head())

# Conditional save to CSV based on user input
save_file = input("Do you want to save the updated data to a CSV file? (yes/no): ").strip().lower()
if save_file == 'yes':
    csv_file_name = input("Enter the name for the CSV file (e.g., 'updated_nba_data.csv'): ").strip()
    df.to_csv(csv_file_name, index=False)
    print(f"Updated data saved to '{csv_file_name}'.")


AttributeError: Can only use .str accessor with string values!

In [98]:
import pandas as pd

# Assuming 'nba_data_2015_to_2024.csv' is your CSV file
df = pd.read_csv('nba_data_2023_to_2024.csv')

# Dictionary for mapping team names to their IDs
teams_id = {
    'atlantahawks': 1610612737, 'bostonceltics': 1610612738, 'clevelandcavaliers': 1610612739,
    'neworleanspelicans': 1610612740, 'chicagobulls': 1610612741, 'dallasmavericks': 1610612742,
    'denvernuggets': 1610612743, 'goldenstatewarriors': 1610612744, 'houstonrockets': 1610612745,
    'losangelesclippers': 1610612746, 'losangeleslakers': 1610612747, 'miamiheat': 1610612748,
    'milwaukeebucks': 1610612749, 'minnesotatimberwolves': 1610612750, 'brooklynnets': 1610612751,
    'newyorkknicks': 1610612752, 'orlandomagic': 1610612753, 'indianapacers': 1610612754,
    'philadelphia76ers': 1610612755, 'phoenixsuns': 1610612756, 'portlandtrailblazers': 1610612757,
    'sacramentokings': 1610612758, 'sanantoniospurs': 1610612759, 'oklahomacitythunder': 1610612760,
    'torontoraptors': 1610612761, 'utahjazz': 1610612762, 'memphisgrizzlies': 1610612763,
    'washingtonwizards': 1610612764, 'detroitpistons': 1610612765, 'charlottehornets': 1610612766,
}

# Normalize team names and map them
def normalize_team_name(name):
    name = name.lstrip('@').lstrip('vs').strip().lower()  # Keep spaces, convert to lowercase
    return name.replace(" ", "")

df['Team1'] = df['Team1'].apply(normalize_team_name).map(teams_id)
df['Team2'] = df['Team2'].str.replace('^(?:@|vs)\s*', '', regex=True).apply(normalize_team_name).map(teams_id)

df['Date'] = pd.to_datetime(df['Date'])

# Score extraction
score_pattern = r'(\d+)\s*[-–]\s*(\d+)'
scores = df['Score'].str.extract(score_pattern)
df['Team1_Score'] = scores[0].astype(float)
df['Team2_Score'] = scores[1].astype(float)

# Calculate Win/Loss ratio and average scores without causing large memory allocations
# Here, assume 'Win' column is a binary indicator (1 for win, 0 for loss)
df['Win'] = df['Score'].str.startswith('Win').astype(int)  # Assuming 'Win' prefix indicates a win

# Calculate averages and win ratios directly without excessive DataFrame merges
df['Team1_ID'] = df['Team1']  # Use team ID directly for grouping
average_scores = df.groupby('Team1_ID')['Team1_Score'].mean().rename('Avg_Score')
win_ratios = df.groupby('Team1_ID')['Win'].mean().rename('Win_Ratio')

# Merge calculated statistics back to the main DataFrame if necessary
# For large DataFrames, consider whether this step is essential, as it can be memory-intensive

# Simplify the process by directly calculating the required metrics without merging when possible
df['Avg_Score_Team1'] = df['Team1_ID'].map(average_scores)
df['Win_Ratio_Team1'] = df['Team1_ID'].map(win_ratios)





# Calculate win ratio for each team in "Team1"
win_ratios = df.groupby('Team1')['Win'].mean().rename('Avg_Win_Ratio')


# Initialize dictionaries to hold win/loss counts
regular_games = {}
playoff_games = {}

# Iterate through each row to count wins/losses
for index, row in df.iterrows():
    team_id = row['Team1']
    game_type = 'playoff' if row['Type'] == 1 else 'regular'
    win = row['Win'] == 1
    
    # Initialize team record in dictionaries if not already present
    if team_id not in regular_games:
        regular_games[team_id] = {'wins': 0, 'losses': 0}
    if team_id not in playoff_games:
        playoff_games[team_id] = {'wins': 0, 'losses': 0}
    
    # Increment win/loss count based on game type
    if game_type == 'regular':
        if win:
            regular_games[team_id]['wins'] += 1
        else:
            regular_games[team_id]['losses'] += 1
    elif game_type == 'playoff':
        if win:
            playoff_games[team_id]['wins'] += 1
        else:
            playoff_games[team_id]['losses'] += 1

# Calculate win ratios for regular and playoff games
regular_win_ratios = {team_id: record['wins'] / (record['wins'] + record['losses']) for team_id, record in regular_games.items() if (record['wins'] + record['losses']) > 0}
playoff_win_ratios = {team_id: record['wins'] / (record['wins'] + record['losses']) for team_id, record in playoff_games.items() if (record['wins'] + record['losses']) > 0}

# Convert win ratio dictionaries to DataFrames for easy merging
regular_win_ratios_df = pd.DataFrame(list(regular_win_ratios.items()), columns=['Team1', 'Regular_Win_Ratio'])
playoff_win_ratios_df = pd.DataFrame(list(playoff_win_ratios.items()), columns=['Team1', 'Playoff_Win_Ratio'])

# Merge win ratios back to the original DataFrame
df = pd.merge(df, regular_win_ratios_df, on='Team1', how='left')
df = pd.merge(df, playoff_win_ratios_df, on='Team1', how='left')











# Step 1: Calculate the absolute score difference
df['Score_Difference'] = abs(df['Team1_Score'] - df['Team2_Score'])

# Step 2: Normalize the score difference to a similarity score (0 to 1)
# Assuming the maximum score difference observed is a useful normalization factor
max_diff = df['Score_Difference'].max()
df['Similarity_Score'] = 1 - (df['Score_Difference'] / max_diff)

# Note: This is a simple normalization that assumes the max score difference is a good denominator.
# This might need to be adjusted based on your specific criteria for similarity.

# Drop the 'Score_Difference' column if it's no longer needed
df.drop('Score_Difference', axis=1, inplace=True)










# Remove the "Score" column
df.drop('Score', axis=1, inplace=True)

# Join the average score and win ratio back to the main DataFrame
df = df.join(average_scores, on='Team1')
df = df.join(win_ratios, on='Team1')

# Here's the new part:
# Calculate the Avg_Score for each team when it appears as Team1
avg_scores = df.groupby('Team1')['Team1_Score'].mean().rename('Avg_Score_Team1')

# Join the Avg_Score back to the main DataFrame for Team1
df = df.join(avg_scores, on='Team1')

# Calculate the average score for each team when it appears as Team1
avg_scores_team1 = df.groupby('Team1')['Team1_Score'].mean()

# Convert the Series to a DataFrame for easier manipulation
avg_scores_df = avg_scores_team1.reset_index()

# Rename columns for clarity
avg_scores_df.columns = ['Team1', 'AVG_SCORE']

# Create a dictionary from the DataFrame for faster lookup
team_avg_score_dict = avg_scores_df.set_index('Team1')['AVG_SCORE'].to_dict()

# Function to calculate Avg_Point_Differential
def calculate_avg_point_differential(row):
    team1_avg_score = team_avg_score_dict.get(row['Team1'], 0)
    team2_avg_score = team_avg_score_dict.get(row['Team2'], 0)
    return team1_avg_score - team2_avg_score

# Calculate Avg_Point_Differential for each row
df['Avg_Point_Differential'] = df.apply(calculate_avg_point_differential, axis=1)

df['OT'] = ot_column

# Display the head of the DataFrame to show the new columns
print(df.head())

# Ask the user before saving the updated DataFrame to a CSV file
save_file = input("Do you want to save the updated data to a CSV file? (yes/no): ").strip().lower()
if save_file == 'yes':
    csv_file_name = input("Enter the name for the CSV file (e.g., 'updated_nba_data.csv'): ").strip()
    df.to_csv(csv_file_name, index=False)
    print(f"Updated data saved to '{csv_file_name}'.")


ValueError: columns overlap but no suffix specified: Index(['Avg_Score_Team1'], dtype='object')

In [4]:
import pandas as pd

# Dictionary for mapping team names to their IDs
teams_id = {
    'atlantahawks': 1610612737, 'bostonceltics': 1610612738, 'clevelandcavaliers': 1610612739,
    'neworleanspelicans': 1610612740, 'chicagobulls': 1610612741, 'dallasmavericks': 1610612742,
    'denvernuggets': 1610612743, 'goldenstatewarriors': 1610612744, 'houstonrockets': 1610612745,
    'losangelesclippers': 1610612746, 'losangeleslakers': 1610612747, 'miamiheat': 1610612748,
    'milwaukeebucks': 1610612749, 'minnesotatimberwolves': 1610612750, 'brooklynnets': 1610612751,
    'newyorkknicks': 1610612752, 'orlandomagic': 1610612753, 'indianapacers': 1610612754,
    'philadelphia76ers': 1610612755, 'phoenixsuns': 1610612756, 'portlandtrailblazers': 1610612757,
    'sacramentokings': 1610612758, 'sanantoniospurs': 1610612759, 'oklahomacitythunder': 1610612760,
    'torontoraptors': 1610612761, 'utahjazz': 1610612762, 'memphisgrizzlies': 1610612763,
    'washingtonwizards': 1610612764, 'detroitpistons': 1610612765, 'charlottehornets': 1610612766,
}

# Assuming 'nba_data_2023_to_2024.csv' is your CSV file
df = pd.read_csv('nba_data_2023_to_2024.csv')

# Calculate the Avg_Score for each team when it appears as Team1
avg_scores = df.groupby('Team1')['Team1_Score'].mean().rename('Avg_Score_Team1')

# Join the Avg_Score back to the main DataFrame for Team1
df = df.join(avg_scores, on='Team1')

# Calculate the average score for each team when it appears as Team1
avg_scores_team1 = df.groupby('Team1')['Team1_Score'].mean()

# Convert the Series to a DataFrame for easier manipulation
avg_scores_df = avg_scores_team1.reset_index()

# Rename columns for clarity
avg_scores_df.columns = ['Team1', 'AVG_SCORE']

# Create a dictionary from the DataFrame for faster lookup
team_avg_score_dict = avg_scores_df.set_index('Team1')['AVG_SCORE'].to_dict()

# Function to calculate Avg_Point_Differential
def calculate_avg_point_differential(row):
    team1_avg_score = team_avg_score_dict.get(row['Team1'], 0)
    team2_avg_score = team_avg_score_dict.get(row['Team2'], 0)
    return team1_avg_score - team2_avg_score

# Calculate Avg_Point_Differential for each row
df['Avg_Point_Differential'] = df.apply(calculate_avg_point_differential, axis=1)

# Normalize team names function
def normalize_team_name(name):
    return name.replace(" ", "").lower()

# Normalize and map 'Team1' names
df['Team1'] = df['Team1'].str.replace(' ', '', regex=True).apply(normalize_team_name).map(teams_id)

# Normalize and map 'Team2' names
df['Team2'] = df['Team2'].str.replace('^(?:@|vs)\s*', '', regex=True).apply(normalize_team_name).map(teams_id)

# Convert dates to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Extract and process scores
score_pattern = r'(\d{2,3})\s*[-–]\s*(\d{2,3})'
scores = df['Score'].str.extract(score_pattern)
df['Team1_Score'] = scores.iloc[:, 0].astype(float)
df['Team2_Score'] = scores.iloc[:, 1].astype(float)
df['Score'] = df['Score'].str.extract('(\d{2,3}\s*[-–]\s*\d{2,3})')[0]

# Optional steps for calculating additional metrics like average scores, win ratios, etc., can be performed here.

# Final cleanup: remove unnecessary columns if needed
# For example, to drop the original 'Score' column now that we have 'Team1_Score' and 'Team2_Score':
df.drop('Score', axis=1, inplace=True)

# Display the head of the DataFrame to verify changes
print(df.head())

# Save the DataFrame to a new CSV file
csv_file_name = 'updated_nba_data.csv'
df.to_csv(csv_file_name, index=False)
print(f"Updated data saved to '{csv_file_name}'.")


KeyError: 'Column not found: Team1_Score'

In [112]:
import pandas as pd

# Dictionary for mapping team names to their IDs
teams_id = {
    'atlantahawks': 1610612737, 'bostonceltics': 1610612738, 'clevelandcavaliers': 1610612739,
    'neworleanspelicans': 1610612740, 'chicagobulls': 1610612741, 'dallasmavericks': 1610612742,
    'denvernuggets': 1610612743, 'goldenstatewarriors': 1610612744, 'houstonrockets': 1610612745,
    'losangelesclippers': 1610612746, 'losangeleslakers': 1610612747, 'miamiheat': 1610612748,
    'milwaukeebucks': 1610612749, 'minnesotatimberwolves': 1610612750, 'brooklynnets': 1610612751,
    'newyorkknicks': 1610612752, 'orlandomagic': 1610612753, 'indianapacers': 1610612754,
    'philadelphia76ers': 1610612755, 'phoenixsuns': 1610612756, 'portlandtrailblazers': 1610612757,
    'sacramentokings': 1610612758, 'sanantoniospurs': 1610612759, 'oklahomacitythunder': 1610612760,
    'torontoraptors': 1610612761, 'utahjazz': 1610612762, 'memphisgrizzlies': 1610612763,
    'washingtonwizards': 1610612764, 'detroitpistons': 1610612765, 'charlottehornets': 1610612766,
}

# Load CSV file
df = pd.read_csv('nba_data_2023_to_2024.csv')



# Correct date conversion
df['Date'] = pd.to_datetime(df['Date'])

# Extract and split score values
score_pattern = r'(\d{2,3})\s*[-–]\s*(\d{2,3})'
scores = df['Score'].str.extract(score_pattern)
df['Team1_Score'] = scores.iloc[:, 0].astype(float)
df['Team2_Score'] = scores.iloc[:, 1].astype(float)

# Clean up 'Score' column to only show score values
df['Score'] = df['Score'].str.extract('(\d{2,3}\s*[-–]\s*\d{2,3})')[0]

# Calculate average scores and win ratios
average_scores = df.groupby('Team1')['Team1_Score'].mean().rename('Avg_Score')
win_ratios = df.groupby('Team1')['Win'].mean().rename('Avg_Win_Ratio')

# Initialize dictionaries for game counts
regular_games = {}
playoff_games = {}

# Count wins and losses
for index, row in df.iterrows():
    team_id = row['Team1']
    game_type = 'playoff' if row['Type'] == 1 else 'regular'
    win = row['Win'] == 1
    
    if team_id not in regular_games:
        regular_games[team_id] = {'wins': 0, 'losses': 0}
    if team_id not in playoff_games:
        playoff_games[team_id] = {'wins': 0, 'losses': 0}
    
    if game_type == 'regular':
        if win:
            regular_games[team_id]['wins'] += 1
        else:
            regular_games[team_id]['losses'] += 1
    elif game_type == 'playoff':
        if win:
            playoff_games[team_id]['wins'] += 1
        else:
            playoff_games[team_id]['losses'] += 1

# Calculate win ratios
regular_win_ratios = {team_id: record['wins'] / (record['wins'] + record['losses']) for team_id, record in regular_games.items() if (record['wins'] + record['losses']) > 0}
playoff_win_ratios = {team_id: record['wins'] / (record['wins'] + record['losses']) for team_id, record in playoff_games.items() if (record['wins'] + record['losses']) > 0}

# Convert to DataFrames and merge
regular_win_ratios_df = pd.DataFrame(list(regular_win_ratios.items()), columns=['Team1', 'Regular_Win_Ratio'])
playoff_win_ratios_df = pd.DataFrame(list(playoff_win_ratios.items()), columns=['Team1', 'Playoff_Win_Ratio'])
df = pd.merge(df, regular_win_ratios_df, on='Team1', how='left')
df = pd.merge(df, playoff_win_ratios_df, on='Team1', how='left')

# Calculate similarity score
df['Score_Difference'] = abs(df['Team1_Score'] - df['Team2_Score'])
max_diff = df['Score_Difference'].max()
df['Similarity_Score'] = 1 - (df['Score_Difference'] / max_diff)
df.drop('Score_Difference', axis=1, inplace=True)

# Join average score and win ratio
df = df.join(average_scores, on='Team1')
df = df.join(win_ratios, on='Team1')

# Calculate Point Differential
df['Point_Differential'] = df['Team2_Score'] - df['Team1_Score']


# Function to normalize team names
def normalize_team_name(name):
   # name = name.lower().replace(" ", "").lstrip('@').lstrip('vs').strip()
    name.lower().replace(" ", "")
    return name

# Normalize and map 'Team1' and 'Team2' names
df['Team1'] = df['Team1'].apply(normalize_team_name).map(teams_id)
df['Team2'] = df['Team2'].apply(normalize_team_name).map(teams_id)

# Option 1: Fill NaN values with a placeholder (-1) before converting to int
df['Team1'] = df['Team1'].fillna(-1).astype(int)
df['Team2'] = df['Team2'].fillna(-1).astype(int)

# Show the updated DataFrame
print(df.head())

# Save to CSV if requested
save_file = input("Do you want to save the updated data to a CSV file? (yes/no): ").strip().lower()
if save_file == 'yes':
    csv_file_name = input("Enter the name for the CSV file (e.g., 'updated_nba_data.csv'): ").strip()
    df.to_csv(csv_file_name, index=False)
    print(f"Updated data saved to '{csv_file_name}'.")


        Date     Score  Win  Type  Team1  Team2  OT  Team1_Score  Team2_Score  \
0 2022-10-19  117- 107    1     0     -1     -1   0        117.0        107.0   
1 2022-10-21   108- 98    1     0     -1     -1   0        108.0         98.0   
2 2022-10-23  109 -126    0     0     -1     -1   0        109.0        126.0   
3 2022-10-26  118- 113    1     0     -1     -1   0        118.0        113.0   
4 2022-10-28  136- 112    1     0     -1     -1   0        136.0        112.0   

   Regular_Win_Ratio  Playoff_Win_Ratio  Similarity_Score   Avg_Score  \
0           0.515789           0.395349          0.838710  119.275362   
1           0.515789           0.395349          0.838710  119.275362   
2           0.515789           0.395349          0.725806  119.275362   
3           0.515789           0.395349          0.919355  119.275362   
4           0.515789           0.395349          0.612903  119.275362   

   Avg_Win_Ratio  Point_Differential  
0       0.478261               -10.

KeyboardInterrupt: Interrupted by user

# XGBoost model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

# Load data
data = pd.read_csv('Ready_for_XGBoost.csv')

# Convert 'Date' to datetime format and sort
#data['Date'] = pd.to_datetime(data['Date'], unit='s')
#data.sort_values('Date', ascending=True, inplace=True)

# Drop columns that might lead to data leakage
data = data.drop(['Team1_Score', 'Team2_Score', 'Date'], axis=1)

# Prepare features and target
X = data.drop(['Win'], axis=1)
y = data['Win']

# One-hot encode categorical features
encoder = OneHotEncoder(sparse=False)
categorical_features = ['Type', 'Team1', 'Team2']
X_encoded = encoder.fit_transform(X[categorical_features])
feature_names = encoder.get_feature_names_out(categorical_features)

# Replace categorical features with their encoded versions
X = X.drop(categorical_features, axis=1)
X_encoded = pd.DataFrame(X_encoded, columns=feature_names)
X = pd.concat([X.reset_index(drop=True), X_encoded], axis=1)

# Splitting the dataset for the last 10 games prediction
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=20, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Initialize XGBoost model
model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Hyperparameters grid
param_grid = {
    'max_depth': [4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1],
     #   'reg_alpha': [0.01, 0.1],  # L1 regularization term
 # 'reg_lambda': [1, 10]  # L2 regularization term
}
#param_grid = {
  #  'max_depth': [4, 5, 6],
 #   'n_estimators': [100, 200, 300],
  #  'learning_rate': [0.01, 0.05, 0.1],
  #  'subsample': [0.8, 1.0],
 #   'reg_alpha': [0.01, 0.1],  # L1 regularization term
 #   'reg_lambda': [1, 10]  # L2 regularization term
#}

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Validation metrics
y_val_pred = best_model.predict(X_val)
print("Validation Metrics")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"ROC AUC: {roc_auc_score(y_val, best_model.predict_proba(X_val)[:, 1])}")
print(f"Confusion Matrix:\n{confusion_matrix(y_val, y_val_pred)}")
print(classification_report(y_val, y_val_pred, zero_division=0))  # Adjusting zero_division parameter here

# Predictions on test set (last 10 games)
y_test_pred = best_model.predict(X_test)
print("\nTest Metrics (Last 10 Games)")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"ROC AUC: {roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")

# Display predictions for the last 10 games
#for i, (actual, predicted) in enumerate(zip(y_test, y_test_pred), start=1):
   #print(f"Game {i}: Actual Outcome: {'Win' if actual == 1 else 'Loss'}, Predicted Outcome: {'Win' if predicted == 1 else 'Loss'}")


In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

# Load data
data = pd.read_csv('Ready_for_XGBoost.csv')

# Convert 'Date' to datetime format and sort
#data['Date'] = pd.to_datetime(data['Date'], unit='s')
#data.sort_values('Date', ascending=True, inplace=True)

# Drop columns that might lead to data leakage
data = data.drop(['Team1_Score', 'Team2_Score'], axis=1)

# Prepare features and target
X = data.drop(['Win', 'Date'], axis=1)
y = data['Win']

# One-hot encode categorical features
encoder = OneHotEncoder(sparse=False)
categorical_features = ['Type', 'Team1', 'Team2']
X_encoded = encoder.fit_transform(X[categorical_features])
feature_names = encoder.get_feature_names_out(categorical_features)

# Replace categorical features with their encoded versions
X = X.drop(categorical_features, axis=1)
X_encoded = pd.DataFrame(X_encoded, columns=feature_names)
X = pd.concat([X.reset_index(drop=True), X_encoded], axis=1)

# Splitting the dataset
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=25, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Initialize XGBoost model
model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Hyperparameters grid
param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'reg_alpha': [0.01, 0.1],  # L1 regularization term
    'reg_lambda': [1, 10]  # L2 regularization term
}

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Validation metrics
y_val_pred = best_model.predict(X_val)
print("Validation Metrics")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"ROC AUC: {roc_auc_score(y_val, best_model.predict_proba(X_val)[:, 1])}")
print(f"Confusion Matrix:\n{confusion_matrix(y_val, y_val_pred)}")
print(classification_report(y_val, y_val_pred))

# Predictions on test set
y_test_pred = best_model.predict(X_test)
print("\nTest Metrics (Last 25 Games)")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"ROC AUC: {roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
print(classification_report(y_test, y_test_pred))




Validation Metrics
Accuracy: 0.6124411704522201
ROC AUC: 0.6495986942489
Confusion Matrix:
[[1400 1030]
 [ 864 1593]]
              precision    recall  f1-score   support

           0       0.62      0.58      0.60      2430
           1       0.61      0.65      0.63      2457

    accuracy                           0.61      4887
   macro avg       0.61      0.61      0.61      4887
weighted avg       0.61      0.61      0.61      4887


Test Metrics (Last 25 Games)
Accuracy: 0.24
ROC AUC: 0.6578947368421052
Confusion Matrix:
[[ 0 19]
 [ 0  6]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.24      1.00      0.39         6

    accuracy                           0.24        25
   macro avg       0.12      0.50      0.19        25
weighted avg       0.06      0.24      0.09        25



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

# Load data
data = pd.read_csv('Ready_for_XGBoost.csv')

# Convert 'Date' to datetime format and sort
#data['Date'] = pd.to_datetime(data['Date'], unit='s')
#data.sort_values('Date', ascending=True, inplace=True)

# Drop columns that might lead to data leakage
data = data.drop(['Team1_Score', 'Team2_Score', 'Date'], axis=1)

# Prepare features and target
X = data.drop(['Win'], axis=1)
y = data['Win']

# One-hot encode categorical features
encoder = OneHotEncoder(sparse=False)
categorical_features = ['Type', 'Team1', 'Team2']
X_encoded = encoder.fit_transform(X[categorical_features])
feature_names = encoder.get_feature_names_out(categorical_features)

# Replace categorical features with their encoded versions
X = X.drop(categorical_features, axis=1)
X_encoded = pd.DataFrame(X_encoded, columns=feature_names)
X = pd.concat([X.reset_index(drop=True), X_encoded], axis=1)

# Splitting the dataset for the last 10 games prediction
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=20, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Initialize XGBoost model
model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Hyperparameters grid
param_grid = {
    'max_depth': [4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1],
     #   'reg_alpha': [0.01, 0.1],  # L1 regularization term
 # 'reg_lambda': [1, 10]  # L2 regularization term
}
#param_grid = {
  #  'max_depth': [4, 5, 6],
 #   'n_estimators': [100, 200, 300],
  #  'learning_rate': [0.01, 0.05, 0.1],
  #  'subsample': [0.8, 1.0],
 #   'reg_alpha': [0.01, 0.1],  # L1 regularization term
 #   'reg_lambda': [1, 10]  # L2 regularization term
#}

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Validation metrics
y_val_pred = best_model.predict(X_val)
print("Validation Metrics")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"ROC AUC: {roc_auc_score(y_val, best_model.predict_proba(X_val)[:, 1])}")
print(f"Confusion Matrix:\n{confusion_matrix(y_val, y_val_pred)}")
print(classification_report(y_val, y_val_pred, zero_division=0))  # Adjusting zero_division parameter here

# Predictions on test set (last 10 games)
y_test_pred = best_model.predict(X_test)
print("\nTest Metrics (Last 10 Games)")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"ROC AUC: {roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
#print(classification_report(y_test, y_test_pred, zero_division=0))  # Adjusting zero_division parameter here

# Display predictions for the last 10 games
for i, (actual, predicted) in enumerate(zip(y_test, y_test_pred), start=1):
   print(f"Game {i}: Actual Outcome: {'Win' if actual == 1 else 'Loss'}, Predicted Outcome: {'Win' if predicted == 1 else 'Loss'}")




Validation Metrics
Accuracy: 0.6020867430441899
ROC AUC: 0.6344016105529796
Confusion Matrix:
[[1373 1048]
 [ 897 1570]]
              precision    recall  f1-score   support

           0       0.60      0.57      0.59      2421
           1       0.60      0.64      0.62      2467

    accuracy                           0.60      4888
   macro avg       0.60      0.60      0.60      4888
weighted avg       0.60      0.60      0.60      4888


Test Metrics (Last 10 Games)
Accuracy: 0.2
ROC AUC: 0.75
Confusion Matrix:
[[ 0 16]
 [ 0  4]]
Game 1: Actual Outcome: Loss, Predicted Outcome: Win
Game 2: Actual Outcome: Loss, Predicted Outcome: Win
Game 3: Actual Outcome: Loss, Predicted Outcome: Win
Game 4: Actual Outcome: Win, Predicted Outcome: Win
Game 5: Actual Outcome: Loss, Predicted Outcome: Win
Game 6: Actual Outcome: Loss, Predicted Outcome: Win
Game 7: Actual Outcome: Loss, Predicted Outcome: Win
Game 8: Actual Outcome: Loss, Predicted Outcome: Win
Game 9: Actual Outcome: Loss, Pred

# Updated CSV, new futures added. Model adjusted