In [293]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [294]:
# Base URL for the data
base_url = 'https://raw.githubusercontent.com/BerkeAltiparmak/SportSense.AI/main/data/'

# File names for each month
file_names = ['fpdata_March.csv', 'fpdata_Feb.csv', 'fpdata_Jan.csv', 'fpdata_Dec.csv']

# Initialize the list to store dataframes
dfs = []

# Loop through the file names, download the data, and append to the list
for file_name in file_names:
    file_url = base_url + file_name
    df = pd.read_csv(file_url)
    dfs.append(df)

# Concatenate all dataframes into one
dirty_predictor_website_df = pd.concat(dfs, ignore_index=True)
dirty_predictor_website_df.head() # it's not clean bc of NaNs, whose values appear at the bottom bc of bad data structure

Unnamed: 0,web-scraper-order,web-scraper-start-url,Rank,Name,Team,Pos,OPP,PTS,REB,AST,...,FG_percent,FT_percent,3P_percent,FTM,2PM,3PM,Turnover,MIN,FPts,page
0,1712980617-1,https://fantasydata.com/nba/fantasy-basketball...,1.0,Domantas Sabonis\n \n \n \n \n...,,,,,,,...,,,,,,,,,,
1,1712980617-2,https://fantasydata.com/nba/fantasy-basketball...,2.0,Tyrese Haliburton\n \n \n \n \...,,,,,,,...,,,,,,,,,,
2,1712980617-3,https://fantasydata.com/nba/fantasy-basketball...,3.0,Anthony Edwards\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,
3,1712980617-4,https://fantasydata.com/nba/fantasy-basketball...,4.0,Jalen Brunson\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,
4,1712980617-5,https://fantasydata.com/nba/fantasy-basketball...,5.0,De'Aaron Fox\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,


In [295]:
def clean_data(data):
    """
    The logic behind this algorithm is very complex.
    I sold my soul to the devil to get the alignment problem right.
    """
    # Identifying rows with player names and stats
    player_indices = data.index[data['Name'].notna()].tolist()
    stats_indices = data.index[data['PTS'].notna()].tolist()

    segments = []
    current_segment = {'players': [], 'stats': []}

    # Initialize flags to track when a new segment needs to be started
    start_new_segment = True

    # Collect indices into segments
    for i in tqdm(range(len(data))):
        if i in player_indices:
            if start_new_segment:
                # If a new segment is needed, start it
                if current_segment['players'] or current_segment['stats']:
                    segments.append(current_segment)
                    current_segment = {'players': [], 'stats': []}
                start_new_segment = False  # Reset the flag
            current_segment['players'].append(i)
        elif i in stats_indices:
            current_segment['stats'].append(i)
            start_new_segment = True  # New segment will start after this stats block

    # Add the last segment
    if current_segment['players'] or current_segment['stats']:
        segments.append(current_segment)

    # Process each segment separately
    paired_data = []
    for segment in tqdm(segments):
        p_idx = 0
        s_idx = 0
        while p_idx < len(segment['players']) and s_idx < len(segment['stats']):
            paired_data.append((segment['players'][p_idx], segment['stats'][s_idx]))
            p_idx += 1
            s_idx += 1


    # Creating a structured DataFrame
    paired_df = pd.DataFrame({
        'web-scraper-order': [data.iloc[p[0]]['web-scraper-order'] for p in paired_data],
        'web-scraper-start-url': [data.iloc[p[0]]['web-scraper-start-url'] for p in paired_data],
        'Name': [data.iloc[p[0]]['Name'] for p in paired_data],
        'Team': [data.iloc[p[1]]['Team'] for p in paired_data],
        'Pos': [data.iloc[p[1]]['Pos'] for p in paired_data],
        'OPP': [data.iloc[p[1]]['OPP'] for p in paired_data],
        'PTS': [data.iloc[p[1]]['PTS'] for p in paired_data],
        'REB': [data.iloc[p[1]]['REB'] for p in paired_data],
        'AST': [data.iloc[p[1]]['AST'] for p in paired_data],
        'BLK': [data.iloc[p[1]]['BLK'] for p in paired_data],
        'STL': [data.iloc[p[1]]['STL'] for p in paired_data],
        'FG_percent': [data.iloc[p[1]]['FG_percent'] for p in paired_data],
        'FT_percent': [data.iloc[p[1]]['FT_percent'] for p in paired_data],
        '3P_percent': [data.iloc[p[1]]['3P_percent'] for p in paired_data],
        'FTM': [data.iloc[p[1]]['FTM'] for p in paired_data],
        '2PM': [data.iloc[p[1]]['2PM'] for p in paired_data],
        '3PM': [data.iloc[p[1]]['3PM'] for p in paired_data],
        'Turnover': [data.iloc[p[1]]['Turnover'] for p in paired_data],
        'MIN': [data.iloc[p[1]]['MIN'] for p in paired_data],
        'FPts': [data.iloc[p[1]]['FPts'] for p in paired_data]
    })
    return paired_df

In [296]:
def extract_date(url):
    from urllib.parse import urlparse, parse_qs
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    return query_params.get('date', [None])[0]

In [297]:
def calculate_fantasy_points(row):
    # Points scored
    points_scored = row['pts'] * 0.5
    # Rebounds, Assists, Steals, and Blocks
    rebound_points = (row['oreb'] + row['dreb']) * 1
    assist_points = row['ast'] * 1
    steal_points = row['stl'] * 2
    block_points = row['blk'] * 2
    # Turnovers
    turnover_points = row['tov'] * -1
    # Three-points made
    three_points = row['3pm'] * 0.5
    # Double-Double and Triple-Double
    double_digits = sum(i >= 10 for i in [row['pts'], row['oreb'] + row['dreb'], row['ast'], row['stl'], row['blk']])
    double_double_points = 1.0 if double_digits >= 2 else 0
    triple_double_points = 2.0 if double_digits >= 3 else 0
    # 40+ and 50+ bonuses
    fourty_pt_bonus = 2.0 * (row['pts'] >= 40)
    fifty_pt_bonus = 2.0 * (row['pts'] >= 50)
    # Total fantasy points
    total_points = (points_scored + rebound_points + assist_points +
                    steal_points + block_points + turnover_points + three_points +
                    double_double_points + triple_double_points +
                    fourty_pt_bonus + fifty_pt_bonus)
    return total_points

In [298]:
predictor_website_df = clean_data(dirty_predictor_website_df)

100%|██████████| 44752/44752 [00:12<00:00, 3723.36it/s]
100%|██████████| 156/156 [00:00<00:00, 10656.71it/s]


In [299]:
# Clean 'Name' column
predictor_website_df['Name'] = predictor_website_df['Name'].str.extract(r'([^\n]+)')

# Extract 'Date' from URL
predictor_website_df['Date'] = predictor_website_df['web-scraper-start-url'].apply(extract_date)
predictor_website_df['Date'] = pd.to_datetime(predictor_website_df['Date'], format='%m-%d-%Y')

# Split REB into OREB and DREB for our convention
predictor_website_df['OREB'] = 0
predictor_website_df['DREB'] = predictor_website_df['REB']

predictor_website_df.head()

Unnamed: 0,web-scraper-order,web-scraper-start-url,Name,Team,Pos,OPP,PTS,REB,AST,BLK,...,3P_percent,FTM,2PM,3PM,Turnover,MIN,FPts,Date,OREB,DREB
0,1712980617-1,https://fantasydata.com/nba/fantasy-basketball...,Domantas Sabonis,SAC,C,MEM,19.99,13.65,8.16,0.6,...,0.0,3.31,7.68,0.44,2.94,35.0,48.75,2024-03-18,0,13.65
1,1712980617-2,https://fantasydata.com/nba/fantasy-basketball...,Tyrese Haliburton,IND,PG,CLE,23.02,4.38,12.4,0.74,...,33.3,3.19,5.1,3.21,2.64,35.0,48.28,2024-03-18,0,4.38
2,1712980617-3,https://fantasydata.com/nba/fantasy-basketball...,Anthony Edwards,MIN,SG,UTA,27.82,5.89,5.25,0.82,...,42.9,5.6,7.09,2.68,2.73,36.0,44.27,2024-03-18,0,5.89
3,1712980617-4,https://fantasydata.com/nba/fantasy-basketball...,Jalen Brunson,NY,PG,GS,27.68,3.9,6.46,0.2,...,42.9,5.41,7.07,2.71,2.53,35.0,42.14,2024-03-18,0,3.9
4,1712980617-5,https://fantasydata.com/nba/fantasy-basketball...,De'Aaron Fox,SAC,PG,MEM,25.51,4.34,5.64,0.41,...,42.9,4.29,6.77,2.56,2.55,33.0,40.69,2024-03-18,0,4.34


In [301]:
# Mapping of current column names to the column names we have
column_mapping = {
    'Name': 'player',
    'Team': 'team',
    'Date': 'gamedate',
    'PTS': 'pts',
    'OREB': 'oreb',
    'DREB': 'dreb',
    'AST': 'ast',
    'BLK': 'blk',
    'STL': 'stl',
    '3PM': '3pm',
    'Turnover': 'tov',
    'MIN': 'minuters'
}
predictor_website_df = predictor_website_df[list(column_mapping.keys())]
predictor_website_df.rename(columns=column_mapping, inplace=True)

# Calculate fantasy points, the way we defined them
predictor_website_df['fp'] = predictor_website_df.apply(calculate_fantasy_points, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predictor_website_df.rename(columns=column_mapping, inplace=True)


In [302]:
predictor_website_df.head(215)

Unnamed: 0,player,team,gamedate,pts,oreb,dreb,ast,blk,stl,3pm,tov,minuters,fp
0,Domantas Sabonis,SAC,2024-03-18,19.99,0,13.65,8.16,0.60,0.94,0.44,2.94,35.0,33.165
1,Tyrese Haliburton,IND,2024-03-18,23.02,0,4.38,12.40,0.74,1.28,3.21,2.64,35.0,32.295
2,Anthony Edwards,MIN,2024-03-18,27.82,0,5.89,5.25,0.82,1.30,2.68,2.73,36.0,27.900
3,Jalen Brunson,NY,2024-03-18,27.68,0,3.90,6.46,0.20,1.11,2.71,2.53,35.0,25.645
4,De'Aaron Fox,SAC,2024-03-18,25.51,0,4.34,5.64,0.41,1.62,2.56,2.55,33.0,25.525
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,Ashton Hagans,MEM,2024-03-18,0.00,0,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.000
211,Shaedon Sharpe,MEM,2024-03-18,0.00,0,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.000
212,Justin Minaya,MEM,2024-03-18,0.00,0,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.000
213,Ibou Badji,MEM,2024-03-18,0.00,0,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.000


In [303]:
predictor_website_df.head(5415)

Unnamed: 0,player,team,gamedate,pts,oreb,dreb,ast,blk,stl,3pm,tov,minuters,fp
0,Domantas Sabonis,SAC,2024-03-18,19.99,0,13.65,8.16,0.60,0.94,0.44,2.94,35.0,33.165
1,Tyrese Haliburton,IND,2024-03-18,23.02,0,4.38,12.40,0.74,1.28,3.21,2.64,35.0,32.295
2,Anthony Edwards,MIN,2024-03-18,27.82,0,5.89,5.25,0.82,1.30,2.68,2.73,36.0,27.900
3,Jalen Brunson,NY,2024-03-18,27.68,0,3.90,6.46,0.20,1.11,2.71,2.53,35.0,25.645
4,De'Aaron Fox,SAC,2024-03-18,25.51,0,4.34,5.64,0.41,1.62,2.56,2.55,33.0,25.525
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5410,Isaiah Joe,OKC,2024-02-25,5.37,0,1.45,0.72,0.14,0.37,1.15,0.45,11.0,6.000
5411,Cedi Osman,SA,2024-02-25,4.59,0,1.65,1.07,0.12,0.31,0.81,0.43,10.0,5.850
5412,Dalen Terry,CHI,2024-02-25,3.16,0,2.02,1.37,0.25,0.46,0.29,0.50,10.0,6.035
5413,Jae'Sean Tate,HOU,2024-02-25,3.58,0,2.33,0.91,0.11,0.46,0.28,0.46,11.0,5.850


In [309]:
predictor_website_df.head(19415)

Unnamed: 0,player,team,gamedate,pts,oreb,dreb,ast,blk,stl,3pm,tov,minuters,fp
0,Domantas Sabonis,SAC,2024-03-18,19.99,0,13.65,8.16,0.60,0.94,0.44,2.94,35.0,33.165
1,Tyrese Haliburton,IND,2024-03-18,23.02,0,4.38,12.40,0.74,1.28,3.21,2.64,35.0,32.295
2,Anthony Edwards,MIN,2024-03-18,27.82,0,5.89,5.25,0.82,1.30,2.68,2.73,36.0,27.900
3,Jalen Brunson,NY,2024-03-18,27.68,0,3.90,6.46,0.20,1.11,2.71,2.53,35.0,25.645
4,De'Aaron Fox,SAC,2024-03-18,25.51,0,4.34,5.64,0.41,1.62,2.56,2.55,33.0,25.525
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19410,Jusuf Nurkic,PHO,2023-12-12,13.25,0,9.36,3.66,1.04,0.93,0.89,2.12,27.0,21.910
19411,Jarrett Allen,CLE,2023-12-12,14.25,0,9.54,2.04,1.19,0.81,0.02,1.58,32.0,21.135
19412,Derrick White,BOS,2023-12-12,16.69,0,4.31,4.45,1.11,0.89,2.60,1.59,33.0,20.815
19413,Michael Porter Jr.,DEN,2023-12-12,17.93,0,7.47,1.49,0.65,0.73,3.10,1.21,32.0,21.025


Looks correct to me as the player-team combination seems right!

In [304]:
len(predictor_website_df)

20983