In [266]:
import numpy as np
import pandas as pd

In [267]:
predictor_website_url = 'https://raw.githubusercontent.com/BerkeAltiparmak/SportSense.AI/main/data/fpdata_Dec.csv'
dirty_predictor_website_df = pd.read_csv(predictor_website_url)
dirty_predictor_website_df.head(428) # it's not clean bc of NaNs, whose values appear at the bottom bc of bad data structure

Unnamed: 0,web-scraper-order,web-scraper-start-url,Rank,Name,Team,Pos,OPP,PTS,REB,AST,...,FG_percent,FT_percent,3P_percent,FTM,2PM,3PM,Turnover,MIN,FPts,page
0,1712978097-1,https://fantasydata.com/nba/fantasy-basketball...,1.0,Shai Gilgeous-Alexander\n \n \n \n ...,,,,,,,...,,,,,,,,,,
1,1712978097-2,https://fantasydata.com/nba/fantasy-basketball...,2.0,Anthony Davis\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,
2,1712978097-3,https://fantasydata.com/nba/fantasy-basketball...,3.0,Trae Young\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,
3,1712978097-4,https://fantasydata.com/nba/fantasy-basketball...,4.0,Jayson Tatum\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,
4,1712978097-5,https://fantasydata.com/nba/fantasy-basketball...,5.0,Ja Morant\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,1712978097-424,https://fantasydata.com/nba/fantasy-basketball...,,,PHO,SG,ORL,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
424,1712978097-425,https://fantasydata.com/nba/fantasy-basketball...,,,PHO,SF,ORL,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
425,1712978107-426,https://fantasydata.com/nba/fantasy-basketball...,1.0,Luka Doncic\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,
426,1712978107-427,https://fantasydata.com/nba/fantasy-basketball...,2.0,Anthony Davis\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,


In [268]:
def clean_data(data):
    """
    The logic behind this algorithm is very complex.
    I sold my soul to the devil to get the alignment problem right.
    """
    # Identifying rows with player names and stats
    player_indices = data.index[data['Name'].notna()].tolist()
    stats_indices = data.index[data['PTS'].notna()].tolist()

    segments = []
    current_segment = {'players': [], 'stats': []}

    # Initialize flags to track when a new segment needs to be started
    start_new_segment = True

    # Collect indices into segments
    for i in range(len(data)):
        if i in player_indices:
            if start_new_segment:
                # If a new segment is needed, start it
                if current_segment['players'] or current_segment['stats']:
                    segments.append(current_segment)
                    current_segment = {'players': [], 'stats': []}
                start_new_segment = False  # Reset the flag
            current_segment['players'].append(i)
        elif i in stats_indices:
            current_segment['stats'].append(i)
            start_new_segment = True  # New segment will start after this stats block

    # Add the last segment
    if current_segment['players'] or current_segment['stats']:
        segments.append(current_segment)

    # Process each segment separately
    paired_data = []
    for segment in segments:
        p_idx = 0
        s_idx = 0
        while p_idx < len(segment['players']) and s_idx < len(segment['stats']):
            paired_data.append((segment['players'][p_idx], segment['stats'][s_idx]))
            p_idx += 1
            s_idx += 1


    # Creating a structured DataFrame
    paired_df = pd.DataFrame({
        'web-scraper-order': [data.iloc[p[0]]['web-scraper-order'] for p in paired_data],
        'web-scraper-start-url': [data.iloc[p[0]]['web-scraper-start-url'] for p in paired_data],
        'Name': [data.iloc[p[0]]['Name'] for p in paired_data],
        'Team': [data.iloc[p[1]]['Team'] for p in paired_data],
        'Pos': [data.iloc[p[1]]['Pos'] for p in paired_data],
        'OPP': [data.iloc[p[1]]['OPP'] for p in paired_data],
        'PTS': [data.iloc[p[1]]['PTS'] for p in paired_data],
        'REB': [data.iloc[p[1]]['REB'] for p in paired_data],
        'AST': [data.iloc[p[1]]['AST'] for p in paired_data],
        'BLK': [data.iloc[p[1]]['BLK'] for p in paired_data],
        'STL': [data.iloc[p[1]]['STL'] for p in paired_data],
        'FG_percent': [data.iloc[p[1]]['FG_percent'] for p in paired_data],
        'FT_percent': [data.iloc[p[1]]['FT_percent'] for p in paired_data],
        '3P_percent': [data.iloc[p[1]]['3P_percent'] for p in paired_data],
        'FTM': [data.iloc[p[1]]['FTM'] for p in paired_data],
        '2PM': [data.iloc[p[1]]['2PM'] for p in paired_data],
        '3PM': [data.iloc[p[1]]['3PM'] for p in paired_data],
        'Turnover': [data.iloc[p[1]]['Turnover'] for p in paired_data],
        'MIN': [data.iloc[p[1]]['MIN'] for p in paired_data],
        'FPts': [data.iloc[p[1]]['FPts'] for p in paired_data]
    })
    return paired_df

In [269]:
def extract_date(url):
    from urllib.parse import urlparse, parse_qs
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    return query_params.get('date', [None])[0]

In [270]:
def calculate_fantasy_points(row):
    # Points scored
    points_scored = row['pts'] * 0.5
    # Rebounds, Assists, Steals, and Blocks
    rebound_points = (row['oreb'] + row['dreb']) * 1
    assist_points = row['ast'] * 1
    steal_points = row['stl'] * 2
    block_points = row['blk'] * 2
    # Turnovers
    turnover_points = row['tov'] * -1
    # Three-points made
    three_points = row['3pm'] * 0.5
    # Double-Double and Triple-Double
    double_digits = sum(i >= 10 for i in [row['pts'], row['oreb'] + row['dreb'], row['ast'], row['stl'], row['blk']])
    double_double_points = 1.0 if double_digits >= 2 else 0
    triple_double_points = 2.0 if double_digits >= 3 else 0
    # 40+ and 50+ bonuses
    fourty_pt_bonus = 2.0 * (row['pts'] >= 40)
    fifty_pt_bonus = 2.0 * (row['pts'] >= 50)
    # Total fantasy points
    total_points = (points_scored + rebound_points + assist_points +
                    steal_points + block_points + turnover_points + three_points +
                    double_double_points + triple_double_points +
                    fourty_pt_bonus + fifty_pt_bonus)
    return total_points

In [271]:
predictor_website_df = clean_data(dirty_predictor_website_df)

In [272]:
# Clean 'Name' column
predictor_website_df['Name'] = predictor_website_df['Name'].str.extract(r'([^\n]+)')

# Extract 'Date' from URL
predictor_website_df['Date'] = predictor_website_df['web-scraper-start-url'].apply(extract_date)
predictor_website_df['Date'] = pd.to_datetime(predictor_website_df['Date'], format='%m-%d-%Y')

# Split REB into OREB and DREB for our convention
predictor_website_df['OREB'] = 0
predictor_website_df['DREB'] = predictor_website_df['REB']

predictor_website_df.head()

Unnamed: 0,web-scraper-order,web-scraper-start-url,Rank,Name,Team,Pos,OPP,PTS,REB,AST,...,FTM,2PM,3PM,Turnover,MIN,FPts,page,Date,OREB,DREB
0,1712978097-1,https://fantasydata.com/nba/fantasy-basketball...,,Shai Gilgeous-Alexander,OKC,PG,BKN,31.22,5.31,5.79,...,8.97,9.79,0.89,2.33,34.0,49.47,,2023-12-31,0,5.31
1,1712978097-2,https://fantasydata.com/nba/fantasy-basketball...,,Anthony Davis,LAL,C,NO,23.53,12.35,2.85,...,5.65,8.4,0.36,1.98,33.0,47.43,,2023-12-31,0,12.35
2,1712978097-3,https://fantasydata.com/nba/fantasy-basketball...,,Trae Young,ATL,PG,WAS,26.57,3.2,10.65,...,7.24,5.66,2.67,3.68,34.0,45.77,,2023-12-31,0,3.2
3,1712978097-4,https://fantasydata.com/nba/fantasy-basketball...,,Jayson Tatum,BOS,SF,SA,26.66,9.0,4.64,...,5.59,6.2,2.89,2.35,35.0,45.39,,2023-12-31,0,9.0
4,1712978097-5,https://fantasydata.com/nba/fantasy-basketball...,,Ja Morant,MEM,PG,SAC,25.09,6.12,8.3,...,5.47,7.59,1.48,2.91,33.0,45.09,,2023-12-31,0,6.12


In [273]:
predictor_website_df.head(215)

Unnamed: 0,web-scraper-order,web-scraper-start-url,Rank,Name,Team,Pos,OPP,PTS,REB,AST,...,FTM,2PM,3PM,Turnover,MIN,FPts,page,Date,OREB,DREB
0,1712978097-1,https://fantasydata.com/nba/fantasy-basketball...,,Shai Gilgeous-Alexander,OKC,PG,BKN,31.22,5.31,5.79,...,8.97,9.79,0.89,2.33,34.0,49.47,,2023-12-31,0,5.31
1,1712978097-2,https://fantasydata.com/nba/fantasy-basketball...,,Anthony Davis,LAL,C,NO,23.53,12.35,2.85,...,5.65,8.40,0.36,1.98,33.0,47.43,,2023-12-31,0,12.35
2,1712978097-3,https://fantasydata.com/nba/fantasy-basketball...,,Trae Young,ATL,PG,WAS,26.57,3.20,10.65,...,7.24,5.66,2.67,3.68,34.0,45.77,,2023-12-31,0,3.20
3,1712978097-4,https://fantasydata.com/nba/fantasy-basketball...,,Jayson Tatum,BOS,SF,SA,26.66,9.00,4.64,...,5.59,6.20,2.89,2.35,35.0,45.39,,2023-12-31,0,9.00
4,1712978097-5,https://fantasydata.com/nba/fantasy-basketball...,,Ja Morant,MEM,PG,SAC,25.09,6.12,8.30,...,5.47,7.59,1.48,2.91,33.0,45.09,,2023-12-31,0,6.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,1712978097-261,https://fantasydata.com/nba/fantasy-basketball...,,Markelle Fultz,PHO,SF,ORL,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.0,0.00,,2023-12-31,0,0.00
211,1712978107-426,https://fantasydata.com/nba/fantasy-basketball...,,Luka Doncic,DAL,PG,GS,32.62,8.55,8.26,...,7.29,7.64,3.35,3.34,36.0,55.45,,2023-12-30,0,8.55
212,1712978107-427,https://fantasydata.com/nba/fantasy-basketball...,,Anthony Davis,LAL,C,MIN,23.53,12.35,2.85,...,5.65,8.40,0.36,1.98,33.0,47.43,,2023-12-30,0,12.35
213,1712978107-428,https://fantasydata.com/nba/fantasy-basketball...,,LeBron James,LAL,PG,MIN,23.16,7.71,6.29,...,3.90,6.69,1.96,2.47,32.0,43.04,,2023-12-30,0,7.71


In [274]:
# Mapping of current column names to the column names we have
column_mapping = {
    'Name': 'player',
    'Team': 'team',
    'Date': 'gamedate',
    'PTS': 'pts',
    'OREB': 'oreb',
    'DREB': 'dreb',
    'AST': 'ast',
    'BLK': 'blk',
    'STL': 'stl',
    '3PM': '3pm',
    'Turnover': 'tov',
    'MIN': 'minuters'
}
predictor_website_df = predictor_website_df[list(column_mapping.keys())]
predictor_website_df.rename(columns=column_mapping, inplace=True)

# Calculate fantasy points, the way we defined them
predictor_website_df['fp'] = predictor_website_df.apply(calculate_fantasy_points, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predictor_website_df.rename(columns=column_mapping, inplace=True)


In [275]:
predictor_website_df.head(215)

Unnamed: 0,player,team,gamedate,pts,oreb,dreb,ast,blk,stl,3pm,tov,minuters,fp
0,Shai Gilgeous-Alexander,OKC,2023-12-31,31.22,0,5.31,5.79,0.89,1.87,0.89,2.33,34.0,30.345
1,Anthony Davis,LAL,2023-12-31,23.53,0,12.35,2.85,2.46,0.93,0.36,1.98,33.0,32.945
2,Trae Young,ATL,2023-12-31,26.57,0,3.20,10.65,0.17,1.36,2.67,3.68,34.0,28.850
3,Jayson Tatum,BOS,2023-12-31,26.66,0,9.00,4.64,0.61,1.05,2.89,2.35,35.0,29.385
4,Ja Morant,MEM,2023-12-31,25.09,0,6.12,8.30,0.27,1.29,1.48,2.91,33.0,27.915
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,Markelle Fultz,PHO,2023-12-31,0.00,0,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.000
211,Luka Doncic,DAL,2023-12-30,32.62,0,8.55,8.26,0.47,1.29,3.35,3.34,36.0,34.975
212,Anthony Davis,LAL,2023-12-30,23.53,0,12.35,2.85,2.46,0.93,0.36,1.98,33.0,32.945
213,LeBron James,LAL,2023-12-30,23.16,0,7.71,6.29,0.77,1.06,1.96,2.47,32.0,27.750


In [278]:
predictor_website_df.head(5415)

Unnamed: 0,player,team,gamedate,pts,oreb,dreb,ast,blk,stl,3pm,tov,minuters,fp
0,Shai Gilgeous-Alexander,OKC,2023-12-31,31.22,0,5.31,5.79,0.89,1.87,0.89,2.33,34.0,30.345
1,Anthony Davis,LAL,2023-12-31,23.53,0,12.35,2.85,2.46,0.93,0.36,1.98,33.0,32.945
2,Trae Young,ATL,2023-12-31,26.57,0,3.20,10.65,0.17,1.36,2.67,3.68,34.0,28.850
3,Jayson Tatum,BOS,2023-12-31,26.66,0,9.00,4.64,0.61,1.05,2.89,2.35,35.0,29.385
4,Ja Morant,MEM,2023-12-31,25.09,0,6.12,8.30,0.27,1.29,1.48,2.91,33.0,27.915
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5410,Brandon Ingram,NO,2023-12-01,27.53,0,6.00,6.41,0.31,0.91,1.31,2.92,37.0,26.350
5411,Kevin Durant,PHO,2023-12-01,27.59,0,6.64,4.79,1.09,0.74,2.07,2.65,34.0,27.270
5412,Zion Williamson,NO,2023-12-01,26.85,0,6.32,5.57,0.44,1.06,0.18,2.96,34.0,25.445
5413,Julius Randle,NY,2023-12-01,23.05,0,9.82,4.42,0.25,0.73,2.51,2.58,34.0,26.400


In [276]:
len(predictor_website_df)

5578