In [293]:
import numpy as np
import pandas as pd

In [294]:
predictor_website_url = 'https://raw.githubusercontent.com/BerkeAltiparmak/SportSense.AI/main/data/fpdata_Dec.csv'
dirty_predictor_website_df = pd.read_csv(predictor_website_url)
dirty_predictor_website_df.head() # it's not clean bc of NaNs, whose values appear at the bottom bc of bad data structure

Unnamed: 0,web-scraper-order,web-scraper-start-url,Rank,Name,Team,Pos,OPP,PTS,REB,AST,...,FG_percent,FT_percent,3P_percent,FTM,2PM,3PM,Turnover,MIN,FPts,page
0,1712978097-1,https://fantasydata.com/nba/fantasy-basketball...,1.0,Shai Gilgeous-Alexander\n \n \n \n ...,,,,,,,...,,,,,,,,,,
1,1712978097-2,https://fantasydata.com/nba/fantasy-basketball...,2.0,Anthony Davis\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,
2,1712978097-3,https://fantasydata.com/nba/fantasy-basketball...,3.0,Trae Young\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,
3,1712978097-4,https://fantasydata.com/nba/fantasy-basketball...,4.0,Jayson Tatum\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,
4,1712978097-5,https://fantasydata.com/nba/fantasy-basketball...,5.0,Ja Morant\n \n \n \n \n ...,,,,,,,...,,,,,,,,,,


In [295]:
def clean_data(df):
    # Split the DataFrame into two parts
    metadata_df = df[~df['PTS'].isna()].copy()
    stats_df = df[~df['Name'].isna()].copy()

    # Reset index for alignment
    metadata_df.reset_index(drop=True, inplace=True)
    stats_df.reset_index(drop=True, inplace=True)

    # Fill NaN values with corresponding stats data
    for column in stats_df.columns:
        if column in metadata_df.columns:
            metadata_df[column].fillna(stats_df[column], inplace=True)

    return metadata_df.copy()

In [296]:
def extract_date(url):
    from urllib.parse import urlparse, parse_qs
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    return query_params.get('date', [None])[0]

In [297]:
def calculate_fantasy_points(row):
    # Points scored
    points_scored = row['pts'] * 0.5
    # Rebounds, Assists, Steals, and Blocks
    rebound_points = (row['oreb'] + row['dreb']) * 1
    assist_points = row['ast'] * 1
    steal_points = row['stl'] * 2
    block_points = row['blk'] * 2
    # Turnovers
    turnover_points = row['tov'] * -1
    # Three-points made
    three_points = row['3pm'] * 0.5
    # Double-Double and Triple-Double
    double_digits = sum(i >= 10 for i in [row['pts'], row['oreb'] + row['dreb'], row['ast'], row['stl'], row['blk']])
    double_double_points = 1.0 if double_digits >= 2 else 0
    triple_double_points = 2.0 if double_digits >= 3 else 0
    # 40+ and 50+ bonuses
    fourty_pt_bonus = 2.0 * (row['pts'] >= 40)
    fifty_pt_bonus = 2.0 * (row['pts'] >= 50)
    # Total fantasy points
    total_points = (points_scored + rebound_points + assist_points +
                    steal_points + block_points + turnover_points + three_points +
                    double_double_points + triple_double_points +
                    fourty_pt_bonus + fifty_pt_bonus)
    return total_points

In [298]:
predictor_website_df = clean_data(dirty_predictor_website_df)

# Clean 'Name' column
predictor_website_df['Name'] = predictor_website_df['Name'].str.extract(r'([^\n]+)')

# Extract 'Date' from URL
predictor_website_df['Date'] = predictor_website_df['web-scraper-start-url'].apply(extract_date)
predictor_website_df['Date'] = pd.to_datetime(predictor_website_df['Date'], format='%m-%d-%Y')

# Split REB into OREB and DREB for our convention
predictor_website_df['OREB'] = 0
predictor_website_df['DREB'] = predictor_website_df['REB']

predictor_website_df.head()

Unnamed: 0,web-scraper-order,web-scraper-start-url,Rank,Name,Team,Pos,OPP,PTS,REB,AST,...,FTM,2PM,3PM,Turnover,MIN,FPts,page,Date,OREB,DREB
0,1712978097-51,https://fantasydata.com/nba/fantasy-basketball...,1.0,Shai Gilgeous-Alexander,OKC,PG,BKN,31.22,5.31,5.79,...,8.97,9.79,0.89,2.33,34.0,49.47,,2023-12-31,0,5.31
1,1712978097-52,https://fantasydata.com/nba/fantasy-basketball...,2.0,Anthony Davis,LAL,C,NO,23.53,12.35,2.85,...,5.65,8.4,0.36,1.98,33.0,47.43,,2023-12-31,0,12.35
2,1712978097-53,https://fantasydata.com/nba/fantasy-basketball...,3.0,Trae Young,ATL,PG,WAS,26.57,3.2,10.65,...,7.24,5.66,2.67,3.68,34.0,45.77,,2023-12-31,0,3.2
3,1712978097-54,https://fantasydata.com/nba/fantasy-basketball...,4.0,Jayson Tatum,BOS,SF,SA,26.66,9.0,4.64,...,5.59,6.2,2.89,2.35,35.0,45.39,,2023-12-31,0,9.0
4,1712978097-55,https://fantasydata.com/nba/fantasy-basketball...,5.0,Ja Morant,MEM,PG,SAC,25.09,6.12,8.3,...,5.47,7.59,1.48,2.91,33.0,45.09,,2023-12-31,0,6.12


In [299]:
# Mapping of current column names to the column names we have
column_mapping = {
    'Name': 'player',
    'Team': 'team',
    'Date': 'gamedate',
    'PTS': 'pts',
    'OREB': 'oreb',
    'DREB': 'dreb',
    'AST': 'ast',
    'BLK': 'blk',
    'STL': 'stl',
    '3PM': '3pm',
    'Turnover': 'tov',
    'MIN': 'minuters'
}
predictor_website_df = predictor_website_df[list(column_mapping.keys())]
predictor_website_df.rename(columns=column_mapping, inplace=True)

# Calculate fantasy points, the way we defined them
predictor_website_df['fp'] = predictor_website_df.apply(calculate_fantasy_points, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predictor_website_df.rename(columns=column_mapping, inplace=True)


In [300]:
predictor_website_df.head(215)

Unnamed: 0,player,team,gamedate,pts,oreb,dreb,ast,blk,stl,3pm,tov,minuters,fp
0,Shai Gilgeous-Alexander,OKC,2023-12-31,31.22,0,5.31,5.79,0.89,1.87,0.89,2.33,34.0,30.345
1,Anthony Davis,LAL,2023-12-31,23.53,0,12.35,2.85,2.46,0.93,0.36,1.98,33.0,32.945
2,Trae Young,ATL,2023-12-31,26.57,0,3.20,10.65,0.17,1.36,2.67,3.68,34.0,28.850
3,Jayson Tatum,BOS,2023-12-31,26.66,0,9.00,4.64,0.61,1.05,2.89,2.35,35.0,29.385
4,Ja Morant,MEM,2023-12-31,25.09,0,6.12,8.30,0.27,1.29,1.48,2.91,33.0,27.915
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,Markelle Fultz,PHO,2023-12-31,0.00,0,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.000
211,Jonathan Isaac,DAL,2023-12-30,32.62,0,8.55,8.26,0.47,1.29,3.35,3.34,36.0,34.975
212,Damion Lee,LAL,2023-12-30,23.53,0,12.35,2.85,2.46,0.93,0.36,1.98,33.0,32.945
213,Nassir Little,LAL,2023-12-30,23.16,0,7.71,6.29,0.77,1.06,1.96,2.47,32.0,27.750


In [301]:
len(predictor_website_df)

5578