In [1]:
import requests
import pandas as pd

In [2]:
general_cols = ['name', 'height', 'weight', 'overall', 'position', 'secondary_position']
shooting_cols = ['shot_close', 'shot_mid', 'shot_3pt', 'shot_iq', 'free_throw', 'offensive_consistency']
inside_scoring_cols = ['driving_layup','standing_dunk','driving_dunk','draw_foul','post_moves','post_hook','post_fade','hands']
athleticism_cols = ['speed','acceleration','vertical','strength','stamina','hustle']
playmaking_cols = ['speed_with_ball','ball_handle','passing_accuracy','passing_vision','passing_iq']
defense_cols = ['interior_defense','perimeter_defense','help_defense_iq','lateral_quickness','pass_perception','steal','block','defensive_consistency']
rebounding_cols = ['offensive_rebound','defensive_rebound']
potential_cols = ['potential', 'intangibles']

cols = ['collection']+ general_cols + shooting_cols + inside_scoring_cols + athleticism_cols + playmaking_cols + defense_cols + rebounding_cols + potential_cols

Get player urls to extract 2k ratings, add urls to dataframe

In [None]:
# URL to scrape
from time import sleep

for year in [20,21,22,23]:
    df = pd.DataFrame()
    page = 1
    while True:
        url = f'https://2kdb.net/api/players/{year}/%7B%22freeAgents%22:false,%22page%22:%22{page}%22,%22pageSize%22:%2250%22,%22version%22:%2224%22%7D'
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        data = response.json()
        if len(data['players']) == 0:
            print(f'Finished scraping 2k{year}, {page} pages & {len(df)} of {data["totalPlayers"]} players scraped.')
            break
        df = pd.concat([df, pd.DataFrame(data['players'])])
        page += 1
        sleep(1)

    # Save to CSV
    df.to_csv(f'2k{year}.csv', index=False)

In [7]:
import os

# Collections to keep
collections = {
    20: '20 Current',
    21: '21 Current NBA',
    22: '22 NBA: Series 1',
    23: '\'23 NBA: Series 1'
}

data_folder = os.path.join(os.path.dirname(os.getcwd()), 'data')

# Do this for all 2k games
for year in collections.keys():
    df = pd.read_csv(os.path.join(data_folder, 'raw','2k ratings', f'2k{year}.csv'))

    # Keep only the columns we want
    df = df.drop(columns=[col for col in df.columns if col not in cols])

    # Keep only the rows in the collections we want
    df = df[df['collection'] == collections[year]]
    df = df.drop(columns=['collection'])

    # Aggregate related columns
    df['shooting'] = df[shooting_cols].mean(axis=1).astype(int)
    df['inside_scoring'] = df[inside_scoring_cols].mean(axis=1).astype(int)
    df['athleticism'] = df[athleticism_cols].mean(axis=1).astype(int)
    df['playmaking'] = df[playmaking_cols].mean(axis=1).astype(int)
    df['defense'] = df[defense_cols].mean(axis=1).astype(int)
    df['rebounding'] = df[rebounding_cols].mean(axis=1).astype(int)
    df['potential'] = df[potential_cols].mean(axis=1).astype(int)

    # Drop the original columns
    df = df.drop(columns=shooting_cols + inside_scoring_cols + athleticism_cols + playmaking_cols + defense_cols + rebounding_cols + potential_cols)
    
    df.to_csv(os.path.join(data_folder, 'clean','2k ratings',f'2k{year}_clean.csv'), index=False)


  df = pd.read_csv(os.path.join(data_folder, 'raw','2k ratings', f'2k{year}.csv'))
  df = pd.read_csv(os.path.join(data_folder, 'raw','2k ratings', f'2k{year}.csv'))
