In [8]:
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import OutputType
import pandas as pd
import numpy as np
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup, Comment
import time
from datetime import datetime
import csv
from collections import defaultdict
import matplotlib.pyplot as plt
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report,confusion_matrix
#from basketball_reference_web_scraper import 

In [2]:
TEAM_NAME_TO_ABR = {
    "ATLANTA HAWKS": 'ATL',
    "BOSTON CELTICS": 'BOS',
    "BROOKLYN NETS": 'BRK',
    "CHARLOTTE HORNETS": 'CHO',
    "CHICAGO BULLS": 'CHI',
    "CLEVELAND CAVALIERS": 'CLE',
    "DALLAS MAVERICKS": 'DAL',
    "DENVER NUGGETS": 'DEN',
    "DETROIT PISTONS": 'DET',
    "GOLDEN STATE WARRIORS": 'GSW',
    "HOUSTON ROCKETS": 'HOU',
    "INDIANA PACERS": 'IND',
    "LOS ANGELES CLIPPERS": 'LAC',
    "LOS ANGELES LAKERS": 'LAL',
    "MEMPHIS GRIZZLIES": 'MEM',
    "MIAMI HEAT": 'MIA',
    "MILWAUKEE BUCKS": 'MIL',
    "MINNESOTA TIMBERWOLVES": 'MIN',
    "NEW ORLEANS PELICANS": 'NOP',
    "NEW YORK KNICKS": 'NYK',
    "OKLAHOMA CITY THUNDER": 'OKC',
    "ORLANDO MAGIC": 'ORL',
    "PHILADELPHIA 76ERS": 'PHI',
    "PHOENIX SUNS": 'PHO',
    "PORTLAND TRAIL BLAZERS": 'POR',
    "SACRAMENTO KINGS": 'SAC',
    "SAN ANTONIO SPURS": 'SAS',
    "TORONTO RAPTORS": 'TOR',
    "UTAH JAZZ": 'UTA',
    "WASHINGTON WIZARDS": 'WAS',

    # DEPRECATED TEAMS
    # "CHARLOTTE BOBCATS": 'CHA',
    # "KANSAS CITY KINGS": 'KCK',
    # "NEW JERSEY NETS": 'NJN',
    # "NEW ORLEANS HORNETS": 'NOH',
    # "NEW ORLEANS/OKLAHOMA CITY HORNETS": 'NOK',
    # "SEATTLE SUPERSONICS": 'SEA',
    # "ST. LOUIS HAWKS": 'STL',
    # "VANCOUVER GRIZZLIES": 'VAN',
    # "WASHINGTON BULLETS": 'WSB',
}

MONTH_TO_NUM = {
    "Jan" : "01",
    "Feb" : "02",
    "Mar" : "03",
    "Apr" : "04",
    "May" : "05",
    "Jun" : "06",
    "Sep" : "09",
    "Oct" : "10",
    "Nov" : "11",
    "Dec" : "12",
}

TEAM_STATS = defaultdict(list)

TEAM_ON_OFF = defaultdict(dict)


# get team stats for a given season from https://www.basketball-reference.com/teams/
def get_team_stats(team_abr,end_year):
    '''
    team_abr: string abbreviation from basketball-reference
    end_year: string representing end year of season to get stats
    '''
    URL = "https://www.basketball-reference.com/teams/{team_abr}/{end_year}.html".format(team_abr=team_abr,end_year=end_year)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find('div',id='all_team_misc')
    dec = results.decode_contents()
    new_soup = BeautifulSoup(dec,'lxml')
    comment = new_soup.find(text=lambda text:isinstance(text, Comment))
    com_soup = BeautifulSoup(comment,'lxml')
    table = com_soup.find_all('td')

    team_res = []

    # MOV	SOS	SRS	ORtg	DRtg	Pace	FTr	3PAr	eFG%	TOV%	ORB%	FT/FGA	eFG%	TOV%	DRB%	FT/FGA	
    for i in range(4,20):
        team_res.append(float(table[i].text))

    return team_res
 

# populate team stats based on season year
def pop_team_stats(end_year):
    for abr in TEAM_NAME_TO_ABR.values():
        TEAM_STATS[abr] = get_team_stats(abr,end_year)


# get team on off stats
def get_on_off(team_abr,end_year):
    url = ("https://www.basketball-reference.com/teams/{team_abr}/{end_year}/on-off/").format(team_abr=team_abr,end_year=end_year)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find('table',id='on_off')
    on_off_list = list(results.find_all('th',scope='row'))
    on_off_dict = defaultdict(list)

    for i in range(0,len(on_off_list)-2,3):
        name = on_off_list[i].text.upper()
        on_off_dict[name] = list(on_off_list[i+2].previous.stripped_strings)[1:]
    
    return on_off_dict


# populate team on off stats
def pop_team_on_off(end_year):
    for abr in TEAM_NAME_TO_ABR.values():
        TEAM_ON_OFF[abr] = get_on_off(abr,end_year)


def calc_injury_impact(injured,home_abr,away_abr):

    home_injured = injured[home_abr]
    away_injured = injured[away_abr]
    home_stats = TEAM_STATS[home_abr]
    away_stats = TEAM_STATS[away_abr]

    # map available on-off stats to respective index in team stats, ignore rest for now
    # mapping [on_off_idx,team_stats_idx]
    affected_stats_idx = [[1,8],[2,10],[3,14],[8,9],[9,5],[10,3],[18,13]]

    # eFG%	ORB%	DRB%	TRB%	AST%	STL%	BLK%	TOV%	Pace	ORtg	eFG%	ORB%	DRB%	TRB%	AST%	STL%	BLK%	TOV%	Pace	ORtg
    # 8      10       14    NAN     NAN      NAN     NAN     9       5       3       NAN     NAN     NAN     NAN     NAN     NAN     NAN     13      NAN     NAN

    for player in home_injured:
        on_off = TEAM_ON_OFF[home_abr][player]
        weight = float(on_off[0].strip('%'))/100
        
        for pair in affected_stats_idx:
            home_stats[pair[1]] -= (weight * on_off[pair[0]])

    for player in away_injured:
        on_off = TEAM_ON_OFF[away_abr][player]
        weight = float(on_off[0].strip('%'))/100
        
        for pair in affected_stats_idx:
            away_stats[pair[1]] -= (weight * on_off[pair[0]])

    return home_stats,away_stats


def check_injured(box_score_page,home_abr,away_abr):
    '''
    checks list of injured players for a given game and returns a dict mapping teams to injured player
    end_year: string representing end year of NBA season
    box_score_page: string representing URL for a given game
    home_team: string abbreviation of home team
    away_team: string abbreviation of away team
    '''
    page = requests.get(box_score_page)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all('strong')
    player_links = results[3].previous.find_all('a')

    curr_team = " "
    players_dict = {home_abr : [], away_abr : []}

    for link in player_links:
        player_name = link.text.upper()
        prev = list(link.previous_sibling.stripped_strings)

        if prev[0] == home_abr:
            curr_team = home_abr

        elif prev[0] == away_abr:
            curr_team = away_abr

        players_dict[curr_team].append(player_name)

    return calc_injury_impact(players_dict,home_abr,away_abr)


def generate_features(end_year,file_name):
    '''
    returns lists containing features, samples
    end_year: string representing end year of NBA season
    file_name: name of CSV containing games for given season in old_games folder
    '''
    features = []
    samples = []
    file_path = "old_games/{file_name}".format(file_name=file_name)
    # construct data set, consisting of team misc stats as features and win/loss as samples
    with open(file_path, mode='r') as f:
        lines = csv.reader(f)
        for date,away_team,away_pt,home_team,home_pt in lines:
            date_list = date.split()
            month = MONTH_TO_NUM[date_list[1]]
            day = date_list[2]
            year = date_list[3]
            results = [1,0] if away_pt > home_pt else [0,1]
            
            # get box score page
            box_score_page = "https://www.basketball-reference.com/boxscores/{YEAR}{MO}{DA}0{HOME}.html".format(YEAR=year,MO=month,DA=day,HOME=TEAM_NAME_TO_ABR[home_team.upper()])
            home_stats,away_stats = check_injured(box_score_page,home_team,away_team)
            
            features.append(np.subtract(away_stats,home_stats))
            samples.append(results)
    return features,samples


# TODO : add end year of season into here somewhere
def get_injuries_for_games(file_name,end_year):
    file_path = '{file_name}'.format(file_name=file_name)
    with open(file_path, mode='r') as f:
        lines = csv.reader(f)
        for date,away_team,away_pt,home_team,home_pt in lines:
            date_list = date.split()
            month = MONTH_TO_NUM[date_list[1]]
            day = date_list[2]
            year = date_list[3]
            winner = away_team if away_pt > home_pt else home_team
            # get box score page
            box_score_page = "https://www.basketball-reference.com/boxscores/{YEAR}{MO}{DA}0{HOME}.html".format(YEAR=year,MO=month,DA=day,HOME=TEAM_NAME_TO_ABR[home_team.upper()])
            home_stats,away_stats = check_injured(box_score_page,home_team,away_team)
            
    return None

In [None]:
page = requests.get("https://www.basketball-reference.com/teams/BRK/2022/on-off/")
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find('table',id='on_off')
on_off_list = list(results.find_all('th',scope='row'))
on_off_list[0].text = name
list(on_off_list[2].previous.stripped_strings)
#results = soup.find('table',id='on_off')
#on -off, bos , everythin else
#player_link = link.attrs['href'][0:-5]
#player_on_off = "https://www.basketball-reference.com{player_link}/on-off/{year}".format(player_link=player_link,year=end_year)
list(results.find_all('th',scope='row')[2].previous.stripped_strings)

In [6]:
page = requests.get("https://www.basketball-reference.com/boxscores/202110200NOP.html")
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find_all('strong')
player_links = results[3].previous.find_all('a')

In [7]:
[0] * 8

[0, 0, 0, 0, 0, 0, 0, 0]

## 2016-2023 NBA Seasons

### 0. Generate features and samples using date specific stats

In [None]:
# https://www.basketball-reference.com/friv/standings.fcgi?month=1&day=14&year=2024&lg_id=NBA
# https://www.basketball-reference.com/boxscores/202212100GSW.html
# https://www.basketball-reference.com/boxscores/YEARMODA0WIN.html

### 1. Generate features and samples for each season one at a time, to avoid BBALL reference blocking scraper

In [2]:
features_norm = []
samples = []

pop_team_stats('2017')
features_2017,samples_2017 = generate_features('2017','2016-2017_season.csv')
features_2017_norm = [[float(i)/sum(j) for i in j ]for j in features_2017]

features_norm.extend(features_2017_norm)
samples.extend(samples_2017)

In [3]:
pop_team_stats('2018')
features_2018,samples_2018 = generate_features('2018','2017-2018_season.csv')
features_2018_norm = [[float(i)/sum(j) for i in j ]for j in features_2018]

features_norm.extend(features_2018_norm)
samples.extend(samples_2018)

In [6]:
pop_team_stats('2019')
features_2019,samples_2019 = generate_features('2019','2018-2019_season.csv')
features_2019_norm = [[float(i)/sum(j) for i in j ]for j in features_2019]

features_norm.extend(features_2019_norm)
samples.extend(samples_2019)

In [7]:
pop_team_stats('2020')
features_2020,samples_2020 = generate_features('2020','2019-2020_season.csv')
features_2020_norm = [[float(i)/sum(j) for i in j ]for j in features_2020]

features_norm.extend(features_2020_norm)
samples.extend(samples_2020)

In [8]:
pop_team_stats('2021')
features_2021,samples_2021 = generate_features('2021','2020-2021_season.csv')
features_2021_norm = [[float(i)/sum(j) for i in j ]for j in features_2021]

features_norm.extend(features_2021_norm)
samples.extend(samples_2021)

In [9]:
pop_team_stats('2022')
features_2022,samples_2022 = generate_features('2022','2021-2022_season.csv')
features_2022_norm = [[float(i)/sum(j) for i in j ]for j in features_2022]

features_norm.extend(features_2022_norm)
samples.extend(samples_2022)

In [10]:
pop_team_stats('2023')
features_2023,samples_2023 = generate_features('2023','2022-2023_season.csv')
features_2023_norm = [[float(i)/sum(j) for i in j ]for j in features_2023]

features_norm.extend(features_2023_norm)
samples.extend(samples_2023)

In [2]:
samples_1d = np.genfromtxt('old_samps_feats/2017-2023_nba_samples_1d.csv',delimiter=',')
features_norm = np.genfromtxt('old_samps_feats/2017-2023_nba_features_norm.csv',delimiter=',')

In [3]:
#samples_1d = [0 if j[0] == 0 else 1 for j in samples]
feat_train, feat_test, samp_train, samp_test = train_test_split(features_norm,samples_1d, test_size=0.30, random_state=1)

In [12]:
#solver = lbfgs might be good
#mlp = MLPClassifier(hidden_layer_sizes=(16,32,16), alpha=1, learning_rate_init=0.025, activation='relu', solver='lbfgs', max_iter=10000)
#mlp = MLPClassifier(hidden_layer_sizes=(16,32,64,64,32,16), alpha=0.075, learning_rate='invscaling', activation='tanh', solver='lbfgs',max_iter=5000)
mlp = MLPClassifier(hidden_layer_sizes=(16,32,64,64,32,16), alpha=0.075, learning_rate='invscaling', activation='tanh', solver='lbfgs',max_iter=10000)
mlp.fit(feat_train,samp_train)

predict_train = mlp.predict(feat_train)
predict_test = mlp.predict(feat_test)

print('TN, FP, FN, TP')
print(confusion_matrix(samp_train,predict_train).ravel())
print(classification_report(samp_train,predict_train))
print('TN, FP, FN, TP')
print(confusion_matrix(samp_test,predict_test).ravel())
print(classification_report(samp_test,predict_test))

TN, FP, FN, TP
[2574  692  758 2199]
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      3266
           1       0.76      0.74      0.75      2957

    accuracy                           0.77      6223
   macro avg       0.77      0.77      0.77      6223
weighted avg       0.77      0.77      0.77      6223

TN, FP, FN, TP
[692 655 710 610]
              precision    recall  f1-score   support

           0       0.49      0.51      0.50      1347
           1       0.48      0.46      0.47      1320

    accuracy                           0.49      2667
   macro avg       0.49      0.49      0.49      2667
weighted avg       0.49      0.49      0.49      2667



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [34]:
mlp = MLPClassifier(hidden_layer_sizes=(18,36,72,36,18), alpha=0.0001, learning_rate='invscaling', activation='tanh', solver='adam',max_iter=10000)
mlp.fit(feat_train,samp_train)

predict_train = mlp.predict(feat_train)
predict_test = mlp.predict(feat_test)

print('TN, FP, FN, TP')
print(confusion_matrix(samp_train,predict_train).ravel())
print(classification_report(samp_train,predict_train))
print('TN, FP, FN, TP')
print(confusion_matrix(samp_test,predict_test).ravel())
print(classification_report(samp_test,predict_test))

TN, FP, FN, TP
[2286  980 1160 1797]
              precision    recall  f1-score   support

           0       0.66      0.70      0.68      3266
           1       0.65      0.61      0.63      2957

    accuracy                           0.66      6223
   macro avg       0.66      0.65      0.65      6223
weighted avg       0.66      0.66      0.66      6223

TN, FP, FN, TP
[745 602 737 583]
              precision    recall  f1-score   support

           0       0.50      0.55      0.53      1347
           1       0.49      0.44      0.47      1320

    accuracy                           0.50      2667
   macro avg       0.50      0.50      0.50      2667
weighted avg       0.50      0.50      0.50      2667



In [32]:
arcs = [(128,256,128),(64,128,128,64),(128,256,256,128)]
for arc in arcs:
    mlp = MLPClassifier(hidden_layer_sizes=arc, alpha=0.0075, learning_rate_init=0.001, activation='tanh', solver='adam', epsilon=0.0000001, max_iter=10000)
    mlp.fit(feat_train,samp_train)
    print(arc)
    predict_train = mlp.predict(feat_train)
    predict_test = mlp.predict(feat_test)

    print('TN, FP, FN, TP')
    print(confusion_matrix(samp_train,predict_train).ravel())
    print(classification_report(samp_train,predict_train))
    print('TN, FP, FN, TP')
    print(confusion_matrix(samp_test,predict_test).ravel())
    print(classification_report(samp_test,predict_test))


(128, 256, 128)
TN, FP, FN, TP
[2545  721  907 2050]
              precision    recall  f1-score   support

         0.0       0.74      0.78      0.76      3266
         1.0       0.74      0.69      0.72      2957

    accuracy                           0.74      6223
   macro avg       0.74      0.74      0.74      6223
weighted avg       0.74      0.74      0.74      6223

TN, FP, FN, TP
[743 604 728 592]
              precision    recall  f1-score   support

         0.0       0.51      0.55      0.53      1347
         1.0       0.49      0.45      0.47      1320

    accuracy                           0.50      2667
   macro avg       0.50      0.50      0.50      2667
weighted avg       0.50      0.50      0.50      2667

(64, 128, 128, 64)
TN, FP, FN, TP
[2518  748  814 2143]
              precision    recall  f1-score   support

         0.0       0.76      0.77      0.76      3266
         1.0       0.74      0.72      0.73      2957

    accuracy                           0

In [40]:
mlp = MLPClassifier(hidden_layer_sizes=(64,128,64), alpha=0.0001, learning_rate_init=0.001, activation='tanh', solver='adam', epsilon=0.0000000001, max_iter=10000)
mlp.fit(feat_train,samp_train)

predict_train = mlp.predict(feat_train)
predict_test = mlp.predict(feat_test)

print('TN, FP, FN, TP')
print(confusion_matrix(samp_train,predict_train).ravel())
print(classification_report(samp_train,predict_train))
print('TN, FP, FN, TP')
print(confusion_matrix(samp_test,predict_test).ravel())
print(classification_report(samp_test,predict_test))

TN, FP, FN, TP
[2489  777  836 2121]
              precision    recall  f1-score   support

         0.0       0.75      0.76      0.76      3266
         1.0       0.73      0.72      0.72      2957

    accuracy                           0.74      6223
   macro avg       0.74      0.74      0.74      6223
weighted avg       0.74      0.74      0.74      6223

TN, FP, FN, TP
[705 642 662 658]
              precision    recall  f1-score   support

         0.0       0.52      0.52      0.52      1347
         1.0       0.51      0.50      0.50      1320

    accuracy                           0.51      2667
   macro avg       0.51      0.51      0.51      2667
weighted avg       0.51      0.51      0.51      2667



In [6]:
mlp = MLPClassifier(hidden_layer_sizes=(64,128,64), alpha=0.01, learning_rate_init=0.001, activation='tanh', solver='lbfgs', max_iter=10000)
mlp.fit(feat_train,samp_train)

predict_train = mlp.predict(feat_train)
predict_test = mlp.predict(feat_test)

print('TN, FP, FN, TP')
print(confusion_matrix(samp_train,predict_train).ravel())
print(classification_report(samp_train,predict_train))
print('TN, FP, FN, TP')
print(confusion_matrix(samp_test,predict_test).ravel())
print(classification_report(samp_test,predict_test))

TN, FP, FN, TP
[2558  708  737 2220]
              precision    recall  f1-score   support

         0.0       0.78      0.78      0.78      3266
         1.0       0.76      0.75      0.75      2957

    accuracy                           0.77      6223
   macro avg       0.77      0.77      0.77      6223
weighted avg       0.77      0.77      0.77      6223

TN, FP, FN, TP
[700 647 698 622]
              precision    recall  f1-score   support

         0.0       0.50      0.52      0.51      1347
         1.0       0.49      0.47      0.48      1320

    accuracy                           0.50      2667
   macro avg       0.50      0.50      0.50      2667
weighted avg       0.50      0.50      0.50      2667



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [64]:
np.savetxt('2017-2023_nba_features_norm.csv', features_norm, delimiter=',')
np.savetxt('2017-2023_nba_samples_1d.csv', samples_1d, delimiter=',')

## 2022-2023 NBA Season

In [9]:
pop_team_stats('2023')
features,samples = generate_features('2023','2022-2023_season.csv')

In [10]:
features_norm = [[float(i)/sum(j) for i in j ]for j in features]

In [170]:
#feat_train, feat_test, samp_train, samp_test = train_test_split(features_norm,samples, test_size=0.30, random_state=1)
feat_train, feat_test, samp_train, samp_test = train_test_split(features,samples, test_size=0.30, random_state=1)

In [178]:
arcs = [(4,4,4,4),(4,8,8,8,4),(8,8,8,8,8),(8,16,16,16,8),(16,16,16,16,16)]
for arc in arcs:
    mlp = MLPClassifier(hidden_layer_sizes=arc, alpha=0.0025, learning_rate='invscaling', activation='tanh', solver='lbfgs',max_iter=10000)
    mlp.fit(feat_train,samp_train)
    print(arc)
    predict_train = mlp.predict(feat_train)
    predict_test = mlp.predict(feat_test)
    print(classification_report(samp_train,predict_train))
    print(classification_report(samp_test,predict_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


(4, 4, 4, 4)
              precision    recall  f1-score   support

           0       0.67      0.55      0.60       427
           1       0.66      0.77      0.71       497

   micro avg       0.67      0.67      0.67       924
   macro avg       0.67      0.66      0.66       924
weighted avg       0.67      0.67      0.66       924
 samples avg       0.67      0.67      0.67       924

              precision    recall  f1-score   support

           0       0.51      0.46      0.48       174
           1       0.61      0.65      0.63       222

   micro avg       0.57      0.57      0.57       396
   macro avg       0.56      0.55      0.55       396
weighted avg       0.56      0.57      0.56       396
 samples avg       0.57      0.57      0.57       396



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


(4, 8, 8, 8, 4)
              precision    recall  f1-score   support

           0       0.62      0.64      0.63       427
           1       0.68      0.67      0.67       497

   micro avg       0.65      0.65      0.65       924
   macro avg       0.65      0.65      0.65       924
weighted avg       0.66      0.65      0.66       924
 samples avg       0.65      0.65      0.65       924

              precision    recall  f1-score   support

           0       0.49      0.48      0.49       174
           1       0.60      0.61      0.61       222

   micro avg       0.56      0.56      0.56       396
   macro avg       0.55      0.55      0.55       396
weighted avg       0.55      0.56      0.55       396
 samples avg       0.56      0.56      0.56       396



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(8, 8, 8, 8, 8)
              precision    recall  f1-score   support

           0       0.80      0.74      0.77       427
           1       0.80      0.83      0.81       497

   micro avg       0.80      0.79      0.79       924
   macro avg       0.80      0.79      0.79       924
weighted avg       0.80      0.79      0.79       924
 samples avg       0.79      0.79      0.79       924

              precision    recall  f1-score   support

           0       0.46      0.47      0.46       174
           1       0.58      0.57      0.57       222

   micro avg       0.52      0.52      0.52       396
   macro avg       0.52      0.52      0.52       396
weighted avg       0.53      0.52      0.52       396
 samples avg       0.52      0.52      0.52       396



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))


(8, 16, 16, 16, 8)
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       427
           1       0.87      0.86      0.86       497

   micro avg       0.85      0.85      0.85       924
   macro avg       0.85      0.85      0.85       924
weighted avg       0.85      0.85      0.85       924
 samples avg       0.85      0.85      0.85       924

              precision    recall  f1-score   support

           0       0.42      0.40      0.41       174
           1       0.54      0.56      0.55       222

   micro avg       0.49      0.49      0.49       396
   macro avg       0.48      0.48      0.48       396
weighted avg       0.49      0.49      0.49       396
 samples avg       0.49      0.49      0.49       396

(16, 16, 16, 16, 16)
              precision    recall  f1-score   support

           0       0.85      0.86      0.85       427
           1       0.89      0.85      0.87       497

   micro avg       0.87      0.85 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [201]:
#solver = lbfgs might be good
#mlp = MLPClassifier(hidden_layer_sizes=(16,32,16), alpha=1, learning_rate_init=0.025, activation='relu', solver='lbfgs', max_iter=10000)
mlp = MLPClassifier(hidden_layer_sizes=(16,32,64,64,32,16), alpha=0.05, learning_rate='invscaling', activation='tanh', solver='lbfgs',max_iter=5000)
mlp.fit(feat_train,samp_train)

predict_train = mlp.predict(feat_train)
predict_test = mlp.predict(feat_test)

print(classification_report(samp_train,predict_train))
print(classification_report(samp_test,predict_test))

              precision    recall  f1-score   support

           0       0.91      0.80      0.85       427
           1       0.93      0.82      0.87       497

   micro avg       0.92      0.81      0.86       924
   macro avg       0.92      0.81      0.86       924
weighted avg       0.92      0.81      0.86       924
 samples avg       0.81      0.81      0.81       924

              precision    recall  f1-score   support

           0       0.48      0.50      0.49       174
           1       0.59      0.55      0.57       222

   micro avg       0.53      0.53      0.53       396
   macro avg       0.53      0.53      0.53       396
weighted avg       0.54      0.53      0.53       396
 samples avg       0.53      0.53      0.53       396



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [203]:
#solver = lbfgs might be good
#mlp = MLPClassifier(hidden_layer_sizes=(16,32,16), alpha=1, learning_rate_init=0.025, activation='relu', solver='lbfgs', max_iter=10000)
mlp = MLPClassifier(hidden_layer_sizes=(16,32,64,64,32,16), alpha=0.025, learning_rate='invscaling', activation='tanh', solver='lbfgs',max_iter=5000)
mlp.fit(feat_train,samp_train)

predict_train = mlp.predict(feat_train)
predict_test = mlp.predict(feat_test)

print(classification_report(samp_train,predict_train))
print(classification_report(samp_test,predict_test))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87       427
           1       0.86      0.90      0.88       497

   micro avg       0.85      0.90      0.87       924
   macro avg       0.85      0.90      0.87       924
weighted avg       0.85      0.90      0.87       924
 samples avg       0.84      0.90      0.86       924

              precision    recall  f1-score   support

           0       0.50      0.56      0.53       174
           1       0.61      0.58      0.59       222

   micro avg       0.56      0.57      0.56       396
   macro avg       0.55      0.57      0.56       396
weighted avg       0.56      0.57      0.56       396
 samples avg       0.56      0.57      0.56       396



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [213]:
#solver = lbfgs might be good
#mlp = MLPClassifier(hidden_layer_sizes=(16,32,16), alpha=1, learning_rate_init=0.025, activation='relu', solver='lbfgs', max_iter=10000)
mlp = MLPClassifier(hidden_layer_sizes=(16,32,64,64,32,16), alpha=0.075, learning_rate='invscaling', activation='tanh', solver='lbfgs',max_iter=10000)
mlp.fit(feat_train,samp_train)

predict_train = mlp.predict(feat_train)
predict_test = mlp.predict(feat_test)

print(classification_report(samp_train,predict_train))
print(classification_report(samp_test,predict_test))

              precision    recall  f1-score   support

           0       0.90      0.81      0.85       427
           1       0.93      0.82      0.87       497

   micro avg       0.91      0.82      0.86       924
   macro avg       0.91      0.82      0.86       924
weighted avg       0.91      0.82      0.86       924
 samples avg       0.82      0.82      0.82       924

              precision    recall  f1-score   support

           0       0.49      0.49      0.49       174
           1       0.60      0.59      0.59       222

   micro avg       0.55      0.54      0.55       396
   macro avg       0.54      0.54      0.54       396
weighted avg       0.55      0.54      0.55       396
 samples avg       0.54      0.54      0.54       396



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 2021 - 2023 Combined

In [4]:
#get features and samples for each season, combine into features and samples array
pop_team_stats('2021')
features_2021,samples_2021 = generate_features('2021','2020-2021_season.csv')

In [5]:
pop_team_stats('2022')
features_2022,samples_2022 = generate_features('2022','2021-2022_season.csv')

In [6]:
pop_team_stats('2023')
features_2023,samples_2023 = generate_features('2023','2022-2023_season.csv')

In [7]:
features = []
samples = []
features.extend(features_2021)
features.extend(features_2022)
features.extend(features_2023)
samples.extend(samples_2021)
samples.extend(samples_2022)
samples.extend(samples_2023)

#normalize?
#features_norm = [[float(i)/sum(j) for i in j ]for j in features]

In [78]:
features_norm = [[float(i)/sum(j) for i in j ]for j in features]
samples_1d = [0 if j[0] == 0 else 1 for j in samples]
feat_train, feat_test, samp_train, samp_test = train_test_split(features_norm,samples_1d, test_size=0.30, random_state=1)
#feat_train, feat_test, samp_train, samp_test = train_test_split(features_norm,samples, test_size=0.30, random_state=1)
#feat_train, feat_test, samp_train, samp_test = train_test_split(features,samples, test_size=0.30, random_state=1)

In [55]:
#solver = lbfgs might be good
#mlp = MLPClassifier(hidden_layer_sizes=(16,32,16), alpha=1, learning_rate_init=0.025, activation='relu', solver='lbfgs', max_iter=10000)
#mlp = MLPClassifier(hidden_layer_sizes=(16,32,64,64,32,16), alpha=0.075, learning_rate='invscaling', activation='tanh', solver='lbfgs',max_iter=5000)
mlp = MLPClassifier(hidden_layer_sizes=(16,32,64,64,32,16), alpha=0.075, learning_rate='invscaling', activation='tanh', solver='lbfgs',max_iter=5000)
mlp.fit(feat_train,samp_train)

predict_train = mlp.predict(feat_train)
predict_test = mlp.predict(feat_test)

print(classification_report(samp_train,predict_train))
print(classification_report(samp_test,predict_test))

              precision    recall  f1-score   support

           0       0.75      0.74      0.75      1276
           1       0.77      0.77      0.77      1393

   micro avg       0.76      0.76      0.76      2669
   macro avg       0.76      0.76      0.76      2669
weighted avg       0.76      0.76      0.76      2669
 samples avg       0.75      0.76      0.75      2669

              precision    recall  f1-score   support

           0       0.52      0.51      0.52       551
           1       0.56      0.56      0.56       594

   micro avg       0.54      0.54      0.54      1145
   macro avg       0.54      0.54      0.54      1145
weighted avg       0.54      0.54      0.54      1145
 samples avg       0.54      0.54      0.54      1145



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
#solver = lbfgs might be good
#mlp = MLPClassifier(hidden_layer_sizes=(16,32,16), alpha=1, learning_rate_init=0.025, activation='relu', solver='lbfgs', max_iter=10000)
mlp = MLPClassifier(hidden_layer_sizes=(16,32,64,32,16), alpha=0.075, activation='tanh', solver='adam',max_iter=5000, epsilon=0.00000001)
mlp.fit(feat_train,samp_train)

# np_samp_train = np.array(samp_train)
# np_samp_test = np.array(samp_test)
predict_train = mlp.predict(feat_train)
predict_test = mlp.predict(feat_test)

print(classification_report(samp_train,predict_train))
print(classification_report(samp_test,predict_test))

              precision    recall  f1-score   support

           0       0.56      0.34      0.43      1276
           1       0.56      0.77      0.65      1393

   micro avg       0.56      0.57      0.56      2669
   macro avg       0.56      0.56      0.54      2669
weighted avg       0.56      0.57      0.54      2669
 samples avg       0.55      0.57      0.56      2669

              precision    recall  f1-score   support

           0       0.47      0.29      0.36       551
           1       0.51      0.70      0.59       594

   micro avg       0.50      0.50      0.50      1145
   macro avg       0.49      0.49      0.47      1145
weighted avg       0.49      0.50      0.48      1145
 samples avg       0.49      0.50      0.49      1145



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [83]:
#solver = lbfgs might be good
#mlp = MLPClassifier(hidden_layer_sizes=(16,32,16), alpha=1, learning_rate_init=0.025, activation='relu', solver='lbfgs', max_iter=10000)
#mlp = MLPClassifier(hidden_layer_sizes=(16,32,64,64,32,16), alpha=0.075, learning_rate='invscaling', activation='tanh', solver='lbfgs',max_iter=5000)
mlp = MLPClassifier(hidden_layer_sizes=(16,32,64,64,32,16), alpha=0.075, learning_rate='invscaling', activation='tanh', solver='lbfgs',max_iter=10000)
mlp.fit(feat_train,samp_train)

predict_train = mlp.predict(feat_train)
predict_test = mlp.predict(feat_test)

print('TN, FP, FN, TP')
print(confusion_matrix(samp_train,predict_train).ravel())
print(classification_report(samp_train,predict_train))
print('TN, FP, FN, TP')
print(confusion_matrix(samp_test,predict_test).ravel())
print(classification_report(samp_test,predict_test))

TN, FP, FN, TP
[1086  307  336  940]
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      1393
           1       0.75      0.74      0.75      1276

    accuracy                           0.76      2669
   macro avg       0.76      0.76      0.76      2669
weighted avg       0.76      0.76      0.76      2669

TN, FP, FN, TP
[323 271 268 283]
              precision    recall  f1-score   support

           0       0.55      0.54      0.55       594
           1       0.51      0.51      0.51       551

    accuracy                           0.53      1145
   macro avg       0.53      0.53      0.53      1145
weighted avg       0.53      0.53      0.53      1145



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
np.savetxt('2021-2023_nba_features_raw.csv', features, delimiter=',')
np.savetxt('2021-2023_nba_samples_1d.csv', samples_1d, delimiter=',')

In [82]:
print(confusion_matrix(samp_train,predict_train).ravel())

[1086  307  336  940]
