In [5]:
# !pip install nba_api
import time
import ast
from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd
import datetime

import os.path
from os import path
import numpy as np
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from os import listdir
from os.path import isfile, join

In [6]:
def getData(filename, start_date, end_date):
    """Download per-team game logs into ``filename``, resuming if the file exists.

    One LeagueGameFinder request is made per team id. After every request the
    accumulated table is written back to ``filename`` so that an interrupted
    run (e.g. an API timeout) can be resumed by simply calling this again.
    Once every team has been fetched, the OREB_GIVEN_UP and REST columns are
    added via ``off_reb_given_up`` and ``days_rest``.

    Parameters
    ----------
    filename : path of the CSV to create or extend.
    start_date, end_date : passed to LeagueGameFinder as
        ``date_from_nullable`` / ``date_to_nullable``.
    """
    # NOTE(review): assumed to be the highest NBA team id, with ids contiguous
    # from the first entry of teams.get_teams() — confirm against nba_api.
    last_team_id = 1610612766
    first_team_id = teams.get_teams()[0]['id']

    # If the file does not exist, seed it with the games of the first team.
    if not path.exists(filename):
        gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable = start_date , date_to_nullable = end_date , team_id_nullable = first_team_id)
        games = gamefinder.get_data_frames()[0]
        games.to_csv(filename,index=False)

    old_df = pd.read_csv(filename)
    if old_df.empty:
        # Nothing stored yet: fetching the first team again is harmless.
        start_id = first_team_id
    else:
        # Resume after the highest team id already stored. The last row cannot
        # be used for this: error_handle re-sorts the file by team abbreviation,
        # after which the last row no longer holds the most recently fetched team.
        start_id = int(old_df['TEAM_ID'].max()) + 1

    while start_id <= last_team_id:
        # Re-read on every pass so each step works on the CSV-normalised dtypes.
        old_df = pd.read_csv(filename)
        gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable = start_date , date_to_nullable = end_date , team_id_nullable = start_id)
        games = gamefinder.get_data_frames()[0]
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        new_df = pd.concat([old_df, games])
        new_df.to_csv(filename, index=False)
        start_id = start_id + 1

    # Only add OREB_GIVEN_UP once, so calling getData on a finished file is safe.
    if 'OREB_GIVEN_UP' not in pd.read_csv(filename, nrows=0).columns:
        off_reb_given_up(filename)
    days_rest(filename)

In [7]:
def off_reb_given_up(filename):
    """Add an OREB_GIVEN_UP column (the opponent's OREB) to the CSV in place.

    For every game that has exactly two rows (one per team), each row receives
    the OREB value of the *other* row of that game. Games that do not have
    exactly two rows, or where an OREB value is missing, get NaN.

    The column is inserted directly after REB. If it already exists it is
    overwritten, so the function can be run repeatedly on the same file.

    Parameters
    ----------
    filename : CSV containing at least GAME_ID, OREB and REB columns.
    """
    df = pd.read_csv(filename)

    oreb_by_game = df.groupby('GAME_ID')['OREB']
    rows_in_game = df['GAME_ID'].map(df['GAME_ID'].value_counts())
    known_in_game = oreb_by_game.transform('count')

    # With two rows per game, "game total minus own value" is the opponent's value.
    opponent_oreb = oreb_by_game.transform('sum') - df['OREB']
    opponent_oreb = opponent_oreb.where((rows_in_game == 2) & (known_in_game == 2))

    if 'OREB_GIVEN_UP' in df.columns:
        df['OREB_GIVEN_UP'] = opponent_oreb
    else:
        df.insert(loc=df.columns.get_loc("REB") + 1, column="OREB_GIVEN_UP", value=opponent_oreb)
    df.to_csv(filename, index=False)


In [8]:
def days_rest(filename):
    """Add a REST column (time since the team's previous game) to the CSV in place.

    GAME_DATE is parsed to datetimes and, per TEAM_ABBREVIATION, REST is the
    difference between a game's date and that team's chronologically previous
    game. A team's earliest game gets a missing value. The row order of the
    file is left unchanged.

    Parameters
    ----------
    filename : CSV containing at least TEAM_ABBREVIATION and GAME_DATE columns.
    """
    df = pd.read_csv(filename)
    df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

    # Diff in date order rather than file order, so the result does not depend
    # on how the rows happen to be sorted; assignment re-aligns on the index.
    df['REST'] = (
        df.sort_values('GAME_DATE', kind='mergesort')
          .groupby(['TEAM_ABBREVIATION'])['GAME_DATE']
          .diff()
    )
    df.to_csv(filename, index=False)

In [9]:
def error_handle(count, filename, start_date, end_date):
    """Run ``getData`` with retries, then sort the resulting CSV.

    ``getData`` is retried whenever it raises, until it succeeds or the retry
    counter reaches 25. Because ``getData`` resumes from what is already in
    the file, every retry continues where the previous attempt stopped.
    Afterwards the CSV is sorted by TEAM_ABBREVIATION and GAME_DATE (both
    ascending) and written back, even if the retries were exhausted.

    Parameters
    ----------
    count : starting value of the retry counter (callers pass 0).
    filename : CSV path handed to ``getData`` and sorted afterwards.
    start_date, end_date : date range handed to ``getData``.
    """
    max_retries = 25
    attempt = count
    while True:
        try:
            print("-----try is running-----")
            # put csv name here
            getData(filename, start_date, end_date)
            break
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C still interrupts the run.
            if attempt < max_retries:
                print("-----exception handled-----", attempt, repr(exc))
                attempt += 1
            else:
                print("-----max tries exceeded-----")
                break

    # Sort once, after the retry loop, instead of once per retry.
    csv_df = pd.read_csv(filename)
    cdf = csv_df.sort_values(['TEAM_ABBREVIATION','GAME_DATE'] , ascending=[True, True])
    cdf.to_csv(filename, index=False)

    #return rolling_average_stats(filename, 'ten_day-' + filename)

In [10]:
#name of csv to read, name of csv to write
def rolling_average_stats(r_filename, w_filename, i):
    """Write per-team rolling averages of the stat columns to ``w_filename``.

    For every team returned by ``teams.get_teams()``, the rows of that team
    are taken from ``r_filename`` (in file order) and an ``AV_<column>`` column
    is added for each numeric column from position 9 onwards. Each average
    covers the ``i`` rows *before* the current row (the rolling mean is shifted
    by one), so a row never includes its own game.

    Parameters
    ----------
    r_filename : CSV to read; must contain TEAM_ID.
    w_filename : CSV to write; overwritten, one team block after another.
    i : rolling window size in rows.
    """
    print('Inside rolling_average_stats')

    nba_teams = teams.get_teams()
    csv_df = pd.read_csv(r_filename)

    # Only numeric columns can be averaged; text columns that may sit after
    # position 9 (e.g. a REST column stored as text) are carried through as-is.
    stat_columns = [col for col in csv_df.columns[9:]
                    if pd.api.types.is_numeric_dtype(csv_df[col])]

    first_write = True
    # for each team in csv, calculate the rolling average based on the parameter, i, that is passed in
    for team in nba_teams:
        # .copy() so the new columns are added to an independent frame rather
        # than to a filtered view of csv_df.
        team_df = csv_df[csv_df['TEAM_ID'] == team['id']].copy()
        for col in stat_columns:
            team_df['AV_'+ col] = team_df[col].rolling(window=i).mean().shift(1)
        if first_write:
            # First team creates/overwrites the file and writes the header.
            team_df.to_csv(w_filename, header=True, index=False)
            first_write = False
        else:
            team_df.to_csv(w_filename, mode='a', header=False, index=False)

In [11]:
def combine_team_games(df, keep_method='home'):
    '''Pair up the two team rows of each game into a single row. Slow.

        The input has one row per (team, game). It is merged with itself on
        SEASON_ID, GAME_ID and GAME_DATE; every other column appears twice in
        the output, suffixed ``_A`` and ``_B``. Rows pairing a team with
        itself are removed.

        Parameters
        ----------
        df : Input DataFrame.
        keep_method : {'home', 'away', 'winner', 'loser', ``None``}, default 'home'
            Case-insensitive selector for which pairing of each game to keep.
            - 'home' : Keep rows where MATCHUP_A matches ' vs. '.
            - 'away' : Keep rows where MATCHUP_A matches ' @ '.
            - 'winner' : Keep rows where WL_A is 'W'.
            - 'loser' : Keep rows where WL_A is 'L'.
            - ``None`` : Keep every pairing.

        Returns
        -------
        result : DataFrame

        Raises
        ------
        ValueError
            If ``keep_method`` is a string other than the ones listed above.
    '''
    # Self-merge: each row meets every row of the same game (itself included).
    paired = pd.merge(df, df, suffixes=('_A', '_B'),
                      on=['SEASON_ID', 'GAME_ID', 'GAME_DATE'])
    # Drop the pairings of a team with itself.
    paired = paired[paired.TEAM_ID_A != paired.TEAM_ID_B]

    if keep_method is None:
        return paired

    # One row-mask builder per supported selector.
    mask_builders = {
        'home': lambda table: table.MATCHUP_A.str.contains(' vs. '),
        'away': lambda table: table.MATCHUP_A.str.contains(' @ '),
        'winner': lambda table: table.WL_A == 'W',
        'loser': lambda table: table.WL_A == 'L',
    }
    selector = keep_method.lower()
    if selector not in mask_builders:
        raise ValueError(f'Invalid keep_method: {keep_method}')
    return paired[mask_builders[selector](paired)]
    

In [12]:
# uses combine function and cleans csv
# ten_day_csv is the csv with the rolling ten day averages for a year
# combined_csv is the returned csv with teams combined with their matchups
def combine_and_clean(ten_day_csv, combined_csv):
    """Build one row per game (TEAM_A vs TEAM_B averages) and write it to a CSV.

    The rolling-average CSV is read with its first column as the index, the
    raw per-game stat columns are dropped so only the ``AV_`` averages remain,
    and the two team rows of every game are joined side by side with
    ``combine_team_games``. The result is de-duplicated on GAME_ID, sorted by
    GAME_DATE, and WL_A / WL_B are recoded from 'W'/'L' to 1/0.

    Parameters
    ----------
    ten_day_csv : CSV with the rolling averages to read.
    combined_csv : CSV to write.

    Returns
    -------
    combined_csv : the output path, unchanged.
    """
    print('Inside combine_and_clean')

    raw_stat_columns = ['PTS','FGM','FGA','FG_PCT','FG3M','FG3A','FG3_PCT','FTM','FTA','FT_PCT',
                        'OREB','DREB','REB','AST','STL','BLK','TOV','PF', 'PLUS_MINUS']

    attempt = pd.read_csv(ten_day_csv,index_col=[0])
    # Drop these fields to only leave averages
    attempt = attempt.drop(raw_stat_columns,axis=1)

    # combine rows from rolling average csv so that it illustrates TEAM_A attributes
    # vs TEAM_B attributes (side by side). Games are visited once each, in order
    # of first appearance, and collected in memory instead of re-reading and
    # re-writing the output file for every input row.
    per_game = [combine_team_games(game_rows)
                for _, game_rows in attempt.groupby('GAME_ID', sort=False)]
    if per_game:
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        combined = pd.concat(per_game)
    else:
        # No games at all: still produce a correctly-shaped, empty table.
        combined = combine_team_games(attempt)
    combined.to_csv(combined_csv, index=False)

    clean = pd.read_csv(combined_csv)
    # drops duplicates, sort by game date, and replace W with 1 and L with 0
    cleaned = clean.drop_duplicates(subset='GAME_ID')
    cleaned = cleaned.sort_values('GAME_DATE')
    cleaned['WL_A'] = cleaned['WL_A'].replace(['W','L'],[1,0])
    cleaned['WL_B'] = cleaned['WL_B'].replace(['W','L'],[1,0])
    cleaned.to_csv(combined_csv, index=False)
    return combined_csv



In [13]:
def get_zscore_for_one_year(cleaned_csv):
    """Return TEAM_A-minus-TEAM_B rolling-average differences for one CSV.

    Rows with any missing value are discarded first. The result keeps GAME_ID
    and GAME_DATE, takes MATCHUP and WL from the ``_A`` side, and holds, for
    every stat, ``AV_<stat>_A - AV_<stat>_B`` rounded to three decimals.

    Parameters
    ----------
    cleaned_csv : CSV with one row per game and ``_A`` / ``_B`` suffixed columns.

    Returns
    -------
    DataFrame with the columns GAME_ID, GAME_DATE, MATCHUP, WL and one column per stat.
    """
    stat_names = ['PTS', 'FGM' , 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
                  'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

    games = pd.read_csv(cleaned_csv).dropna()

    # Identification columns first, in the order of the output table.
    differences = {
        'GAME_ID': games['GAME_ID'],
        'GAME_DATE': games['GAME_DATE'],
        'MATCHUP': games['MATCHUP_A'],
        'WL': games['WL_A'],
    }
    # Then, per stat, TEAM_A's rolling average minus TEAM_B's.
    for stat in stat_names:
        differences[stat] = games['AV_' + stat + '_A'] - games['AV_' + stat + '_B']

    # Formatting
    return pd.DataFrame(differences).dropna().round(decimals=3)

In [14]:
def get_zscores(year1_cleaned_csv, year2_cleaned_csv, year3_cleaned_csv):
    """Combine three seasons of stat differences and fit the model on them.

    Each CSV is converted with ``get_zscore_for_one_year``; the three tables
    are stacked in the given order, written to ``all_zscores.csv`` and passed
    to ``performLogReg``.

    Returns
    -------
    Whatever ``performLogReg`` returns for the stacked table.
    """
    yearly_tables = [
        get_zscore_for_one_year(year1_cleaned_csv),
        get_zscore_for_one_year(year2_cleaned_csv),
        get_zscore_for_one_year(year3_cleaned_csv),
    ]

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    all_years = pd.concat(yearly_tables)
    all_years.to_csv("all_zscores.csv", index=False)
    return performLogReg(all_years)

In [15]:
# Creates the logistic regression model and tests accuracy
def performLogReg(dataframe, random_state=None):
    """Fit a logistic regression on the stat differences and report its quality.

    The data is split 75/25 into train and test sets, the fitted model is
    pickled to ``finalized_model.sav``, and the coefficients, accuracy,
    precision, recall and confusion matrix on the test set are printed.

    Parameters
    ----------
    dataframe : table containing the feature columns listed below and a 'WL' target.
    random_state : optional seed forwarded to ``train_test_split``. The default
        ``None`` keeps the split unseeded; pass an int for a reproducible split.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """

    # Update if new stats are added
    featureColumns = ['PTS', 'FGM', 'FGA', 'FG3_PCT', 'FTA','REB', 'AST',  'STL', 'TOV']

    X = dataframe[featureColumns] # Features
    Y = dataframe['WL'] # Target Variable

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, shuffle=True, random_state=random_state)
    logreg = LogisticRegression()

    logreg.fit(X_train, Y_train)  # Fits model with data
    filename = 'finalized_model.sav'
    # Context manager so the file handle is closed (and flushed) right away.
    with open(filename, 'wb') as model_file:
        pickle.dump(logreg, model_file)

    Y_pred = logreg.predict(X_test)

    confusionMatrix = metrics.confusion_matrix(Y_test, Y_pred)  # Diagonals tell you correct predictions

    # Code below prints model accuracy information
    print('Coefficient Information:')

    # Prints each feature next to its corresponding coefficient in the model
    for currentFeature, currentCoefficient in zip(featureColumns, logreg.coef_[0]):
        print(currentFeature + ': ' + str(currentCoefficient))

    print('----------------------------------')

    print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred))
    print("Precision:", metrics.precision_score(Y_test, Y_pred))
    print("Recall:", metrics.recall_score(Y_test, Y_pred))

    print('----------------------------------')

    print('Confusion Matrix:')
    print(confusionMatrix)

    return logreg


In [128]:
def fast_zscore(cleaned_csv):
    """Return home-minus-away rolling-average differences, one row per game.

    ``cleaned_csv`` is expected to hold one row per GAME_ID where every other
    cell is the text form of a Python list with one entry per team (the
    format written by ``combine_and_clean_fast``). The home team is the entry
    whose MATCHUP text contains 'vs'. For every stat the result holds
    ``AV_<stat>`` of the home team minus that of the away team, rounded to
    three decimals; WL is 1 if the home team won, else 0.

    Columns are looked up by name, so extra columns in the input do not matter.
    Games are left out when they do not have exactly two team entries, have no
    home entry, have a missing date/matchup/result, or have a missing average.

    Parameters
    ----------
    cleaned_csv : path of the per-game CSV.

    Returns
    -------
    DataFrame with the columns GAME_ID, GAME_DATE, MATCHUP, WL and one column per stat.
    """
    stat_columns = ['PTS', 'FGM' , 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
                    'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']
    data = pd.read_csv(cleaned_csv)

    def parse_cell(cell):
        # A cell is unusable if it is not text or if any entry is nan
        # (a list containing nan cannot be parsed by ast.literal_eval).
        if not isinstance(cell, str) or 'nan' in cell:
            return None
        return ast.literal_eval(cell)

    records = []
    for _, game in data.iterrows():
        matchups = parse_cell(game['MATCHUP'])
        dates = parse_cell(game['GAME_DATE'])
        outcomes = parse_cell(game['WL'])
        if matchups is None or dates is None or outcomes is None:
            continue
        if len(matchups) != 2 or len(outcomes) != 2 or len(dates) == 0:
            continue

        # Position (0 or 1) of the home team inside every per-team list.
        home = next((pos for pos, text in enumerate(matchups) if 'vs' in text), None)
        if home is None:
            continue
        away = 1 - home

        record = [game['GAME_ID'], dates[0], matchups[home],
                  1 if outcomes[home] == 'W' else 0]
        for stat in stat_columns:
            pair = parse_cell(game['AV_' + stat])
            # NaN marks a missing average; such games are dropped below.
            record.append(np.nan if pair is None else pair[home] - pair[away])
        records.append(record)

    df = pd.DataFrame(records, columns = ['GAME_ID', 'GAME_DATE', 'MATCHUP','WL'] + stat_columns)
    return df.dropna().round(decimals=3)
    
    
    
    #

In [None]:
#takes name of rolling csv and new name
def combine_and_clean_fast(rolling, combined_csv):
    """Collapse the per-team rows of ``rolling`` into one row per GAME_ID.

    Every column other than GAME_ID becomes a list holding the values of all
    rows that share the game id. The table is written to ``combined_csv`` with
    GAME_ID as its first column.
    """
    def as_list(values):
        # Gather one group's values of a column into a plain Python list.
        return list(values)

    per_team_rows = pd.read_csv(rolling)
    per_game_rows = per_team_rows.groupby('GAME_ID').agg(as_list)
    per_game_rows.to_csv(combined_csv)
    

In [17]:
# Season label -> [start date, end date] (MM/DD/YYYY) used as the download
# date range for that season.
season_date_dict = dict([
    ("17-18", ["10/17/2017", "06/17/2018"]),
    ("18-19", ["10/16/2018", "06/13/2019"]),
    ("19-20", ["10/22/2019", "10/11/2020"]),
])


In [18]:
def big_system(year = "19-20", rolling = [10]):
    """
    Run the whole pipeline: download, rolling averages, combine, differences.

    params:
    year | specify season e.g. "19-20" or range e.g. "17-20"
    rolling | rolling window sizes as a list e.g. [3,10] or [10]
              (a single int is accepted too)

    Steps, each writing CSV files into its own directory:
    1. ./Raw_Data         - one file per requested season (via error_handle)
    2. ./Rolling_Averages - one file per raw file and window size
    3. ./Combined_Data    - one 'c_' file per rolling-average file
    4. ./Zscores          - one 'z_' file per combined file, sorted by GAME_DATE

    Steps 2-4 process every CSV found in the previous step's directory, not
    only the files of the requested seasons. Nothing is returned.

    raises:
    ValueError | if ``year`` does not describe at least one season
    KeyError   | if a requested season is missing from season_date_dict
    """
    raw_dir = './Raw_Data'
    rolling_dir = './Rolling_Averages'
    combined_dir = './Combined_Data'
    zscore_dir = './Zscores'

    def csv_files(directory):
        # Only CSV files, in a stable order; skips stray files such as .DS_Store.
        return sorted(f for f in listdir(directory)
                      if isfile(join(directory, f)) and f.endswith('.csv'))

    # Expand "17-20" into ["17-18", "18-19", "19-20"]; "19-20" gives ["19-20"].
    first_year, last_year = (int(part) for part in year.split("-"))
    seasons = ["%02d-%02d" % (start, start + 1) for start in range(first_year, last_year)]
    if not seasons:
        raise ValueError(f'Invalid year range: {year}')
    unknown = [season for season in seasons if season not in season_date_dict]
    if unknown:
        raise KeyError(f'No dates configured in season_date_dict for: {unknown}')

    # A bare int is treated as a one-element list of window sizes.
    windows = [rolling] if isinstance(rolling, int) else list(rolling)

    #get raw data for years specified
    os.makedirs(raw_dir, exist_ok=True)
    for season in seasons:
        fullname = os.path.join(raw_dir, "20" + season + ".csv")
        error_handle(0, fullname, season_date_dict[season][0], season_date_dict[season][1])

    #adjust rolling average deadline
    os.makedirs(rolling_dir, exist_ok=True)
    for f in csv_files(raw_dir):
        for i in windows:
            w_filename = os.path.join(rolling_dir, str(i) + "_rolling_" + f)
            rolling_average_stats('./Raw_Data/' + f, w_filename, i)

    # combine and clean team data
    os.makedirs(combined_dir, exist_ok=True)
    for f in csv_files(rolling_dir):
        combine_and_clean_fast('./Rolling_Averages/' + f, './Combined_Data/' + 'c_' + f)

    #zscore and finalize
    os.makedirs(zscore_dir, exist_ok=True)
    for f in csv_files(combined_dir):
        # Every combined file is processed exactly once; f[2:] strips the 'c_' prefix.
        df = fast_zscore('./Combined_Data/' + f)
        df = df.sort_values('GAME_DATE')
        df.to_csv(zscore_dir + "/z_" + f[2:], index=False)
           
            
                   

In [None]:
# Run the full pipeline for the seasons 17-18 through 19-20 with rolling
# windows of 3, 5 and 10 games. big_system has no return statement, so x is
# None; the results are the CSV files it writes.
x = big_system(year = "17-20", rolling = [3,5,10] )

In [15]:
# cdir = './Combined_Data'
# if not os.path.exists(cdir):
#     os.mkdir(cdir)
# r_files = [ f for f in listdir('./Rolling_Averages') if isfile(join('./Rolling_Averages', f))]      
# for f in r_files:
#     before_load = time.perf_counter()
#     combine_and_clean('./Rolling_Averages/'+ f, './Combined_Data/'+'c_' + f )
#     after_load = time.perf_counter()
#     print(after_load - before_load)

Inside combine_and_clean
303.982174988
Inside combine_and_clean
234.12189158599995
Inside combine_and_clean
228.5986795670001
Inside combine_and_clean
219.07340677599996
Inside combine_and_clean
232.92488859900004
Inside combine_and_clean
228.87509998799987
Inside combine_and_clean
216.29968133399984
Inside combine_and_clean
363.20393629
Inside combine_and_clean
290.897709759


SyntaxError: invalid syntax (<ipython-input-41-2419848a1521>, line 4)

3_rolling_2018-19.csv
3.5105376859992248


In [130]:
# df = fast_zscore('test3.csv')
# df.to_csv('z_final.csv')