## Import necessary libraries

In [1]:
import warnings
# Install a global 'ignore' filter so that no warnings are shown for the rest
# of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning
# and deprecation notices) raised by later cells - consider narrowing it.
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

## Read and Display NBA data

In [3]:
# Load the two raw CSV files into DataFrames: `nbadata` from nbaallelo.csv and
# `nbadata1` from nba_elo.csv.
# NOTE(review): both paths are absolute, machine-specific Windows paths, so the
# notebook only runs elsewhere after they are pointed at a local copy.
nbadata = pd.read_csv(r'C:\Users\User\Downloads\nba_data_science_project\data\rawdata\nbaallelo.csv')
nbadata1 = pd.read_csv(r'C:\Users\User\Downloads\nba_data_science_project\data\rawdata\nba_elo.csv')

In [4]:
# Columns removed from the nbaallelo.csv table (`nbadata`).
columnstodrop = [
    'gameorder',
    'game_id',
    'lg_id',
    '_iscopy',
    'fran_id',
    'elo_n',
    'win_equiv',
    'opp_id',
    'opp_pts',
    'opp_elo_i',
    'opp_fran',
    'opp_elo_n',
    'game_location',
    'forecast',
    'notes',
]

# Columns removed from the nba_elo.csv table (`nbadata1`).
columnstodrop1 = [
    'neutral',
    'elo_prob1',
    'elo_prob2',
    'elo1_post',
    'elo2_post',
    'carm-elo1_pre',
    'carm-elo2_pre',
    'carm-elo_prob1',
    'carm-elo_prob2',
    'carm-elo1_post',
    'carm-elo2_post',
    'raptor_prob1',
    'raptor_prob2',
    'quality',
    'importance',
    'total_rating',
]

In [5]:
# Strip the unused columns from each raw table, then preview the first rows
# of both trimmed tables.
df = nbadata.drop(columnstodrop, axis=1)
df1 = nbadata1.drop(columnstodrop1, axis=1)
display(df.head(), df1.head())

Unnamed: 0,year_id,date_game,seasongame,is_playoffs,team_id,pts,elo_i,game_result
0,1947,11/1/1946,1,0,TRH,66,1300.0,L
1,1947,11/1/1946,1,0,NYK,68,1300.0,W
2,1947,11/2/1946,1,0,CHS,63,1300.0,W
3,1947,11/2/1946,2,0,NYK,47,1306.7233,L
4,1947,11/2/1946,1,0,DTF,33,1300.0,L


Unnamed: 0,date,season,playoff,team1,team2,elo1_pre,elo2_pre,raptor1_pre,raptor2_pre,score1,score2
0,1946-11-01,1947,,TRH,NYK,1300.0,1300.0,,,66.0,68.0
1,1946-11-02,1947,,DTF,WSC,1300.0,1300.0,,,33.0,50.0
2,1946-11-02,1947,,CHS,NYK,1300.0,1306.7233,,,63.0,47.0
3,1946-11-02,1947,,PRO,BOS,1300.0,1300.0,,,59.0,53.0
4,1946-11-02,1947,,STB,PIT,1300.0,1300.0,,,56.0,51.0


## NBA 2016 to 2023 Playoffs Data

In [6]:
# Create a 'result' column to display the result of each game from team1's
# side: 'W' when score1 is greater than score2, otherwise 'L'.

df1['result'] = np.where(df1['score1'].gt(df1['score2']), 'W', 'L')

In [7]:
# Separate the original dataframe into home (team1) and away (team2)
# dataframes to get results for both sides.
# Explicit copies are taken so the assignment below writes to an independent
# frame rather than to a selection of `df1`.

hometeamdf = df1[['date', 'season', 'playoff', 'team1', 'elo1_pre', 'score1', 'result']].copy()

awayteamdf = df1[['date', 'season', 'playoff', 'team2', 'elo2_pre', 'score2', 'result']].copy()
# 'result' was computed from team1's side, so flip it for team2.
resultswitch = {'L': 'W', 'W': 'L'}
awayteamdf['result'] = awayteamdf['result'].map(resultswitch)

In [8]:
# Give both frames the same column names so they can be stacked later.
# The renamed frames are assigned back instead of renaming in place, which
# avoids mutating frames that were selected out of `df1`.
hometeamdf = hometeamdf.rename(columns={'team1': 'team', 'elo1_pre': 'elo_rating', 'score1': 'points'})
awayteamdf = awayteamdf.rename(columns={'team2': 'team', 'elo2_pre': 'elo_rating', 'score2': 'points'})

## Playoffs Dataframe

* Create a new dataframe containing the playoff games from the 2016 to 2019 and 2021 to 2023 seasons.
* The 2020 season is omitted because the COVID-19 pandemic caused the NBA to cut that season short.

* Omitting it keeps the dataframe consistent, especially for the ML aspect.

In [9]:
# Create a new dataframe with the selected seasons.
# For every season the home rows are appended first, then the away rows.

years = [2016, 2017, 2018, 2019, 2021, 2022, 2023]

dflist = []
for i in years:
    homedf = hometeamdf.loc[hometeamdf['season'] == i]
    awaydf = awayteamdf.loc[awayteamdf['season'] == i]
    dflist.append(homedf)
    dflist.append(awaydf)

nbadf = pd.concat(dflist, axis=0, ignore_index=True)

# Tag rows with a missing 'playoff' value with the sentinel 'p' so they can be
# filtered out afterwards. The filled column is assigned back: an in-place
# fillna on `nbadf['playoff']` acts on a temporary Series and is not
# guaranteed to modify `nbadf` (it does not under pandas Copy-on-Write).
# NOTE(review): assumes 'p' is not itself a real value of 'playoff' - confirm.
nbadf['playoff'] = nbadf['playoff'].fillna('p')

In [10]:
# Keep the rows whose 'playoff' value is not the 'p' sentinel, renumber them
# from 0 and drop the now-redundant 'playoff' column. Chaining produces a new
# frame instead of mutating a `.loc` selection of `nbadf` in place.
nbadf2 = (
    nbadf.loc[nbadf['playoff'] != 'p']
    .reset_index(drop=True)
    .drop(columns=['playoff'])
)
nbadf2.head()

Unnamed: 0,date,season,team,elo_rating,points,result
0,2016-04-16,2016,TOR,1632.913283,90.0,L
1,2016-04-16,2016,GSW,1788.472611,104.0,W
2,2016-04-16,2016,ATL,1605.928624,102.0,W
3,2016-04-16,2016,OKC,1682.811367,108.0,W
4,2016-04-17,2016,CLE,1642.356507,106.0,W


In [11]:
# Separate the 'result' column into loss/win indicator columns ('_L', '_W').
# dtype=int keeps the indicators as 0/1 integers regardless of the pandas
# version (newer releases default to booleans).

nbadf2 = pd.get_dummies(nbadf2, columns=['result'], prefix='', dtype=int)
nbadf2.head()

Unnamed: 0,date,season,team,elo_rating,points,_L,_W
0,2016-04-16,2016,TOR,1632.913283,90.0,1,0
1,2016-04-16,2016,GSW,1788.472611,104.0,0,1
2,2016-04-16,2016,ATL,1605.928624,102.0,0,1
3,2016-04-16,2016,OKC,1682.811367,108.0,0,1
4,2016-04-17,2016,CLE,1642.356507,106.0,0,1


In [12]:
# Group the dataframe by 'season' and 'team'.
# Calculate the average of the numerical columns (rounded to whole numbers).
# Calculate the total/sum of the numerical columns.
# numeric_only=True restricts both aggregations to numeric columns, so the
# string 'date' column is excluded explicitly instead of relying on pandas
# dropping it silently.

group_averages = (
    nbadf2.groupby(['season', 'team'], as_index=False)
    .mean(numeric_only=True)
    .round()
)

group_sums = (
    nbadf2.groupby(['season', 'team'], as_index=False)
    .sum(numeric_only=True)
    .drop(columns=['elo_rating'])
)

# Create a league table by merging the total season points, wins, losses and
# the average elo_rating for each team. The merge is keyed on (season, team)
# rather than on row position, so rows are matched by identity.

nbaplayofftable = group_sums.merge(
    group_averages[['season', 'team', 'elo_rating']],
    on=['season', 'team'],
)
# Move 'elo_rating' so it sits right after the two key columns.
elo_column = nbaplayofftable.pop('elo_rating')
nbaplayofftable.insert(2, 'elo_rating', elo_column)
nbaplayofftable.head()

Unnamed: 0,season,team,elo_rating,points,_L,_W
0,2016,ATL,1611.0,1001.0,6,4
1,2016,BOS,1570.0,563.0,4,2
2,2016,CHO,1576.0,632.0,4,3
3,2016,CLE,1695.0,2200.0,5,16
4,2016,DAL,1521.0,469.0,4,1


In [13]:
# Win percentage per team: wins divided by games played (wins + losses),
# times 100, rounded to two decimals and stored in the 'W%' column.

nbaplayofftable['W%'] = (
    nbaplayofftable['_W']
    .div(nbaplayofftable['_W'] + nbaplayofftable['_L'])
    .mul(100)
    .round(2)
)
# Drop the leading underscore left over from the dummy-column prefix.
nbaplayofftable = nbaplayofftable.rename(columns={'_L': 'L', '_W': 'W'})
nbaplayofftable.head()

Unnamed: 0,season,team,elo_rating,points,L,W,W%
0,2016,ATL,1611.0,1001.0,6,4,40.0
1,2016,BOS,1570.0,563.0,4,2,33.33
2,2016,CHO,1576.0,632.0,4,3,42.86
3,2016,CLE,1695.0,2200.0,5,16,76.19
4,2016,DAL,1521.0,469.0,4,1,20.0


## NBA 2015 Playoffs Data

In [14]:
# Extract NBA 2015 playoffs data from the original dataframe: rows with
# is_playoffs == 1 and year_id == 2015, renumbered from 0, without the
# 'is_playoffs' and 'seasongame' columns. Chaining produces a new frame
# instead of mutating a `.loc` selection of `df` in place.

nbadf2015 = (
    df.loc[(df['is_playoffs'] == 1) & (df['year_id'] == 2015)]
    .reset_index(drop=True)
    .drop(columns=['is_playoffs', 'seasongame'])
)
nbadf2015.head()

Unnamed: 0,year_id,date_game,team_id,pts,elo_i,game_result
0,2015,4/18/2015,MIL,91,1472.0676,L
1,2015,4/18/2015,CHI,103,1583.3149,W
2,2015,4/18/2015,NOP,99,1539.7662,L
3,2015,4/18/2015,GSW,106,1772.3469,W
4,2015,4/18/2015,DAL,108,1564.1244,L


In [15]:
# Separate the 'game_result' column into loss/win indicator columns
# ('_L', '_W'). dtype=int keeps the indicators as 0/1 integers regardless of
# the pandas version (newer releases default to booleans).

nbadf2015 = pd.get_dummies(nbadf2015, columns=['game_result'], prefix='', dtype=int)
nbadf2015.head()

Unnamed: 0,year_id,date_game,team_id,pts,elo_i,_L,_W
0,2015,4/18/2015,MIL,91,1472.0676,1,0
1,2015,4/18/2015,CHI,103,1583.3149,0,1
2,2015,4/18/2015,NOP,99,1539.7662,1,0
3,2015,4/18/2015,GSW,106,1772.3469,0,1
4,2015,4/18/2015,DAL,108,1564.1244,1,0


In [16]:
# Group the dataframe by 'year_id' and 'team_id'.
# Calculate the average of the numerical columns (rounded to whole numbers).
# Calculate the total/sum of the numerical columns.
# numeric_only=True restricts both aggregations to numeric columns, so the
# string 'date_game' column is excluded explicitly instead of relying on
# pandas dropping it silently.

groupaverages = (
    nbadf2015.groupby(['year_id', 'team_id'], as_index=False)
    .mean(numeric_only=True)
    .round()
)

groupsums = (
    nbadf2015.groupby(['year_id', 'team_id'], as_index=False)
    .sum(numeric_only=True)
    .drop(columns=['elo_i'])
)

# Create a league table by merging the total season points, wins, losses and
# the average Elo rating for each team. The merge is keyed on
# (year_id, team_id) rather than on row position, so rows are matched by
# identity.

nbaplayofftable2015 = groupsums.merge(
    groupaverages[['year_id', 'team_id', 'elo_i']],
    on=['year_id', 'team_id'],
)
# Move 'elo_i' so it sits right after the two key columns.
elocolumn = nbaplayofftable2015.pop('elo_i')
nbaplayofftable2015.insert(2, 'elo_i', elocolumn)
nbaplayofftable2015.head()

Unnamed: 0,year_id,team_id,elo_i,pts,_L,_W
0,2015,ATL,1597.0,1568,8,8
1,2015,BOS,1543.0,379,4,0
2,2015,BRK,1470.0,578,4,2
3,2015,CHI,1603.0,1152,6,6
4,2015,CLE,1676.0,1980,6,14


In [17]:
# Win percentage per team (wins / games played * 100, two decimals) goes into
# 'W%'; the table is then ordered by 'W%' from highest to lowest with a fresh
# 0..n-1 index, and its columns are renamed to match the 2016-2023 table.

nbaplayofftable2015 = (
    nbaplayofftable2015
    .assign(**{
        'W%': nbaplayofftable2015['_W']
        .div(nbaplayofftable2015['_W'] + nbaplayofftable2015['_L'])
        .mul(100)
        .round(2)
    })
    .sort_values('W%', ascending=False, ignore_index=True)
    .rename(columns={
        'year_id': 'season',
        'team_id': 'team',
        'elo_i': 'elo_rating',
        'pts': 'points',
        '_L': 'L',
        '_W': 'W',
    })
)

nbaplayofftable2015.head()

Unnamed: 0,season,team,elo_rating,points,L,W,W%
0,2015,GSW,1788.0,2169,5,16,76.19
1,2015,CLE,1676.0,1980,6,14,70.0
2,2015,WAS,1547.0,1011,4,6,60.0
3,2015,MEM,1611.0,1042,5,6,54.55
4,2015,HOU,1647.0,1839,8,9,52.94


In [18]:
# Create a function to determine the winner of each playoffs season

def playoffwinner(df):
    """Flag exactly one playoffs winner per season.

    For every distinct value of ``season`` the rows are ordered by ``W%``
    (highest first), with ties broken by ``W`` (highest first). The first row
    of that ordering gets ``playoffs_winner = 1`` and all other rows get 0,
    so each season has exactly one winner no matter how many teams share the
    top ``W%``.

    NOTE(review): "highest win percentage" is used as the definition of the
    winner here; confirm it always matches the actual champion in the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``season``, ``L``, ``W`` and ``W%``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        All rows of ``df`` grouped season by season (in order of first
        appearance), sorted within each season as described above, with a
        fresh 0..n-1 index and an added ``playoffs_winner`` column.
        ``season``, ``L``, ``W`` and ``playoffs_winner`` are cast to int.
    """
    int_columns = {'season': 'int', 'L': 'int', 'W': 'int', 'playoffs_winner': 'int'}

    season_tables = []
    for season in df['season'].unique():
        # sort_values returns a new frame, so the writes below never touch `df`.
        season_df = df.loc[df['season'] == season].sort_values(
            ['W%', 'W'], ascending=False, ignore_index=True
        )
        season_df['playoffs_winner'] = 0
        # After the sort the best team is always at index label 0.
        season_df.loc[0, 'playoffs_winner'] = 1
        season_tables.append(season_df)

    if not season_tables:
        # No seasons at all: pd.concat would raise on an empty list, so return
        # an empty frame that still has the expected columns and dtypes.
        return df.assign(playoffs_winner=0).astype(int_columns)

    playoffdata = pd.concat(season_tables, axis=0, ignore_index=True)
    return playoffdata.astype(int_columns)

## NBA 2015 to 2023 Playoffs Data

In [19]:
# Stack the 2015 playoffs table on top of the 2016-2023 playoffs table and
# renumber the rows from 0.

nbaplayoffdata = pd.concat((nbaplayofftable2015, nbaplayofftable), ignore_index=True)
nbaplayoffdata.head()

Unnamed: 0,season,team,elo_rating,points,L,W,W%
0,2015,GSW,1788.0,2169.0,5,16,76.19
1,2015,CLE,1676.0,1980.0,6,14,70.0
2,2015,WAS,1547.0,1011.0,4,6,60.0
3,2015,MEM,1611.0,1042.0,5,6,54.55
4,2015,HOU,1647.0,1839.0,8,9,52.94


In [20]:
# Determine the winner of each playoffs season from the combined dataframe.

finalplayoffdata = nbaplayoffdata.pipe(playoffwinner)
finalplayoffdata.head()

Unnamed: 0,season,team,elo_rating,points,L,W,W%,playoffs_winner
0,2015,GSW,1788.0,2169.0,5,16,76.19,1
1,2015,CLE,1676.0,1980.0,6,14,70.0,0
2,2015,WAS,1547.0,1011.0,4,6,60.0,0
3,2015,MEM,1611.0,1042.0,5,6,54.55,0
4,2015,HOU,1647.0,1839.0,8,9,52.94,0


In [21]:
# Save to csv file

#finalplayoffdata.to_csv(r"C:\Users\User\Downloads\nba_data_science_project\data\transformed_data\nba_playoffs.csv", index=False)