# Process Phase

## 4.1  - Cleanup

#### 4.1.1 - Setup
Before we begin, we need to make sure that we have all the required libraries installed. In this section, we will import the necessary libraries for our data cleaning.

In [684]:
import pandas as pd
import numpy as np

#### 4.1.2 - Preview the dataset
Before we start analyzing the dataset, it's important to get a general idea of what the data looks like. In this section, we'll take a closer look at the dataset and perform some initial data exploration.

In [685]:
# Read csv file
df = pd.read_csv("./data/raw_data.csv", sep=";", index_col=0)

In [686]:
# Preview the dataframe
df.head()

Unnamed: 0,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,...,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,+/-
0,ATL vs. DAL,04/02/2023,W,53,132,51,108,47.2,12,35,...,81.8,16,37,53,28,11,10,3,22,2
1,CHA vs. TOR,04/02/2023,L,48,108,42,85,49.4,15,31,...,69.2,10,27,37,26,18,3,4,11,-20
2,PHI @ MIL,04/02/2023,L,48,104,40,87,46.0,12,36,...,92.3,11,25,36,19,11,3,2,17,-13
3,POR @ MIN,04/02/2023,W,48,107,43,93,46.2,9,30,...,60.0,11,31,42,29,10,12,3,26,2
4,MIL vs. PHI,04/02/2023,W,48,117,46,80,57.5,10,28,...,71.4,7,35,42,28,12,8,5,17,13


In [687]:
# Check for null values
df.isnull().values.any()

False

In [688]:
# Show the data types of the dataframe
df.dtypes

MATCH UP      object
GAME DATE     object
W/L           object
MIN            int64
PTS            int64
FGM            int64
FGA            int64
FG%          float64
3PM            int64
3PA            int64
3P%          float64
FTM            int64
FTA            int64
FT%          float64
OREB           int64
DREB           int64
REB            int64
AST            int64
TOV            int64
STL            int64
BLK            int64
PF             int64
+/-            int64
dtype: object

#### 4.1.3 - Cleaning
To achieve this goal, we will perform the following tasks:

* Check for inconsistencies: We will review the dataset for any discrepancies or errors. This includes checking for missing values, incorrect data types, or any other anomalies that may affect the quality of the dataset.

* Clean up columns: We will modify the column names to ensure that they are descriptive, meaningful, and easy to understand. This step involves removing unnecessary spaces, symbols, and special characters from the column names.

* Verify data integrity: We will validate the dataset to ensure that all records are correct, accurate, and complete. This step involves checking for duplicate records, incorrect data entries, and missing values.

By performing these tasks, we can ensure that the dataset is in a usable and user-friendly format, which will facilitate further analysis and interpretation.

In [689]:
# Showcase the dataframes columns
df.columns

Index(['MATCH UP', 'GAME DATE', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%',
       '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST',
       'TOV', 'STL', 'BLK', 'PF', '+/-'],
      dtype='object')

In [690]:
# Drop unwanted columns from the dataframe
df = df.drop(["+/-"], axis=1)

In [691]:
# The new column names
new_cols = ['match_up', 'game_date', 'result', 'min', 'pts', 'fgm',
            'fga', 'fgp', 'tpm', 'tpa', 'tpp', 'ftm', 'fta','ftp','oreb',
            'dreb', 'reb','ast', 'tov', 'stl', 'blk', 'pf']

In [692]:
# We map the old columns to the new ones
rename_cols = dict(zip(df.columns, new_cols))

In [693]:
print(rename_cols)

{'MATCH UP': 'match_up', 'GAME DATE': 'game_date', 'W/L': 'result', 'MIN': 'min', 'PTS': 'pts', 'FGM': 'fgm', 'FGA': 'fga', 'FG%': 'fgp', '3PM': 'tpm', '3PA': 'tpa', '3P%': 'tpp', 'FTM': 'ftm', 'FTA': 'fta', 'FT%': 'ftp', 'OREB': 'oreb', 'DREB': 'dreb', 'REB': 'reb', 'AST': 'ast', 'TOV': 'tov', 'STL': 'stl', 'BLK': 'blk', 'PF': 'pf'}


In [694]:
# Apply the mapper
df = df.rename(rename_cols, axis=1)

In [695]:
# Preview the dataframe at this point
df.head()

Unnamed: 0,match_up,game_date,result,min,pts,fgm,fga,fgp,tpm,tpa,...,fta,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf
0,ATL vs. DAL,04/02/2023,W,53,132,51,108,47.2,12,35,...,22,81.8,16,37,53,28,11,10,3,22
1,CHA vs. TOR,04/02/2023,L,48,108,42,85,49.4,15,31,...,13,69.2,10,27,37,26,18,3,4,11
2,PHI @ MIL,04/02/2023,L,48,104,40,87,46.0,12,36,...,13,92.3,11,25,36,19,11,3,2,17
3,POR @ MIN,04/02/2023,W,48,107,43,93,46.2,9,30,...,20,60.0,11,31,42,29,10,12,3,26
4,MIL vs. PHI,04/02/2023,W,48,117,46,80,57.5,10,28,...,21,71.4,7,35,42,28,12,8,5,17


In [696]:
teams = []
for i in df['match_up']:
    teams.append(i[:3])

In [697]:
df['team'] = teams

In [698]:
df.head()

Unnamed: 0,match_up,game_date,result,min,pts,fgm,fga,fgp,tpm,tpa,...,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf,team
0,ATL vs. DAL,04/02/2023,W,53,132,51,108,47.2,12,35,...,81.8,16,37,53,28,11,10,3,22,ATL
1,CHA vs. TOR,04/02/2023,L,48,108,42,85,49.4,15,31,...,69.2,10,27,37,26,18,3,4,11,CHA
2,PHI @ MIL,04/02/2023,L,48,104,40,87,46.0,12,36,...,92.3,11,25,36,19,11,3,2,17,PHI
3,POR @ MIN,04/02/2023,W,48,107,43,93,46.2,9,30,...,60.0,11,31,42,29,10,12,3,26,POR
4,MIL vs. PHI,04/02/2023,W,48,117,46,80,57.5,10,28,...,71.4,7,35,42,28,12,8,5,17,MIL


In [699]:
def clean_matchup(value):
    """
    This function cleans up the match-up column by splitting it on 'vs.' or '@'
    and reversing the order of the teams if necessary. It also removes any
    spaces in the resulting string.
    """
    parts = value.split('vs.') if 'vs.' in value else value.split('@')
    if 'vs.' in value:
        return '-'.join(parts[::-1]).replace(' ', '')
    else:
        return '-'.join(parts).replace(' ', '')


In [700]:
# Apply the 'clean_matchup' to the 'match_up' column
df['match_up'] = df['match_up'].apply(clean_matchup)

###### This code creates a new list called game_dates by iterating over the values in the 'game_date' column of the dataframe df and removing the forward slashes from the date strings using the replace() method. The new list is created as an empty list at the beginning and values are appended to it using the append() method. The resulting list game_dates will contain date strings without forward slashes, which can be used to create a new column in the dataframe or to replace the original 'game_date' column.

In [701]:
game_dates = []
for i in df['game_date']:
    game_dates.append(i.replace('/', ''))

In [702]:
df['match_id'] = game_dates + df['match_up'].str.split('-').str[-1]

In [703]:
df.head()

Unnamed: 0,match_up,game_date,result,min,pts,fgm,fga,fgp,tpm,tpa,...,oreb,dreb,reb,ast,tov,stl,blk,pf,team,match_id
0,DAL-ATL,04/02/2023,W,53,132,51,108,47.2,12,35,...,16,37,53,28,11,10,3,22,ATL,04022023ATL
1,TOR-CHA,04/02/2023,L,48,108,42,85,49.4,15,31,...,10,27,37,26,18,3,4,11,CHA,04022023CHA
2,PHI-MIL,04/02/2023,L,48,104,40,87,46.0,12,36,...,11,25,36,19,11,3,2,17,PHI,04022023MIL
3,POR-MIN,04/02/2023,W,48,107,43,93,46.2,9,30,...,11,31,42,29,10,12,3,26,POR,04022023MIN
4,PHI-MIL,04/02/2023,W,48,117,46,80,57.5,10,28,...,7,35,42,28,12,8,5,17,MIL,04022023MIL


In [704]:
df['game_date'] = df['game_date'].apply(lambda x: x.replace('/', '-'))

In [705]:
df['game_date'] = pd.to_datetime(df['game_date'])

In [706]:
df.head()

Unnamed: 0,match_up,game_date,result,min,pts,fgm,fga,fgp,tpm,tpa,...,oreb,dreb,reb,ast,tov,stl,blk,pf,team,match_id
0,DAL-ATL,2023-04-02,W,53,132,51,108,47.2,12,35,...,16,37,53,28,11,10,3,22,ATL,04022023ATL
1,TOR-CHA,2023-04-02,L,48,108,42,85,49.4,15,31,...,10,27,37,26,18,3,4,11,CHA,04022023CHA
2,PHI-MIL,2023-04-02,L,48,104,40,87,46.0,12,36,...,11,25,36,19,11,3,2,17,PHI,04022023MIL
3,POR-MIN,2023-04-02,W,48,107,43,93,46.2,9,30,...,11,31,42,29,10,12,3,26,POR,04022023MIN
4,PHI-MIL,2023-04-02,W,48,117,46,80,57.5,10,28,...,7,35,42,28,12,8,5,17,MIL,04022023MIL


In [707]:
# Reindex columns
df = df.reindex(columns=['match_id','team','match_up', 'game_date', 'result', 'min', 'pts', 'fgm', 'fga', 'fgp',
       'tpm', 'tpa', 'tpp', 'ftm', 'fta', 'ftp', 'oreb', 'dreb', 'reb', 'ast',
       'tov', 'stl', 'blk', 'pf'])

In [708]:
df.head()

Unnamed: 0,match_id,team,match_up,game_date,result,min,pts,fgm,fga,fgp,...,fta,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf
0,04022023ATL,ATL,DAL-ATL,2023-04-02,W,53,132,51,108,47.2,...,22,81.8,16,37,53,28,11,10,3,22
1,04022023CHA,CHA,TOR-CHA,2023-04-02,L,48,108,42,85,49.4,...,13,69.2,10,27,37,26,18,3,4,11
2,04022023MIL,PHI,PHI-MIL,2023-04-02,L,48,104,40,87,46.0,...,13,92.3,11,25,36,19,11,3,2,17
3,04022023MIN,POR,POR-MIN,2023-04-02,W,48,107,43,93,46.2,...,20,60.0,11,31,42,29,10,12,3,26
4,04022023MIL,MIL,PHI-MIL,2023-04-02,W,48,117,46,80,57.5,...,21,71.4,7,35,42,28,12,8,5,17


#### 4.1.4 - Database and normalization

Normalization is a process of organizing the data in a database to reduce data redundancy and improve data integrity. It involves breaking down a database into smaller, more manageable tables and establishing relationships between them. The main purpose of normalization is to eliminate redundant data and ensure that each piece of data is stored in only one place. This helps to minimize the possibility of data inconsistencies and anomalies that can occur when data is duplicated or stored in multiple locations.

Normalization is especially important when designing large databases that store a lot of information. Without normalization, data redundancy can quickly become a problem, making it difficult to maintain data consistency and accuracy. By breaking down a database into smaller, more manageable tables and establishing relationships between them, normalization helps to ensure that the data is organized in the most efficient and effective way possible.

There are several levels of normalization, each with its own set of rules and guidelines. The most common levels are first normal form (1NF), second normal form (2NF), and third normal form (3NF). The higher the level of normalization, the more complex the database design becomes, but the greater the benefits in terms of data consistency and accuracy.

In [709]:
df.head()

Unnamed: 0,match_id,team,match_up,game_date,result,min,pts,fgm,fga,fgp,...,fta,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf
0,04022023ATL,ATL,DAL-ATL,2023-04-02,W,53,132,51,108,47.2,...,22,81.8,16,37,53,28,11,10,3,22
1,04022023CHA,CHA,TOR-CHA,2023-04-02,L,48,108,42,85,49.4,...,13,69.2,10,27,37,26,18,3,4,11
2,04022023MIL,PHI,PHI-MIL,2023-04-02,L,48,104,40,87,46.0,...,13,92.3,11,25,36,19,11,3,2,17
3,04022023MIN,POR,POR-MIN,2023-04-02,W,48,107,43,93,46.2,...,20,60.0,11,31,42,29,10,12,3,26
4,04022023MIL,MIL,PHI-MIL,2023-04-02,W,48,117,46,80,57.5,...,21,71.4,7,35,42,28,12,8,5,17


In [710]:
unique_dates = pd.unique(df['game_date'])

In [711]:
# Rearrenge the order, so the first date will get date_id 1
unique_dates = np.sort(unique_dates)

In [712]:
unique_dates[0]

numpy.datetime64('2022-10-18T00:00:00.000000000')

In [713]:
df_gd = pd.DataFrame({'date_id': range(1,len(unique_dates)+1),
                      'game_date': unique_dates})

In [714]:
df_gd

Unnamed: 0,date_id,game_date
0,1,2022-10-18
1,2,2022-10-19
2,3,2022-10-20
3,4,2022-10-21
4,5,2022-10-22
...,...,...
153,154,2023-03-29
154,155,2023-03-30
155,156,2023-03-31
156,157,2023-04-01


In [715]:
df.head()

Unnamed: 0,match_id,team,match_up,game_date,result,min,pts,fgm,fga,fgp,...,fta,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf
0,04022023ATL,ATL,DAL-ATL,2023-04-02,W,53,132,51,108,47.2,...,22,81.8,16,37,53,28,11,10,3,22
1,04022023CHA,CHA,TOR-CHA,2023-04-02,L,48,108,42,85,49.4,...,13,69.2,10,27,37,26,18,3,4,11
2,04022023MIL,PHI,PHI-MIL,2023-04-02,L,48,104,40,87,46.0,...,13,92.3,11,25,36,19,11,3,2,17
3,04022023MIN,POR,POR-MIN,2023-04-02,W,48,107,43,93,46.2,...,20,60.0,11,31,42,29,10,12,3,26
4,04022023MIL,MIL,PHI-MIL,2023-04-02,W,48,117,46,80,57.5,...,21,71.4,7,35,42,28,12,8,5,17


In [716]:
df_merged = pd.merge(df, df_gd, on='game_date')

In [717]:
df_merged.head()

Unnamed: 0,match_id,team,match_up,game_date,result,min,pts,fgm,fga,fgp,...,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf,date_id
0,04022023ATL,ATL,DAL-ATL,2023-04-02,W,53,132,51,108,47.2,...,81.8,16,37,53,28,11,10,3,22,158
1,04022023CHA,CHA,TOR-CHA,2023-04-02,L,48,108,42,85,49.4,...,69.2,10,27,37,26,18,3,4,11,158
2,04022023MIL,PHI,PHI-MIL,2023-04-02,L,48,104,40,87,46.0,...,92.3,11,25,36,19,11,3,2,17,158
3,04022023MIN,POR,POR-MIN,2023-04-02,W,48,107,43,93,46.2,...,60.0,11,31,42,29,10,12,3,26,158
4,04022023MIL,MIL,PHI-MIL,2023-04-02,W,48,117,46,80,57.5,...,71.4,7,35,42,28,12,8,5,17,158


In [718]:
df_merged = df_merged.drop('game_date', axis=1)

In [719]:
df_merged

Unnamed: 0,match_id,team,match_up,result,min,pts,fgm,fga,fgp,tpm,...,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf,date_id
0,04022023ATL,ATL,DAL-ATL,W,53,132,51,108,47.2,12,...,81.8,16,37,53,28,11,10,3,22,158
1,04022023CHA,CHA,TOR-CHA,L,48,108,42,85,49.4,15,...,69.2,10,27,37,26,18,3,4,11,158
2,04022023MIL,PHI,PHI-MIL,L,48,104,40,87,46.0,12,...,92.3,11,25,36,19,11,3,2,17,158
3,04022023MIN,POR,POR-MIN,W,48,107,43,93,46.2,9,...,60.0,11,31,42,29,10,12,3,26,158
4,04022023MIL,MIL,PHI-MIL,W,48,117,46,80,57.5,10,...,71.4,7,35,42,28,12,8,5,17,158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2345,10192022SAS,CHA,CHA-SAS,W,48,129,48,94,51.1,13,...,87.0,14,37,51,26,15,7,10,20,2
2346,10182022BOS,BOS,PHI-BOS,W,48,126,46,82,56.1,12,...,78.6,6,30,36,24,11,8,3,24,1
2347,10182022BOS,PHI,PHI-BOS,L,48,117,40,80,50.0,13,...,85.7,4,27,31,16,14,8,3,25,1
2348,10182022GSW,LAL,LAL-GSW,L,48,109,40,94,42.6,10,...,76.0,9,39,48,23,22,12,4,18,1


In [720]:
df_merged = df_merged.rename(columns={"team":"team_id"})

In [721]:
df_merged

Unnamed: 0,match_id,team_id,match_up,result,min,pts,fgm,fga,fgp,tpm,...,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf,date_id
0,04022023ATL,ATL,DAL-ATL,W,53,132,51,108,47.2,12,...,81.8,16,37,53,28,11,10,3,22,158
1,04022023CHA,CHA,TOR-CHA,L,48,108,42,85,49.4,15,...,69.2,10,27,37,26,18,3,4,11,158
2,04022023MIL,PHI,PHI-MIL,L,48,104,40,87,46.0,12,...,92.3,11,25,36,19,11,3,2,17,158
3,04022023MIN,POR,POR-MIN,W,48,107,43,93,46.2,9,...,60.0,11,31,42,29,10,12,3,26,158
4,04022023MIL,MIL,PHI-MIL,W,48,117,46,80,57.5,10,...,71.4,7,35,42,28,12,8,5,17,158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2345,10192022SAS,CHA,CHA-SAS,W,48,129,48,94,51.1,13,...,87.0,14,37,51,26,15,7,10,20,2
2346,10182022BOS,BOS,PHI-BOS,W,48,126,46,82,56.1,12,...,78.6,6,30,36,24,11,8,3,24,1
2347,10182022BOS,PHI,PHI-BOS,L,48,117,40,80,50.0,13,...,85.7,4,27,31,16,14,8,3,25,1
2348,10182022GSW,LAL,LAL-GSW,L,48,109,40,94,42.6,10,...,76.0,9,39,48,23,22,12,4,18,1


In [722]:
df_mi = df_merged[['match_id', 'team_id', 'date_id','pts', 'match_up']]

In [723]:
df_mi[['away_team', 'home_team']] = df_mi['match_up'].str.split('-', expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mi[['away_team', 'home_team']] = df_mi['match_up'].str.split('-', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mi[['away_team', 'home_team']] = df_mi['match_up'].str.split('-', expand=True)


In [724]:
df_mi

Unnamed: 0,match_id,team_id,date_id,pts,match_up,away_team,home_team
0,04022023ATL,ATL,158,132,DAL-ATL,DAL,ATL
1,04022023CHA,CHA,158,108,TOR-CHA,TOR,CHA
2,04022023MIL,PHI,158,104,PHI-MIL,PHI,MIL
3,04022023MIN,POR,158,107,POR-MIN,POR,MIN
4,04022023MIL,MIL,158,117,PHI-MIL,PHI,MIL
...,...,...,...,...,...,...,...
2345,10192022SAS,CHA,2,129,CHA-SAS,CHA,SAS
2346,10182022BOS,BOS,1,126,PHI-BOS,PHI,BOS
2347,10182022BOS,PHI,1,117,PHI-BOS,PHI,BOS
2348,10182022GSW,LAL,1,109,LAL-GSW,LAL,GSW


In [725]:
# Group by match_id and aggregate the required columns
df_mi = df_mi.groupby('match_id', as_index=False).agg({
    'date_id': 'first',
    'away_team': 'first',
    'pts': ['first', 'last'],
    'home_team': 'last'
})

# Flatten the multi-level column names
df_mi.columns = [col[0] if col[1] == '' else f'{col[0]} {col[1]}' for col in df_mi.columns]

# Rename the columns
df_mi = df_mi.rename(columns={
    'match_id': 'match_id',
    'date_id first': 'date_id',
    'away_team first': 'away_team',
    'pts first': 'away_pts',
    'home_team last': 'home_team',
    'pts last': 'home_pts'
})

In [726]:
df_mi = df_mi.sort_values('date_id')

In [727]:
df_mi = df_mi.reset_index(drop=True)

In [728]:
df_mi

Unnamed: 0,match_id,date_id,away_team,away_pts,home_pts,home_team
0,10182022BOS,1,PHI,126,117,BOS
1,10182022GSW,1,LAL,109,123,GSW
2,10192022PHX,2,DAL,107,105,PHX
3,10192022UTA,2,DEN,102,123,UTA
4,10192022TOR,2,CLE,108,105,TOR
...,...,...,...,...,...,...
1170,04022023CHI,158,MEM,128,107,CHI
1171,04022023BKN,158,UTA,111,110,BKN
1172,04022023ATL,158,DAL,132,130,ATL
1173,04022023HOU,158,LAL,134,109,HOU


In [729]:
df_mi = pd.merge(df_mi, df_gd, on='date_id')

In [730]:
df_mi['game_date'] = df_mi['date_id']

df_mi = df_mi.drop('date_id', axis=1)

In [731]:
df_mi = df_mi.rename(columns={"game_date":"date_id"})

In [732]:
df_mi = df_mi.reindex(columns=['match_id', 'date_id', 'away_team', 'away_pts', 'home_pts', 'home_team'])

In [733]:
df_mi.head()

Unnamed: 0,match_id,date_id,away_team,away_pts,home_pts,home_team
0,10182022BOS,1,PHI,126,117,BOS
1,10182022GSW,1,LAL,109,123,GSW
2,10192022PHX,2,DAL,107,105,PHX
3,10192022UTA,2,DEN,102,123,UTA
4,10192022TOR,2,CLE,108,105,TOR


In [734]:
# Suppose you want to get the team-specific stats for the match with match_id = '10182022GSW' and teams 'LAL' and 'GSW'
match_id = '10182022GSW'
away_team = 'LAL'
home_team = 'GSW'

# Get the away team stats
away_team_stats = df_merged[(df_merged['match_id'] == match_id) & (df_merged['team_id'] == away_team)]

# Get the home team stats
home_team_stats = df_merged[(df_merged['match_id'] == match_id) & (df_merged['team_id'] == home_team)]


In [735]:
away_team_stats

Unnamed: 0,match_id,team_id,match_up,result,min,pts,fgm,fga,fgp,tpm,...,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf,date_id
2348,10182022GSW,LAL,LAL-GSW,L,48,109,40,94,42.6,10,...,76.0,9,39,48,23,22,12,4,18,1


In [736]:
df_merged = df_merged[::-1].reset_index(drop=True)

In [737]:
df_merged = df_merged.drop('match_up', axis=1)

In [738]:
df_merged

Unnamed: 0,match_id,team_id,result,min,pts,fgm,fga,fgp,tpm,tpa,...,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf,date_id
0,10182022GSW,GSW,W,48,123,45,99,45.5,16,45,...,73.9,11,37,48,31,18,11,4,23,1
1,10182022GSW,LAL,L,48,109,40,94,42.6,10,40,...,76.0,9,39,48,23,22,12,4,18,1
2,10182022BOS,PHI,L,48,117,40,80,50.0,13,34,...,85.7,4,27,31,16,14,8,3,25,1
3,10182022BOS,BOS,W,48,126,46,82,56.1,12,35,...,78.6,6,30,36,24,11,8,3,24,1
4,10192022SAS,CHA,W,48,129,48,94,51.1,13,29,...,87.0,14,37,51,26,15,7,10,20,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2345,04022023MIL,MIL,W,48,117,46,80,57.5,10,28,...,71.4,7,35,42,28,12,8,5,17,158
2346,04022023MIN,POR,W,48,107,43,93,46.2,9,30,...,60.0,11,31,42,29,10,12,3,26,158
2347,04022023MIL,PHI,L,48,104,40,87,46.0,12,36,...,92.3,11,25,36,19,11,3,2,17,158
2348,04022023CHA,CHA,L,48,108,42,85,49.4,15,31,...,69.2,10,27,37,26,18,3,4,11,158


In [739]:
df_merged.columns

Index(['match_id', 'team_id', 'result', 'min', 'pts', 'fgm', 'fga', 'fgp',
       'tpm', 'tpa', 'tpp', 'ftm', 'fta', 'ftp', 'oreb', 'dreb', 'reb', 'ast',
       'tov', 'stl', 'blk', 'pf', 'date_id'],
      dtype='object')

In [740]:
df_merged = df_merged.reindex(columns=['match_id', 'team_id','date_id', 'result', 'min', 'pts', 'fgm', 'fga', 'fgp', 'tpm',
       'tpa', 'tpp', 'ftm', 'fta', 'ftp', 'oreb', 'dreb', 'reb', 'ast', 'tov',
       'stl', 'blk', 'pf'])

In [741]:
df_merged.head()

Unnamed: 0,match_id,team_id,date_id,result,min,pts,fgm,fga,fgp,tpm,...,fta,ftp,oreb,dreb,reb,ast,tov,stl,blk,pf
0,10182022GSW,GSW,1,W,48,123,45,99,45.5,16,...,23,73.9,11,37,48,31,18,11,4,23
1,10182022GSW,LAL,1,L,48,109,40,94,42.6,10,...,25,76.0,9,39,48,23,22,12,4,18
2,10182022BOS,PHI,1,L,48,117,40,80,50.0,13,...,28,85.7,4,27,31,16,14,8,3,25
3,10182022BOS,BOS,1,W,48,126,46,82,56.1,12,...,28,78.6,6,30,36,24,11,8,3,24
4,10192022SAS,CHA,2,W,48,129,48,94,51.1,13,...,23,87.0,14,37,51,26,15,7,10,20


In [742]:
team_info = [
    ('ATL','Atlanta Hawks', 'State Farm Arena', 33.7573, -84.3963),
    ("BOS",'Boston Celtics', 'TD Garden', 42.3662, -71.0621),
    ("BKN",'Brooklyn Nets', 'Barclays Center', 40.6826, -73.9754),
    ("CHA",'Charlotte Hornets', 'Spectrum Center', 35.2251, -80.8392),
    ("CHI",'Chicago Bulls', 'United Center', 41.8807, -87.6742),
    ("CLE",'Cleveland Cavaliers', 'Rocket Mortgage FieldHouse', 41.4965, -81.688),
    ("DAL",'Dallas Mavericks', 'American Airlines Center', 32.7906, -96.8101),
    ("DEN",'Denver Nuggets', 'Ball Arena', 39.7487, -105.0077),
    ("DET",'Detroit Pistons', 'Little Caesars Arena', 42.3426, -83.0554),
    ("GSW",'Golden State Warriors', 'Chase Center', 37.768, -122.3862),
    ("HOU",'Houston Rockets', 'Toyota Center', 29.7508, -95.3621),
    ("IND",'Indiana Pacers', 'Bankers Life Fieldhouse', 39.7639, -86.1555),
    ("LAC",'Los Angeles Clippers', 'Staples Center', 34.043, -118.2673),
    ("LAL",'Los Angeles Lakers', 'Staples Center', 34.043, -118.2673),
    ("MEM",'Memphis Grizzlies', 'FedExForum', 35.1381, -90.0507),
    ("MIA",'Miami Heat', 'AmericanAirlines Arena', 25.7814, -80.187),
    ("MIL",'Milwaukee Bucks', 'Fiserv Forum', 43.0451, -87.9173),
    ("MIN",'Minnesota Timberwolves', 'Target Center', 44.9795, -93.2768),
    ("NOP",'New Orleans Pelicans', 'Smoothie King Center', 29.9489, -90.0812),
    ("NYK",'New York Knicks', 'Madison Square Garden', 40.7505, -73.9934),
    ("OKC",'Oklahoma City Thunder', 'Paycom Center', 35.4634, -97.5151),
    ("ORL",'Orlando Magic', 'Amway Center', 28.5392, -81.3839),
    ("PHI",'Philadelphia 76ers', 'Wells Fargo Center', 39.9012, -75.1719),
    ("PHO",'Phoenix Suns', 'Footprint Center', 33.4457, -112.0712),
    ("POR",'Portland Trail Blazers', 'Moda Center', 45.5316, -122.666),
    ("SAC",'Sacramento Kings', 'Golden 1 Center', 38.5802, -121.4991),
    ("SAS",'San Antonio Spurs', 'AT&T Center', 29.4271, -98.4375),
    ("TOR",'Toronto Raptors', 'Scotiabank Arena', 43.6435, -79.3791),
    ("UTA",'Utah Jazz', 'Vivint Arena', 40.7683, -111.9011),
    ("WAS",'Washington Wizards', 'Capital One Arena', 38.898, -77.0209)
]

In [743]:
df_ti= pd.DataFrame(team_info, columns=['team_id', 'team_name', 'arena_name','latitude','longitude'])

In [744]:
df_ti.head()

Unnamed: 0,team_id,team_name,arena_name,latitude,longitude
0,ATL,Atlanta Hawks,State Farm Arena,33.7573,-84.3963
1,BOS,Boston Celtics,TD Garden,42.3662,-71.0621
2,BKN,Brooklyn Nets,Barclays Center,40.6826,-73.9754
3,CHA,Charlotte Hornets,Spectrum Center,35.2251,-80.8392
4,CHI,Chicago Bulls,United Center,41.8807,-87.6742


## 4.2 - Data integrity

In [745]:
df_merged.dtypes

match_id     object
team_id      object
date_id       int64
result       object
min           int64
pts           int64
fgm           int64
fga           int64
fgp         float64
tpm           int64
tpa           int64
tpp         float64
ftm           int64
fta           int64
ftp         float64
oreb          int64
dreb          int64
reb           int64
ast           int64
tov           int64
stl           int64
blk           int64
pf            int64
dtype: object

In [746]:
df_ti.dtypes

team_id        object
team_name      object
arena_name     object
latitude      float64
longitude     float64
dtype: object

In [747]:
df_mi.dtypes

match_id     object
date_id       int64
away_team    object
away_pts      int64
home_pts      int64
home_team    object
dtype: object

In [748]:
df_gd.dtypes

date_id               int64
game_date    datetime64[ns]
dtype: object

In [750]:
# Save our dataframes to csv
df_merged.to_csv('./data/match_stats.csv', sep=";")
df_gd.to_csv('./data/game_dates.csv', sep=";")
df_mi.to_csv('./data/match_info.csv', sep=";")
df_ti.to_csv('./data/team_info.csv', sep=";")