# Data Cleaning Notebook

This notebook includes the code that we used to clean our dataset

In [None]:
# Imports
import numpy as np
import pandas as pd

First we cleaned the games.csv file. This file will be used for the games table and contains information about all the games that were played and both teams' stats for the game

In [None]:
# Load the raw per-game table and discard every row that contains a null value
games = pd.read_csv('games.csv').dropna()

In [None]:
# Parse the GAME_DATE_EST column into pandas datetime values
games["GAME_DATE_EST"] = games["GAME_DATE_EST"].pipe(pd.to_datetime)

In [None]:
# TEAM_ID_home / TEAM_ID_away are dropped as redundant
# (the frame keeps HOME_TEAM_ID and VISITOR_TEAM_ID)
redundant_columns = ["TEAM_ID_home", "TEAM_ID_away"]
games = games.drop(columns=redundant_columns)

In [None]:
# Print each column's dtype and non-null count to check the result of the steps above
games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26552 entries, 0 to 26650
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   GAME_DATE_EST     26552 non-null  datetime64[ns]
 1   GAME_ID           26552 non-null  int64         
 2   GAME_STATUS_TEXT  26552 non-null  object        
 3   HOME_TEAM_ID      26552 non-null  int64         
 4   VISITOR_TEAM_ID   26552 non-null  int64         
 5   SEASON            26552 non-null  int64         
 6   PTS_home          26552 non-null  float64       
 7   FG_PCT_home       26552 non-null  float64       
 8   FT_PCT_home       26552 non-null  float64       
 9   FG3_PCT_home      26552 non-null  float64       
 10  AST_home          26552 non-null  float64       
 11  REB_home          26552 non-null  float64       
 12  PTS_away          26552 non-null  float64       
 13  FG_PCT_away       26552 non-null  float64       
 14  FT_PCT_away       2655

In [None]:
# Spot-check the cleaned table by displaying the rows of the 2018 season
games.loc[games["SEASON"].eq(2018)]

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
19652,2019-06-13,41800406,Final,1610612744,1610612761,2018,110.0,0.488,0.700,0.355,28.0,42.0,114.0,0.476,0.793,0.394,25.0,39.0,0
19653,2019-06-10,41800405,Final,1610612761,1610612744,2018,105.0,0.447,0.778,0.250,19.0,43.0,106.0,0.463,0.714,0.476,27.0,37.0,0
19654,2019-06-07,41800404,Final,1610612744,1610612761,2018,92.0,0.449,0.667,0.296,26.0,42.0,105.0,0.419,0.958,0.313,22.0,39.0,0
19655,2019-06-05,41800403,Final,1610612744,1610612761,2018,109.0,0.396,0.833,0.333,25.0,41.0,123.0,0.524,0.952,0.447,30.0,40.0,0
19656,2019-06-02,41800402,Final,1610612761,1610612744,2018,104.0,0.372,0.885,0.289,17.0,49.0,109.0,0.463,0.870,0.382,34.0,42.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21025,2018-09-30,11800009,Final,1610612741,1610612740,2018,128.0,0.495,0.963,0.222,23.0,52.0,116.0,0.454,0.789,0.333,24.0,45.0,1
21026,2018-09-30,11800010,Final,1610612747,1610612743,2018,107.0,0.460,0.731,0.320,26.0,37.0,124.0,0.460,0.886,0.351,25.0,50.0,0
21027,2018-09-29,11800003,Final,1610612761,1610612757,2018,122.0,0.415,0.824,0.343,15.0,45.0,104.0,0.420,0.720,0.387,25.0,46.0,1
21028,2018-09-29,11800005,Final,1610612744,1610612750,2018,110.0,0.473,0.769,0.353,33.0,48.0,114.0,0.426,0.733,0.400,15.0,47.0,0


Next we clean the teams table. This table has 30 rows, each containing information about one team in the NBA.

In [None]:
# Read the table in from the teams.csv file
teams = pd.read_csv('teams.csv')
# NOTE(review): dropna() is left disabled here, presumably because ARENACAPACITY
# is null for a few teams and dropping those rows would remove whole teams — confirm.
# teams = teams.dropna()

# Check the datatypes and non-null counts of all the columns
teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   LEAGUE_ID           30 non-null     int64  
 1   TEAM_ID             30 non-null     int64  
 2   MIN_YEAR            30 non-null     int64  
 3   MAX_YEAR            30 non-null     int64  
 4   ABBREVIATION        30 non-null     object 
 5   NICKNAME            30 non-null     object 
 6   YEARFOUNDED         30 non-null     int64  
 7   CITY                30 non-null     object 
 8   ARENA               30 non-null     object 
 9   ARENACAPACITY       26 non-null     float64
 10  OWNER               30 non-null     object 
 11  GENERALMANAGER      30 non-null     object 
 12  HEADCOACH           30 non-null     object 
 13  DLEAGUEAFFILIATION  30 non-null     object 
dtypes: float64(1), int64(5), object(8)
memory usage: 3.4+ KB


In [None]:
# Display the teams table to inspect its contents
teams

Unnamed: 0,LEAGUE_ID,TEAM_ID,MIN_YEAR,MAX_YEAR,ABBREVIATION,NICKNAME,YEARFOUNDED,CITY,ARENA,ARENACAPACITY,OWNER,GENERALMANAGER,HEADCOACH,DLEAGUEAFFILIATION
0,0,1610612737,1949,2019,ATL,Hawks,1949,Atlanta,State Farm Arena,18729.0,Tony Ressler,Travis Schlenk,Lloyd Pierce,Erie Bayhawks
1,0,1610612738,1946,2019,BOS,Celtics,1946,Boston,TD Garden,18624.0,Wyc Grousbeck,Danny Ainge,Brad Stevens,Maine Red Claws
2,0,1610612740,2002,2019,NOP,Pelicans,2002,New Orleans,Smoothie King Center,,Tom Benson,Trajan Langdon,Alvin Gentry,No Affiliate
3,0,1610612741,1966,2019,CHI,Bulls,1966,Chicago,United Center,21711.0,Jerry Reinsdorf,Gar Forman,Jim Boylen,Windy City Bulls
4,0,1610612742,1980,2019,DAL,Mavericks,1980,Dallas,American Airlines Center,19200.0,Mark Cuban,Donnie Nelson,Rick Carlisle,Texas Legends
5,0,1610612743,1976,2019,DEN,Nuggets,1976,Denver,Pepsi Center,19099.0,Stan Kroenke,Tim Connelly,Michael Malone,No Affiliate
6,0,1610612745,1967,2019,HOU,Rockets,1967,Houston,Toyota Center,18104.0,Tilman Fertitta,Daryl Morey,Mike D'Antoni,Rio Grande Valley Vipers
7,0,1610612746,1970,2019,LAC,Clippers,1970,Los Angeles,Staples Center,19060.0,Steve Ballmer,Michael Winger,Doc Rivers,Agua Caliente Clippers of Ontario
8,0,1610612747,1948,2019,LAL,Lakers,1948,Los Angeles,Staples Center,19060.0,Jerry Buss Family Trust,Rob Pelinka,Frank Vogel,South Bay Lakers
9,0,1610612748,1988,2019,MIA,Heat,1988,Miami,AmericanAirlines Arena,19600.0,Micky Arison,Pat Riley,Erik Spoelstra,Sioux Falls Skyforce


Next we clean the games details table. This is a large table that contains data for each player, for each game.

In [None]:
# Load the raw per-player, per-game table
# NOTE(review): the recorded output echoes this statement, which looks like a pandas
# warning raised while reading the file — confirm, and consider passing explicit dtypes.
games_details = pd.read_csv('games_details.csv')

  games_details = pd.read_csv('games_details.csv')


First we remove unnecessary columns

In [None]:
# Drop the NICKNAME column because most players do not have a nickname
games_details = games_details.drop(columns='NICKNAME')

In [None]:
# Drop the COMMENT column; it is not needed for this project
games_details = games_details.drop(columns='COMMENT')

In [None]:
# Drop the remaining columns that are not needed in this table
unneeded_columns = ['TEAM_ABBREVIATION', 'TEAM_CITY', 'PLAYER_NAME']
games_details = games_details.drop(columns=unneeded_columns)

There are many NULL value rows for the column "START_POSITION". This is because if a player did not start the game, the value for that column would be null. We filled it all in with BENCH, because if a player did not start the game it means they were on the bench

In [None]:
# A null START_POSITION means the player did not start, so label those rows "BENCH"
games_details["START_POSITION"] = games_details["START_POSITION"].fillna("BENCH")
games_details

Unnamed: 0,GAME_ID,TEAM_ID,PLAYER_ID,START_POSITION,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS
0,22200477,1610612759,1629641,F,18:06,1.0,1.0,1.000,0.0,0.0,...,1.0,1.0,2.0,0.0,1.0,0.0,2.0,5.0,2.0,-2.0
1,22200477,1610612759,1631110,F,31:01,7.0,14.0,0.500,2.0,4.0,...,6.0,3.0,9.0,6.0,1.0,0.0,2.0,1.0,23.0,-14.0
2,22200477,1610612759,1627751,C,21:42,6.0,9.0,0.667,0.0,0.0,...,1.0,3.0,4.0,1.0,1.0,0.0,2.0,4.0,13.0,-4.0
3,22200477,1610612759,1630170,G,30:20,4.0,13.0,0.308,1.0,6.0,...,0.0,9.0,9.0,5.0,3.0,0.0,2.0,1.0,10.0,-18.0
4,22200477,1610612759,1630200,G,27:44,7.0,12.0,0.583,1.0,3.0,...,0.0,2.0,2.0,3.0,0.0,0.0,2.0,2.0,19.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668623,11200005,1610612743,202706,BENCH,19,4.0,9.0,0.444,3.0,6.0,...,0.0,2.0,2.0,0.0,2.0,0.0,1.0,3.0,17.0,
668624,11200005,1610612743,202702,BENCH,23,7.0,11.0,0.636,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,3.0,3.0,18.0,
668625,11200005,1610612743,201585,BENCH,15,3.0,7.0,0.429,0.0,0.0,...,3.0,5.0,8.0,0.0,1.0,0.0,0.0,3.0,6.0,
668626,11200005,1610612743,202389,BENCH,19,1.0,1.0,1.000,0.0,0.0,...,1.0,2.0,3.0,1.0,0.0,0.0,4.0,2.0,2.0,


Here, we see that the datatype for the MIN column is a string. Some of the rows display the minutes played in the format "mm:ss", while other rows only have minutes. We clean this column by keeping only the whole-minute part of the time a player played in a game and converting it to a numeric datatype. We also fill all null values with 0, since null means a player didn't play, or equivalently played 0 minutes.

In [None]:
# Reduce MIN to the whole minutes played, stored as a numeric column.
# Raw values are either "mm:ss" or a bare minute count; a null means the player did not play.
minute_text = games_details["MIN"].astype(str).str.split(":").str[0]  # Only keep the minute

# astype(str) renders nulls as the text 'nan'; mask those so they parse as NaN and are
# then filled with 0. The whole column is converted in a single vectorised call;
# pd.to_numeric still raises on any other text it cannot parse.
games_details["MIN"] = pd.to_numeric(minute_text.where(minute_text != "nan")).fillna(0)

In [None]:
# Print dtypes and non-null counts; MIN should now be numeric with no nulls
games_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668628 entries, 0 to 668627
Data columns (total 24 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   GAME_ID         668628 non-null  int64  
 1   TEAM_ID         668628 non-null  int64  
 2   PLAYER_ID       668628 non-null  int64  
 3   START_POSITION  668628 non-null  object 
 4   MIN             668628 non-null  float64
 5   FGM             558938 non-null  float64
 6   FGA             558938 non-null  float64
 7   FG_PCT          558938 non-null  float64
 8   FG3M            558938 non-null  float64
 9   FG3A            558938 non-null  float64
 10  FG3_PCT         558938 non-null  float64
 11  FTM             558938 non-null  float64
 12  FTA             558938 non-null  float64
 13  FT_PCT          558938 non-null  float64
 14  OREB            558938 non-null  float64
 15  DREB            558938 non-null  float64
 16  REB             558938 non-null  float64
 17  AST       

In [None]:
# Sanity check: rows where a player logged minutes but has no field-goal data
logged_minutes = games_details["MIN"].gt(0)
missing_fgm = games_details["FGM"].isna()
games_details.loc[logged_minutes & missing_fgm]

In [None]:
# Display the rows of players credited with zero minutes
games_details.loc[games_details["MIN"].eq(0)]

We also see that players that did not play have all their game statistics as null. We fill in these values with 0 for consistency

In [None]:
# Game-statistic columns that are null together when a player did not play.
stat_columns = [
    "FGM", "FGA", "FG_PCT",
    "FG3M", "FG3A", "FG3_PCT",
    "FTM", "FTA", "FT_PCT",
    "OREB", "DREB", "REB",
    "AST", "STL", "BLK", "TO", "PF", "PTS", "PLUS_MINUS",
]

# A row is treated as "did not play" only when every statistic (FT_PCT included) is null.
did_not_play = games_details[stat_columns].isnull().all(axis=1)

# Zero the statistics of exactly those rows. Rows that are only partially null
# (for example a missing PLUS_MINUS next to real counting stats) are left untouched.
games_details.loc[did_not_play, stat_columns] = 0

In [None]:
# Display the table to check the zero-filled statistics
games_details

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,START_POSITION,MIN,FGM,FGA,...,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS
0,22200477,1610612759,SAS,San Antonio,1629641,Romeo Langford,F,18.0,1.0,1.0,...,1.0,1.0,2.0,0.0,1.0,0.0,2.0,5.0,2.0,-2.0
1,22200477,1610612759,SAS,San Antonio,1631110,Jeremy Sochan,F,31.0,7.0,14.0,...,6.0,3.0,9.0,6.0,1.0,0.0,2.0,1.0,23.0,-14.0
2,22200477,1610612759,SAS,San Antonio,1627751,Jakob Poeltl,C,21.0,6.0,9.0,...,1.0,3.0,4.0,1.0,1.0,0.0,2.0,4.0,13.0,-4.0
3,22200477,1610612759,SAS,San Antonio,1630170,Devin Vassell,G,30.0,4.0,13.0,...,0.0,9.0,9.0,5.0,3.0,0.0,2.0,1.0,10.0,-18.0
4,22200477,1610612759,SAS,San Antonio,1630200,Tre Jones,G,27.0,7.0,12.0,...,0.0,2.0,2.0,3.0,0.0,0.0,2.0,2.0,19.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668623,11200005,1610612743,DEN,Denver,202706,Jordan Hamilton,BENCH,19.0,4.0,9.0,...,0.0,2.0,2.0,0.0,2.0,0.0,1.0,3.0,17.0,
668624,11200005,1610612743,DEN,Denver,202702,Kenneth Faried,BENCH,23.0,7.0,11.0,...,1.0,0.0,1.0,1.0,1.0,0.0,3.0,3.0,18.0,
668625,11200005,1610612743,DEN,Denver,201585,Kosta Koufos,BENCH,15.0,3.0,7.0,...,3.0,5.0,8.0,0.0,1.0,0.0,0.0,3.0,6.0,
668626,11200005,1610612743,DEN,Denver,202389,Timofey Mozgov,BENCH,19.0,1.0,1.0,...,1.0,2.0,3.0,1.0,0.0,0.0,4.0,2.0,2.0,


We notice that some of our data only has information between 2009 and 2019. We want to keep this consistent across all tables, so we clean our games details table by only keeping games that occurred between 2009 and 2019. This means that we have to join on the games table to get the date and season of each game, and then only keep the relevant ones.

In [None]:
# Look up each row's game date in the games table (left join on GAME_ID)
# NOTE(review): if games contains repeated GAME_ID values this join multiplies the
# matching games_details rows — confirm GAME_ID is unique in games.
game_dates = games[["GAME_DATE_EST", "GAME_ID"]]
games_details = games_details.merge(game_dates, how="left", on="GAME_ID")

In [None]:
# Keep only rows whose game date falls in calendar years 2009 through 2019 (inclusive);
# rows without a matched date are dropped as well
game_year = games_details["GAME_DATE_EST"].dt.year
games_details = games_details[game_year.between(2009, 2019)]

In [None]:
games_details = games_details.drop("GAME_DATE_EST", axis=1)

In [None]:
games_details = games_details.merge(games[["GAME_ID", "SEASON"]], on = "GAME_ID")

In [None]:
games_details["TURNO"] = games_details["TO"]
games_details.drop("TO", axis=1)

Unnamed: 0,GAME_ID,TEAM_ID,PLAYER_ID,START_POSITION,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,DREB,REB,AST,STL,BLK,PF,PTS,PLUS_MINUS,SEASON,TURNO
0,21900497,1610612738,202330,F,34.0,9.0,14.0,0.643,3.0,6.0,...,5.0,10.0,6.0,0.0,0.0,0.0,21.0,12.0,2019,2.0
1,21900497,1610612738,1628369,F,36.0,10.0,18.0,0.556,4.0,7.0,...,5.0,7.0,1.0,3.0,1.0,0.0,24.0,15.0,2019,3.0
2,21900497,1610612738,1628464,C,22.0,2.0,9.0,0.222,0.0,2.0,...,4.0,6.0,0.0,2.0,1.0,2.0,5.0,2.0,2019,0.0
3,21900497,1610612738,203935,G,30.0,3.0,12.0,0.250,1.0,7.0,...,4.0,5.0,7.0,1.0,0.0,4.0,7.0,9.0,2019,1.0
4,21900497,1610612738,202689,G,30.0,9.0,19.0,0.474,3.0,7.0,...,2.0,4.0,7.0,3.0,1.0,0.0,22.0,5.0,2019,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387015,11200005,1610612743,202706,BENCH,19.0,4.0,9.0,0.444,3.0,6.0,...,2.0,2.0,0.0,2.0,0.0,3.0,17.0,,2012,1.0
387016,11200005,1610612743,202702,BENCH,23.0,7.0,11.0,0.636,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,3.0,18.0,,2012,3.0
387017,11200005,1610612743,201585,BENCH,15.0,3.0,7.0,0.429,0.0,0.0,...,5.0,8.0,0.0,1.0,0.0,3.0,6.0,,2012,0.0
387018,11200005,1610612743,202389,BENCH,19.0,1.0,1.0,1.000,0.0,0.0,...,2.0,3.0,1.0,0.0,0.0,2.0,2.0,,2012,4.0


In [None]:
# Spot-check the rows that belong to the 2017 season
games_details.loc[games_details["SEASON"].eq(2017)]

Unnamed: 0,GAME_ID,TEAM_ID,PLAYER_ID,START_POSITION,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,SEASON,TURNO
167335,41700404,1610612744,201142,F,37.0,7.0,17.0,0.412,0.0,3.0,...,12.0,10.0,1.0,3.0,2.0,3.0,20.0,30.0,2017,2.0
167336,41700404,1610612744,203110,F,38.0,4.0,8.0,0.500,1.0,5.0,...,3.0,9.0,1.0,3.0,1.0,5.0,9.0,15.0,2017,1.0
167337,41700404,1610612744,201580,C,16.0,3.0,4.0,0.750,0.0,0.0,...,3.0,0.0,0.0,1.0,0.0,1.0,6.0,21.0,2017,0.0
167338,41700404,1610612744,202691,G,28.0,4.0,10.0,0.400,2.0,5.0,...,6.0,0.0,0.0,0.0,2.0,3.0,10.0,19.0,2017,2.0
167339,41700404,1610612744,201939,G,38.0,12.0,27.0,0.444,7.0,15.0,...,6.0,4.0,3.0,3.0,2.0,4.0,37.0,18.0,2017,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203049,11700002,1610612747,2736,BENCH,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,0.0
203050,11700002,1610612747,1628404,BENCH,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,0.0
203051,11700002,1610612747,201572,BENCH,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,0.0
203052,11700002,1610612747,1627362,BENCH,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,0.0


Players table does not need much cleaning

In [None]:
# Load the players table and print its dtypes and non-null counts
players = pd.read_csv("players.csv")
players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7228 entries, 0 to 7227
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PLAYER_NAME  7228 non-null   object
 1   TEAM_ID      7228 non-null   int64 
 2   PLAYER_ID    7228 non-null   int64 
 3   SEASON       7228 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 226.0+ KB


We realize that the rankings table contains rankings for each day. We are only interested in the final rankings of each season, so we only keep those where the games played G is 82.

In [None]:
# Load the standings table, leaving out the RETURNTOPLAY column
ranking = pd.read_csv("ranking.csv").drop(columns="RETURNTOPLAY")

In [None]:
# Print dtypes and non-null counts of the standings table
ranking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142009 entries, 0 to 142008
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   TEAM_ID        142009 non-null  int64  
 1   LEAGUE_ID      142008 non-null  float64
 2   SEASON_ID      142008 non-null  float64
 3   STANDINGSDATE  142008 non-null  object 
 4   CONFERENCE     142008 non-null  object 
 5   TEAM           142008 non-null  object 
 6   G              142008 non-null  float64
 7   W              142008 non-null  float64
 8   L              142008 non-null  float64
 9   W_PCT          142008 non-null  float64
 10  HOME_RECORD    142008 non-null  object 
 11  ROAD_RECORD    142008 non-null  object 
dtypes: float64(6), int64(1), object(5)
memory usage: 13.0+ MB


In [None]:
# Parse STANDINGSDATE into pandas datetime values
ranking = ranking.assign(STANDINGSDATE=pd.to_datetime(ranking["STANDINGSDATE"]))

In [None]:
# Keep only standings rows recorded after all 82 games had been played
ranking = ranking.loc[ranking["G"].eq(82)]

In [None]:
# For every (TEAM_ID, SEASON_ID) pair keep only the row with the latest standings date
chronological = ranking.sort_values("STANDINGSDATE")
ranking = chronological.groupby(["TEAM_ID", "SEASON_ID"]).tail(1)

In [None]:
# Label each remaining row with the calendar year of its standings date
# NOTE(review): in the games table a game played in June 2019 carries SEASON 2018,
# whereas this column is the year of the standings date — confirm the two SEASON
# columns are meant to follow the same convention before joining on them.
ranking = ranking.assign(SEASON=ranking["STANDINGSDATE"].dt.year)

In [None]:
# STANDINGSDATE and SEASON_ID are no longer needed once SEASON has been derived
ranking = ranking.drop(columns=["STANDINGSDATE", "SEASON_ID"])

In [None]:
# Remove the LEAGUE_ID column from the standings table
ranking = ranking.drop(columns=["LEAGUE_ID"])

In [None]:
# Display the final standings table
ranking

Unnamed: 0,TEAM_ID,CONFERENCE,TEAM,G,W,L,W_PCT,HOME_RECORD,ROAD_RECORD,SEASON
92909,1610612743,West,Denver,82.0,17.0,65.0,0.207,13-28,4-37,2003
92908,1610612746,West,L.A. Clippers,82.0,27.0,55.0,0.329,16-25,11-30,2003
92907,1610612763,West,Memphis,82.0,28.0,54.0,0.341,20-21,8-33,2003
92906,1610612744,West,Golden State,82.0,38.0,44.0,0.463,24-17,14-27,2003
92905,1610612760,West,Seattle,82.0,40.0,42.0,0.488,25-16,15-26,2003
...,...,...,...,...,...,...,...,...,...,...
5543,1610612737,East,Atlanta,82.0,43.0,39.0,0.524,27-14,16-25,2022
5542,1610612739,East,Cleveland,82.0,44.0,38.0,0.537,25-16,19-22,2022
5541,1610612751,East,Brooklyn,82.0,44.0,38.0,0.537,20-21,24-17,2022
5539,1610612761,East,Toronto,82.0,48.0,34.0,0.585,24-17,24-17,2022


Download the cleaned data

In [None]:
# Write every cleaned table to CSV without the index column.
# NOTE(review): these are the same file names the notebook reads its raw data from,
# so the raw files are overwritten and a second run from the top would load
# already-cleaned data — consider separate output names.
cleaned_tables = {
    "games.csv": games,
    "games_details.csv": games_details,
    "players.csv": players,
    "ranking.csv": ranking,
    "teams.csv": teams,
}
for csv_name, table in cleaned_tables.items():
    table.to_csv(csv_name, index=False)

In [None]:
from google.colab import files

# Trigger a browser download for each exported CSV, in the same order as the export
for csv_name in ("games.csv", "games_details.csv", "players.csv", "ranking.csv", "teams.csv"):
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Re-export only the games table and download it
# (repeats the games.csv steps of the combined export/download cells)
games.to_csv("games.csv", index=False)
from google.colab import files
files.download("games.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Re-export only the games_details table and download it
# (repeats the games_details.csv steps of the combined export/download cells)
games_details.to_csv("games_details.csv", index=False)
from google.colab import files
files.download("games_details.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Re-export only the ranking table and download it
# (repeats the ranking.csv steps of the combined export/download cells)
ranking.to_csv("ranking.csv", index=False)
from google.colab import files
files.download("ranking.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Re-export only the teams table and download it
# (repeats the teams.csv steps of the combined export/download cells)
teams.to_csv("teams.csv", index=False)
from google.colab import files
files.download("teams.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Re-export only the players table and download it
# (repeats the players.csv steps of the combined export/download cells)
players.to_csv("players.csv", index=False)
from google.colab import files
files.download("players.csv")