# **MongoDB to Staging Area**

#### Import Required Libraries

In [1]:
from pymongo import MongoClient
from bson.json_util import dumps
from bson.json_util import loads
import pandas as pd
from tqdm.notebook import tqdm

#### Establish MongoDB Connection

In [2]:
client = MongoClient('localhost', 27017)

db = client['NBA-Stats']

player_cltn = db['players']
games_cltn = db['Games']
player_game_stats = db['player_game_stats']

#### Define Function to Retrieve MongoDB Query Results

In [3]:
def get_results(query):
    return loads(dumps(query))

#### Test Query Functionality

In [4]:
# Get All Relevant Data from "Player_Stats" Collection
query = get_results(player_cltn.find({}, {"resultSets": {"$slice": [0, 1]}}))

# Define Dictionary Keys as List
headers = query[0]['resultSets'][0]['headers']

# Define Dictionary Values as List
results = query[0]['resultSets'][0]['rowSet'][0]

results

[76001,
 '1990-91',
 '00',
 1610612757,
 'POR',
 23.0,
 43,
 0,
 290.0,
 55,
 116,
 0.474,
 0,
 0,
 0.0,
 25,
 44,
 0.568,
 27,
 62,
 89,
 12,
 4,
 12,
 22,
 39,
 135]

#### Initialize Dictionary with First Row

In [5]:
res = {}
for key in headers:
            # res[key] = results[vali]
            for value in results:
                res[key] = value
                results.remove(value)
                break
res

{'PLAYER_ID': 76001,
 'SEASON_ID': '1990-91',
 'LEAGUE_ID': '00',
 'TEAM_ID': 1610612757,
 'TEAM_ABBREVIATION': 'POR',
 'PLAYER_AGE': 23.0,
 'GP': 43,
 'GS': 0,
 'MIN': 290.0,
 'FGM': 55,
 'FGA': 116,
 'FG_PCT': 0.474,
 'FG3M': 0,
 'FG3A': 0,
 'FG3_PCT': 0.0,
 'FTM': 25,
 'FTA': 44,
 'FT_PCT': 0.568,
 'OREB': 27,
 'DREB': 62,
 'REB': 89,
 'AST': 12,
 'STL': 4,
 'BLK': 12,
 'TOV': 22,
 'PF': 39,
 'PTS': 135}

#### Construct Script to Load All Data From "Player_Stats" Collection to Pandas DataFrame

In [6]:
# define DataFrame with test data
df = pd.DataFrame(res, index=[0])

# redefine "query" variable
query = get_results(player_cltn.find({}, {"resultSets": {"$slice": [0, 1]}}))

# redefine "headers" variable
headers = query[0]['resultSets'][0]['headers']

# Use tqdm to view upload progress
for player in tqdm(query, ):
    for results in player['resultSets'][0]['rowSet']:
        res = {}
        for key in headers:
            for value in results:
                res[key] = value
                results.remove(value)
                break
        temp_player_stats = pd.concat([df, pd.DataFrame(res, index=[0])])

temp_player_stats

  0%|          | 0/4902 [00:00<?, ?it/s]

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,76001,1990-91,00,1610612757,POR,23.0,43,0,290.0,55,...,0.568,27,62,89,12,4,12,22,39,135
0,76001,1990-91,00,1610612757,POR,23.0,43,0,290.0,55,...,0.568,27,62,89,12,4,12,22,39,135
0,76001,1991-92,00,1610612757,POR,24.0,71,1,934.0,178,...,0.752,81,179,260,30,25,16,66,132,432
0,76001,1992-93,00,1610612749,MIL,25.0,12,0,159.0,26,...,0.75,12,25,37,10,6,4,13,24,64
0,76001,1992-93,00,1610612738,BOS,25.0,63,52,1152.0,219,...,0.76,114,186,300,17,19,22,84,165,514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1627826,2020-21,00,1610612746,LAC,24.0,72,33,1609.0,257,...,0.789,189,330,519,90,24,62,81,187,650
0,1627826,2021-22,00,1610612746,LAC,25.0,76,76,1852.0,310,...,0.727,217,427,644,120,36,77,114,203,785
0,1627826,2022-23,00,1610612746,LAC,26.0,76,76,2169.0,326,...,0.697,236,520,756,77,29,98,117,219,818
0,1627826,2023-24,00,1610612746,LAC,26.0,36,36,976.0,193,...,0.641,115,233,348,45,8,49,41,105,452


#### Add Descriptive Column to Pandas DataFrame

In [46]:
temp_player_stats['Category'] = 'Regular Season Totals - By Season'
temp_player_stats.head()

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,Category
0,76001,1990-91,00,1610612757,POR,23.0,43,0,290.0,55,...,27,62,89,12,4,12,22,39,135,Regular Season Totals - By Season
0,76001,1990-91,00,1610612757,POR,23.0,43,0,290.0,55,...,27,62,89,12,4,12,22,39,135,Regular Season Totals - By Season
0,76001,1991-92,00,1610612757,POR,24.0,71,1,934.0,178,...,81,179,260,30,25,16,66,132,432,Regular Season Totals - By Season
0,76001,1992-93,00,1610612749,MIL,25.0,12,0,159.0,26,...,12,25,37,10,6,4,13,24,64,Regular Season Totals - By Season
0,76001,1992-93,00,1610612738,BOS,25.0,63,52,1152.0,219,...,114,186,300,17,19,22,84,165,514,Regular Season Totals - By Season
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1627826,2020-21,00,1610612746,LAC,24.0,72,33,1609.0,257,...,189,330,519,90,24,62,81,187,650,Regular Season Totals - By Season
0,1627826,2021-22,00,1610612746,LAC,25.0,76,76,1852.0,310,...,217,427,644,120,36,77,114,203,785,Regular Season Totals - By Season
0,1627826,2022-23,00,1610612746,LAC,26.0,76,76,2169.0,326,...,236,520,756,77,29,98,117,219,818,Regular Season Totals - By Season
0,1627826,2023-24,00,1610612746,LAC,26.0,36,36,976.0,193,...,115,233,348,45,8,49,41,105,452,Regular Season Totals - By Season


#### Perform the following Data Cleaning Tasks:
##### - Remove NULL Values (as appropriate)
##### - Change Data Types (as appropriate)
###### *I identified these steps when I was uploading the eventual CSV to the Staging Table in PostgreSQL

In [None]:
int_columns = ['GS', 'FGM', 'FGA', 'FG3M', 'FG3A', 'OREB', 'DREB', 'REB', 'STL', 'BLK', 'TOV']
float_columns = ['MIN', 'FG_PCT', 'FG3_PCT', 'FT_PCT']

for column in int_columns:
    temp_player_stats[column] = temp_player_stats.dropna()
    temp_player_stats[column] = temp_player_stats[column].astype(int)
    
for column in float_columns:
    temp_player_stats[column] = temp_player_stats.dropna()
    temp_player_stats[column] = temp_player_stats[column].astype(float)


temp_player_stats.info()

In [None]:
temp_player_stats.to_csv('player_stats1.csv', index=False)

In [None]:
from tqdm.notebook import tqdm
seasons = ["2003-04", "2004-05", "2006-07", "2007-08", "2008-09", "2009-10", "2010-11", "2011-12", "2012-13", "2013-14", "2014-15", "2015-16", "2016-17", "2017-18", "2018-19", "2019-20", "2020-21", "2021-22", "2022-23", "2023-24"]
for season in seasons:
    results = get_results(games_cltn.find({"SEASON_YEAR":season}))
    try:
        res = results[0]
        df = pd.DataFrame(res, index=[0])
    except IndexError:
        pass
    
    i = 1
    for player in tqdm(results, ):
        try:
            res = results[i]
            df = pd.concat([df, pd.DataFrame(res, index=[0])])
        except IndexError:
            pass
        i += 1
    df.to_csv(f"{season} games.csv", index=False)

In [None]:
df1 = pd.read_csv("2003-04 games.csv")
df2 = pd.read_csv("2004-05 games.csv")
df3 = pd.read_csv("2006-07 games.csv")
df4 = pd.read_csv("2007-08 games.csv")
df5 = pd.read_csv("2008-09 games.csv")
df6 = pd.read_csv("2009-10 games.csv")
df7 = pd.read_csv("2010-11 games.csv")
df8 = pd.read_csv("2011-12 games.csv")
df9 = pd.read_csv("2012-13 games.csv")
df10 = pd.read_csv("2013-14 games.csv")
df11 = pd.read_csv("2014-15 games.csv")
df12 = pd.read_csv("2015-16 games.csv")
df13 = pd.read_csv("2016-17 games.csv")
df14 = pd.read_csv("2017-18 games.csv")
df15 = pd.read_csv("2018-19 games.csv")
df16 = pd.read_csv("2019-20 games.csv")
df17 = pd.read_csv("2020-21 games.csv")
df18 = pd.read_csv("2021-22 games.csv")
df19 = pd.read_csv("2022-23 games.csv")
df20 = pd.read_csv("2023-24 games.csv")
df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15, df16, df17, df18, df19, df20])
df.drop(columns=['_id'], inplace=True)
df.to_csv("all games.csv", index=False)

In [None]:
from datetime import datetime
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])
df['GAME_DATE'] = df['GAME_DATE'].astype(str)
df['MIN'] = df['MIN'].astype(int)
df['TOV'] = df['TOV'].astype(int)
df.to_csv("all games.csv", index=False, header=False)

In [177]:
results = get_results(db.player_game_stats.find({}))

In [179]:
# df1 = pd.DataFrame(results[0], index=[0])
# df1.to_csv('player_game_stats.csv', index=False)
index = 1
for i in tqdm(results,):
    df2 = pd.DataFrame(results[index], index=[0])
    df2.to_csv('player_game_stats.csv', mode='a', index=False, header=False)
    index += 1

  0%|          | 0/486529 [00:00<?, ?it/s]

IndexError: list index out of range

In [181]:
df = pd.read_csv('player_game_stats.csv')

In [182]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486529 entries, 0 to 486528
Data columns (total 69 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   _id                    486529 non-null  object 
 1   SEASON_YEAR            486529 non-null  object 
 2   PLAYER_ID              486529 non-null  int64  
 3   PLAYER_NAME            486529 non-null  object 
 4   NICKNAME               486529 non-null  object 
 5   TEAM_ID                486529 non-null  int64  
 6   TEAM_ABBREVIATION      486529 non-null  object 
 7   TEAM_NAME              486529 non-null  object 
 8   GAME_ID                486529 non-null  int64  
 9   GAME_DATE              486529 non-null  object 
 10  MATCHUP                486529 non-null  object 
 11  WL                     486529 non-null  object 
 12  MIN                    486529 non-null  float64
 13  FGM                    486529 non-null  int64  
 14  FGA                    486529 non-nu