## Exploratory Data Analysis

In [58]:
import pandas as pd
import matplotlib as mpl
from pathlib import Path

In [59]:
awards = pd.read_csv("project_data/awards_players.csv")
coaches = pd.read_csv(Path("project_data", "coaches.csv"))
players_teams = pd.read_csv(Path("project_data", "players_teams.csv"))
players = pd.read_csv(Path("project_data", "players.csv"))
series_post = pd.read_csv(Path("project_data", "series_post.csv"))
teams_post = pd.read_csv(Path("project_data", "teams_post.csv"))
teams = pd.read_csv(Path("project_data", "teams.csv"))

### 1. Search for irrelevant attributes/features

#### Awards Table

In [61]:
# Count of different lgIDs
print("Count of different lgIDs:")
print(awards['lgID'].value_counts())
print("\n")
# All lgIDs are the same, so we can assume they are irrelevant to our analysis

Count of different lgIDs:
lgID
WNBA    95
Name: count, dtype: int64




#### Coaches Table

In [62]:
# Count of different lgIDs
print("Count of different lgIDs:")
print(coaches['lgID'].value_counts())
print("\n")
# All lgIDs are the same, so we can assume they are irrelevant to our analysis

Count of different lgIDs:
lgID
WNBA    162
Name: count, dtype: int64




#### Players Teams Table

In [63]:
# Count of different lgIDs
print("Count of different lgIDs:")
print(players_teams['lgID'].value_counts())
print("\n")
# All lgIDs are the same, so we can assume they are irrelevant to our analysis

Count of different lgIDs:
lgID
WNBA    1876
Name: count, dtype: int64




#### Players Table

In [66]:
# Player firstSeason and lastSeason Distribution
print("Value Distribution of firstseason")
print(players['firstseason'].value_counts())
print("\n")

print("Value Distribution of lastseason")
print(players['lastseason'].value_counts())
print("\n")

# Devemos preencher estes valores de alguma forma. Podemos olhar para o primeiro ano em que jogaram na tabela players_teams para firstSeason...

# Player Death Date Distribution 0000-00-00 vs not 0000-00-00 (0000-00-00 means alive)
print("Value Distribution of deathDate")
print(players['deathDate'].value_counts())
print("\n")

# Player collegeOther Distribution NULL vs NOT NULL
print("Value Distribution of collegeOther")
print(players['collegeOther'].isnull().value_counts())

Value Distribution of firstseason
firstseason
0    893
Name: count, dtype: int64


Value Distribution of lastseason
lastseason
0    893
Name: count, dtype: int64


Value Distribution of deathDate
deathDate
0000-00-00    889
2011-05-27      1
1999-08-19      1
1999-01-18      1
2001-05-12      1
Name: count, dtype: int64


Value Distribution of collegeOther
collegeOther
True     882
False     11
Name: count, dtype: int64


#### Series Post, Teams_Post and Teams

In [68]:
# Count of different lgIDs in series_post
print("Count of different lgIDs in series_post:")
print(series_post['lgIDWinner'].value_counts())
print(series_post['lgIDLoser'].value_counts())
print("\n")

# Count of different lgIDs in teams_post
print("Count of different lgIDs in teams_post:")
print(teams_post['lgID'].value_counts())
print("\n")

# Count of different lgIDs in teams
print("Count of different lgIDs in teams:")
print(teams['lgID'].value_counts())
# All lgIDs are the same, so we can assume they are irrelevant to our analysis

Count of different lgIDs in series_post:
lgIDWinner
WNBA    70
Name: count, dtype: int64
lgIDLoser
WNBA    70
Name: count, dtype: int64


Count of different lgIDs in teams_post:
lgID
WNBA    80
Name: count, dtype: int64


Count of different lgIDs in teams:
lgID
WNBA    142
Name: count, dtype: int64


List of rookies (by year):

In [None]:
rookies_by_year = {}
min_year = players_teams['year'].min()
max_year = players_teams['year'].max()
previous_years_players = set()

for year in range(min_year + 1,max_year + 1):
    current_years = players_teams[players_teams['year'] == year]
    current_years_players = set(current_years['playerID'])

    rookies = current_years_players - previous_years_players
    rookies_by_year[year] = rookies

    previous_years_players.update(current_years_players)

for year, rookies in rookies_by_year.items():
    print(f"Year: {year}")
    print(f"Rookies: {rookies}")

Year: 2
Rookies: {'pettibr01w', 'salesny01w', 'campbed01w', 'badertr01w', 'iveyni01w', 'radunha01w', 'johnsja01w', 'nagyan01w', 'shakiel01w', 'whitiva01w', 'reddja01w', 'figgsuk01w', 'malcona01w', 'gilloje01w', 'mcculda01w', 'weathte01w', 'vealkr01w', 'barnequ01w', 'grubigo01w', 'baranel01w', 'aldrima01w', 'levanni01w', 'johnssh01w', 'robincr01w', 'goodsad01w', 'ndiayas01w', 'nolande01w', 'burseja01w', 'vodicka01w', 'cantydo01w', 'herriam01w', 'kubikni01w', 'dossaci01w', 'enissh01w', 'rileyru01w', 'walsema01w', 'martima01w', 'hillec01w', 'pridely01w', 'schumke01w', 'luzhe01w', 'douglka01w', 'ferdima01w', 'santoal01w', 'wynneda01w', 'feastal01w', 'streiju01w', 'wyckobr01w', 'folklkr01w', 'crawlsy01w', 'pavlimi01w', 'rizzoje01w', 'mccrini01w', 'smithch03w', 'bauerca01w', 'griffyo01w', 'jacksla01w', 'wolteka01w', 'grginve01w', 'lobore01w', 'taylope01w', 'harrili01w', 'darlihe01w', 'milleke01w', 'williwe01w', 'hammobe01w', 'blackde01w', 'jacksta02w', 'dixonta01w', 'kingija01w', 'hicksje01w

### 2. Data Cleaning

### 3. Visual Data Representation (Plots and Graphs)