In [None]:
import pandas as pd

# Lift pandas' display limits so wide frames and long cell values are shown
# in full instead of being truncated in the notebook output.
for option_name in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(option_name, None)

In [None]:
# Load the cleaned game and box-score tables from the test-data folder
games_df = pd.read_csv("../tests/test_data/test_cleaned_games.csv")
boxscores_df = pd.read_csv("../tests/test_data/test_cleaned_boxscores.csv")

# Show (rows, columns) of the box-score table; use games_df.shape to inspect the games table
boxscores_df.shape

(741569, 17)

In [69]:
# Tag every box-score row with the season of its game.
# Only the join key and the season column are taken from games_df; the inner
# join keeps just the box-score rows whose game_id is present in games_df.
games_subset = games_df.loc[:, ["game_id", "season_start_year"]]
merged_df_to_get_year = pd.merge(
    boxscores_df,
    games_subset,
    on="game_id",
    how="inner",
)

merged_df_to_get_year.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151265 entries, 0 to 151264
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   game_id                   151265 non-null  int64  
 1   team_name                 151265 non-null  object 
 2   player_name               151265 non-null  object 
 3   minutes_played            151265 non-null  object 
 4   field_goals               151265 non-null  int64  
 5   field_goals_attempted     151265 non-null  int64  
 6   three_pointers            151265 non-null  int64  
 7   three_pointers_attempted  151265 non-null  int64  
 8   free_throws               151265 non-null  int64  
 9   free_throws_attempted     151265 non-null  int64  
 10  total_rebounds            151265 non-null  int64  
 11  assists                   151265 non-null  int64  
 12  points                    151265 non-null  int64  
 13  is_starter                151265 non-null  i

In [80]:
# Per-player, per-season summary table.
# Points, assists, rebounds and the three shooting-percentage columns are
# averaged over the player's box-score rows for that season; made
# three-pointers are summed. The "%_per_game" columns are therefore plain
# means of the per-game percentage columns.
player_stats_df = (
    merged_df_to_get_year
    .groupby(["player_name", "season_start_year"])
    .agg(**{
        # output column: (source column, aggregation)
        "points_per_game": ("points", "mean"),
        "assists_per_game": ("assists", "mean"),
        "rebounds_per_game": ("total_rebounds", "mean"),
        "field_goal_%_per_game": ("field_goals_percentage_%", "mean"),
        "three_point_%_per_game": ("three_point_percentage_%", "mean"),
        "free_throws_%_per_game": ("free_throws_percentage_%", "mean"),
        "total_three_pointers": ("three_pointers", "sum"),
    })
    # Averages of counting stats keep 1 decimal, averaged percentages keep 2;
    # the three-pointer total is left as is.
    .round({
        "points_per_game": 1,
        "assists_per_game": 1,
        "rebounds_per_game": 1,
        "field_goal_%_per_game": 2,
        "free_throws_%_per_game": 2,
        "three_point_%_per_game": 2,
    })
    .reset_index()
)

player_stats_df.head()

Unnamed: 0,player_name,season_start_year,points_per_game,assists_per_game,rebounds_per_game,field_goal_%_per_game,three_point_%_per_game,free_throws_%_per_game,total_three_pointers
0,A.J. Hammons,2016,1.1,0.1,0.8,12.89,8.52,8.33,5
1,A.J. Hammons,2017,0.0,0.0,0.0,0.0,0.0,0.0,0
2,Aaron Brooks,2015,6.0,2.2,1.2,31.2,25.39,22.34,66
3,Aaron Brooks,2016,4.1,1.6,0.9,27.07,22.47,20.25,48
4,Aaron Brooks,2017,0.9,0.2,0.2,12.75,8.33,4.58,11


In [71]:
# Pair every game with the team names found in its box-score rows.
# The box-score table has several rows per game and team, so the merged
# frame repeats each game/team combination.
boxscores_subset = boxscores_df.loc[:, ["game_id", "team_name"]]
merged_df_to_get_team_name = pd.merge(
    games_df,
    boxscores_subset,
    on="game_id",
    how="inner",
)

merged_df_to_get_team_name.head()

Unnamed: 0,season_start_year,away_team,points_away,home_team,points_home,date_time,game_id,team_name
0,2015,Cleveland Cavaliers,95,Chicago Bulls,97,27-10-2015,23859,Cleveland Cavaliers
1,2015,Cleveland Cavaliers,95,Chicago Bulls,97,27-10-2015,23859,Cleveland Cavaliers
2,2015,Cleveland Cavaliers,95,Chicago Bulls,97,27-10-2015,23859,Cleveland Cavaliers
3,2015,Cleveland Cavaliers,95,Chicago Bulls,97,27-10-2015,23859,Cleveland Cavaliers
4,2015,Cleveland Cavaliers,95,Chicago Bulls,97,27-10-2015,23859,Cleveland Cavaliers


In [79]:
def determine_winner(row):
    """Tell whether the team named in ``row`` won the game described by ``row``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``team_name``, ``home_team``, ``away_team``, ``points_home``
            and ``points_away``.

    Returns:
        1 if the team scored strictly more points than its opponent, 0 if it
        did not, and None when ``team_name`` matches neither ``home_team``
        nor ``away_team``.
    """
    team = row["team_name"]
    if team == row["home_team"]:
        team_points, opponent_points = row["points_home"], row["points_away"]
    elif team == row["away_team"]:
        team_points, opponent_points = row["points_away"], row["points_home"]
    else:
        # The team did not take part in this game as either side.
        return None
    return int(team_points > opponent_points)

# Build the per-season record of every team: games, wins, losses and win %.
#
# The merged frame repeats each game/team combination many times, so the
# duplicates are dropped BEFORE the row-wise apply; the win flag is then
# computed once per distinct row rather than once per repeated row.
# `assign` returns a new frame, so the frame created by the earlier merge cell
# is not modified and this cell gives the same result when re-run.
team_wins_df = (
    merged_df_to_get_team_name
        .drop_duplicates()
        .assign(won_game=lambda games: games.apply(determine_winner, axis=1))
)

# One pass over the groups yields both figures:
#   total_games - number of rows with a non-missing win flag
#   total_wins  - sum of the 0/1 win flags
team_stats_df = (
    team_wins_df.groupby(["season_start_year", "team_name"])["won_game"]
        .agg(total_games="count", total_wins="sum")
        .reset_index()
)

# Losses are the games that were not won
team_stats_df["total_losses"] = team_stats_df["total_games"] - team_stats_df["total_wins"]
# Win percentage, rounded to two decimals
team_stats_df["win_%"] = (team_stats_df["total_wins"] / team_stats_df["total_games"] * 100).round(2)


team_stats_df.head()

Unnamed: 0,season_start_year,team_name,total_games,total_wins,total_losses,win_%
0,2015,Atlanta Hawks,82,48,34,58.54
1,2015,Boston Celtics,82,48,34,58.54
2,2015,Brooklyn Nets,82,21,61,25.61
3,2015,Charlotte Hornets,82,48,34,58.54
4,2015,Chicago Bulls,82,42,40,51.22


In [73]:
# Write the per-player season statistics to a CSV in the test-data folder,
# leaving out the DataFrame index.
FILE_PATH = "../tests/test_data/test_player_stats.csv"
player_stats_df.to_csv(path_or_buf=FILE_PATH, index=False)

In [74]:
# Save team stats DataFrame to a CSV (the DataFrame index is not written)
FILE_PATH = "../tests/test_data/test_team_stats.csv"
team_stats_df.to_csv(FILE_PATH, index=False)