In [43]:
import pandas as pd
import re

# Load the three CSV inputs, in order: per-player game rows, the game
# schedule, and the team reference table.
plays_in, games, teams = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../Stage_3/Datasets/plays_in.csv",
        "../Stage_3/Datasets/Game.csv",
        "../Stage_3/Datasets/team.csv",
    )
]

In [44]:
# Build a lookup from team abbreviation to full team name.
# Surrounding whitespace is trimmed from every key and every value.
names = teams["Team name"]
abbrevs = teams["Team Abbrv"]
abbrev_to_name_map = {
    team_abbrev.strip(): team_name.strip()
    for team_name, team_abbrev in zip(names, abbrevs)
}

# The two counts agree only when every stripped abbreviation is unique;
# a duplicate would silently overwrite an earlier entry in the dict.
print("Length of hash-map:", len(abbrev_to_name_map))
print("Length of team ds:", len(teams))

Length of hash-map: 30
Length of team ds: 30


In [49]:
# Parse the Date column of both frames and keep only the calendar date
# (datetime.date objects, no time-of-day component).
for frame in (games, plays_in):
    frame["Date"] = pd.to_datetime(frame["Date"]).dt.date

# Rewrite three team codes in plays_in: CHO -> CHA, BRK -> BKN, PHO -> PHX.
# Presumably these are the spellings used in the team table - verify against it.
abbrev_fixes = {"CHO": "CHA", "BRK": "BKN", "PHO": "PHX"}
for team_col in ("Tm", "Opp"):
    plays_in[team_col] = plays_in[team_col].replace(abbrev_fixes)

In [54]:
# Add Game ID to plays in
# Index every game by (home team name, visitor team name, date) -> GameID.
# Team names are stripped so they compare equal to the stripped names
# looked up from abbrev_to_name_map below.
game_id_lookup = {
    (home_name.strip(), visitor_name.strip(), game_date): gid
    for home_name, visitor_name, game_date, gid in zip(
        games["home team name"], games["Visitor team name"],
        games["Date"], games["GameID"],
    )
}

game_ids = []
for player_team_abv, other_team_abv, date in zip(plays_in["Tm"], plays_in["Opp"], plays_in["Date"]):
    # A KeyError here means the abbreviation is missing from abbrev_to_name_map.
    player_team = abbrev_to_name_map[player_team_abv].strip()
    other_team = abbrev_to_name_map[other_team_abv].strip()

    # The player's team may be either the home or the visiting side, so try
    # both orientations. The fallback is guarded by an explicit `is None`
    # check rather than `or`: with `or`, a legitimate but falsy GameID
    # (e.g. 0) found in the first orientation would be thrown away and
    # replaced by the result of the reversed lookup.
    game_id = game_id_lookup.get((player_team, other_team, date))
    if game_id is None:
        game_id = game_id_lookup.get((other_team, player_team, date))
    game_ids.append(game_id)

# Rows with no matching game in either orientation receive None.
plays_in["GameID"] = game_ids

In [55]:
# Move PlayerID and GameID to the front; every other column keeps its
# existing relative order.
leading_cols = ["PlayerID", "GameID"]
cols = leading_cols + [col for col in plays_in.columns if col not in leading_cols]
plays_in = plays_in[cols]

In [None]:
# Sanity checks on the GameID column that was attached to plays_in.
# The membership mask is computed once and reused for both the all-rows
# check and the list of offending values.
game_id_in_games = plays_in["GameID"].isin(games["GameID"])
missing_game_ids = plays_in.loc[~game_id_in_games, "GameID"].unique()

# Rows for which no game was found (GameID is null).
print("Unmapped rows in plays_in (GameID):", plays_in["GameID"].isna().sum())
# Row count vs. distinct GameIDs: equal numbers mean GameID is unique in games.
print("Num rows in games:", len(games))
print("Unique GameIDs in games:", len(games["GameID"].unique()))
# Referential check: every GameID in plays_in must exist in games.
print("All GameIDs in plays_in are in games:", game_id_in_games.all())
# Number of distinct games that plays_in rows refer to.
print("Unique GameIDs in plays_in:", plays_in["GameID"].nunique())
print("Mismatched GameIDs (if any):", missing_game_ids)

Unmapped rows in plays_in (GameID): 0
Num rows in games: 1275
Unique GameIDs in games: 1275
All GameIDs in plays_in are in games: True
Unique GameIDs in plays_in: 767
Mismatched GameIDs (if any): []
