# Clean up Players data

raw_data_file = "../raw/retrosheet/players.csv"

In [19]:
import pandas as pd
import json
import csv
from collections import defaultdict as ddict

In [20]:
# Load the raw Retrosheet players table into a DataFrame.
players_csv_path = "../raw/retrosheet/players.csv"
df = pd.read_csv(players_csv_path)

In [21]:
# Helper functions for functional dependencies.

# Print violations of functional dependencies.
def print_fd(deps, key, dep):
    """Print a header line, then every key that has more than one dependent.

    Args:
        deps: Mapping from a key value to the collection of dependent values
            observed for it.
        key: Name of the determinant column (printed in the header).
        dep: Name of the dependent column (printed in the header).
    """
    print(key, dep, sep="\t")
    # A key with more than one dependent value violates key -> dep.
    violations = (
        (determinant, dependents)
        for determinant, dependents in deps.items()
        if len(dependents) > 1
    )
    for determinant, dependents in violations:
        print(f"{determinant}:=\t{dependents}")

# Extract the functional dependencies from the dataframe.
def fd(pdf, key, dep):
    """Collect, for each value of column `key`, the set of `dep` values seen with it.

    The resulting mapping is handed to print_fd (which prints the header and
    every key associated with more than one dependent value) and then returned.

    Args:
        pdf: DataFrame containing both columns.
        key: Name of the determinant column.
        dep: Name of the dependent column.

    Returns:
        A defaultdict(set) mapping each `key` value to the set of `dep` values
        that occur in the same row.
    """
    dets = ddict(set)
    # Walk the two columns in lockstep rather than calling a row-wise
    # DataFrame.apply for its side effects: each value keeps its own column's
    # dtype, no per-row Series is built, and an empty frame simply yields an
    # empty mapping.
    for determinant, dependent in zip(pdf[key], pdf[dep]):
        dets[determinant].add(dependent)
    print_fd(dets, key, dep)
    return dets

# Replace based on functional dependencies.
def replace_fd(pdf, index, key, rmap):
    """Replace one cell in place with its mapped value, if a mapping exists.

    Args:
        pdf: DataFrame to modify in place.
        index: Row label of the cell.
        key: Column label of the cell.
        rmap: Mapping from current cell values to replacement values.
    """
    current = pdf.loc[index, key]
    # Values without an entry in rmap are left untouched.
    if current not in rmap:
        return
    pdf.loc[index, key] = rmap[current]

In [22]:
# Show the column labels of the raw players table.
df.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID'],
      dtype='object')

In [23]:
# Check playerID -> retroID; fd prints any playerID seen with more than one retroID.
_ = fd(pdf=df, key="playerID", dep="retroID")

playerID	retroID


In [24]:
# Count rows whose retroID repeats an earlier row's retroID.
retro_id_is_repeat = df[["retroID"]].duplicated()
retro_id_is_repeat.sum()

0

In [25]:
# Build a retroID -> ID lookup table, numbering rows 0..n-1 in file order,
# and write it out.
df_retroIdMap = df.loc[:, ["retroID"]].copy()
df_retroIdMap["ID"] = range(df_retroIdMap.shape[0])
df_retroIdMap.to_csv("../processed/data/playerretroID.csv", index=False)

In [26]:
# Dictionary form of the lookup table: retroID -> ID.
rMapPlayerRetroID = dict(zip(df_retroIdMap["retroID"], df_retroIdMap["ID"]))

In [27]:
# Swap each retroID for its ID from rMapPlayerRetroID, one row at a time.
for row_label in df.index:
    replace_fd(df, row_label, "retroID", rMapPlayerRetroID)

# The column now holds IDs, so rename it accordingly.
df.rename(columns=dict(retroID="ID"), inplace=True)

In [28]:
# Write the playerID -> ID lookup table.
df_playerIDMap = df.loc[:, ["playerID", "ID"]].copy()
df_playerIDMap.to_csv("../processed/data/playerID.csv", index=False)

# Write the bbrefID -> ID lookup table.
df_bbrefIDMap = df.loc[:, ["bbrefID", "ID"]].copy()
df_bbrefIDMap.to_csv("../processed/data/playerbbrefID.csv", index=False)

# Both identifier columns have been exported; remove them from the main table.
df.drop(["playerID", "bbrefID"], axis=1, inplace=True)

In [29]:
# Show the column labels remaining after the identifier columns were dropped.
df.columns

Index(['birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState',
       'birthCity', 'deathYear', 'deathMonth', 'deathDay', 'deathCountry',
       'deathState', 'deathCity', 'nameFirst', 'nameLast', 'nameGiven',
       'weight', 'height', 'bats', 'throws', 'debut', 'finalGame', 'ID'],
      dtype='object')

In [30]:
# Names of the death-related columns, in the order they appear in the table.
drop_list = [
    f"death{suffix}"
    for suffix in ("Year", "Month", "Day", "Country", "State", "City")
]

In [31]:
# Remove every column named in drop_list from the table, in place.
df.drop(drop_list, axis=1, inplace=True)

In [32]:
# Check ID -> nameFirst and ID -> nameLast; fd prints any ID seen with more
# than one value in the given name column.
for name_column in ("nameFirst", "nameLast"):
    _ = fd(df, "ID", name_column)

ID	nameFirst
ID	nameLast


In [33]:
# Remove the finalGame column from the table, in place.
df.drop(["finalGame"], axis=1, inplace=True)

In [34]:
# Write the cleaned player table, without the row index.
player_csv_path = "../processed/data/player.csv"
df.to_csv(player_csv_path, index=False)