In [61]:
import pandas as pd

# Load the draft history CSV; every later cell reshapes this frame.
history = pd.read_csv(filepath_or_buffer="../Stage_3/Datasets/draft_history.csv")

In [62]:
# Remove Redundant Columns
# These four columns are dropped up front; nothing later in the notebook
# reads them.
history = history.drop(
    ["player_name", "team_city", "team_name", "team_abbreviation"],
    axis="columns",
)

In [63]:
# Make Organization its own table
#
# Map every non-missing organization name to its type. When a name occurs on
# several rows, the type from the last such row is kept, because a later dict
# entry overwrites the value of an earlier one.
org_to_type = {
    org_name: org_type
    for org_name, org_type in zip(history["organization"], history["organization type"])
    if pd.notna(org_name)
}

# A dict keeps each key at the position where it was first inserted, so its
# keys are exactly the distinct organizations in order of first appearance.
# This replaces a linear `in` scan over a growing list for every row.
orgs = list(org_to_type)

# Missing or empty types are stored as pd.NA. pd.isna() is used rather than a
# truthiness test: bool(NaN) is True and bool(pd.NA) raises TypeError, so
# `not value` is not a reliable missing-value check.
types = [
    pd.NA if pd.isna(org_type) or org_type == "" else org_type
    for org_type in org_to_type.values()
]

# Surrogate keys: consecutive integers starting at 1, in first-appearance order.
ids = list(range(1, len(orgs) + 1))

orgs_df = pd.DataFrame({
    "OrganizationID" : ids,
    "OrganizationName" : orgs,
    "Type" : types
})


In [64]:
# Sanity Checks
# Report row/key counts for the organization table so duplicate, missing or
# blank names (and missing types) are visible before anything is written out.
name_col = orgs_df["OrganizationName"]

print("Total rows in orgs_df:", len(orgs_df))
print("Unique OrganizationNames:", name_col.nunique())

# keep=False flags every member of a duplicated group, not only the repeats.
dupes = orgs_df[name_col.duplicated(keep=False)]
print("Duplicate organizations found:", len(dupes))
if len(dupes) > 0:
    print(dupes.head())

print("Missing OrganizationName:", name_col.isna().sum())
# Cast to str first so the whitespace check also runs on non-string values.
print("Blank OrganizationName:", name_col.astype(str).str.strip().eq("").sum())
print("Missing OrganizationType:", orgs_df["Type"].isna().sum())

Total rows in orgs_df: 903
Unique OrganizationNames: 903
Duplicate organizations found: 0
Missing OrganizationName: 0
Blank OrganizationName: 0
Missing OrganizationType: 0


In [65]:
# Add orgs ID to history table
#
# Replace the free-text organization columns with a foreign key into orgs_df,
# then add a 1-based surrogate key (DraftID) as the first column.
org_to_id = dict(zip(orgs_df["OrganizationName"], orgs_df["OrganizationID"]))

# Vectorised lookup: names absent from the mapping (including missing names)
# come back as NaN.
organization_ids = history["organization"].map(org_to_id)

# Fail loudly, as a direct dict lookup would, if a real organization name has
# no entry in orgs_df; only rows with a missing name may stay unmapped.
unmatched = history["organization"].notna() & organization_ids.isna()
if unmatched.any():
    raise KeyError(
        "organizations missing from orgs_df: "
        f"{history.loc[unmatched, 'organization'].unique()[:5].tolist()}"
    )

# Nullable Int64 keeps the IDs as integers (written as 1, not 1.0) while
# still allowing <NA> for rows without an organization.
history["OrganizationID"] = organization_ids.astype("Int64")
history = history.drop(columns=["organization", "organization type"])
history["DraftID"] = range(1, 1 + len(history))

# Move DraftID to the front; all other columns keep their relative order.
cols = ["DraftID"] + [c for c in history.columns if c != "DraftID"]
history = history[cols]


In [68]:
# Write both tables next to the source data, without the pandas row index.
for frame, out_path in (
    (history, "../Stage_3/Datasets/3nf_draft_history.csv"),
    (orgs_df, "../Stage_3/Datasets/bcnf_organization.csv"),
):
    frame.to_csv(out_path, index=False)