# Data Cleaning 

In [5]:
import os
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

In [6]:
# Read the raw coaches, players_teams and teams tables from CSV files located
# relative to the current working directory.
coaches, players_teams, teams = map(
    pd.read_csv,
    ("coaches.csv", "players_teams.csv", "teams.csv"),
)

## Table Prep

In [14]:
# Helper plus entry point that write, for every year, a full "training" CSV
# and a reduced-column "test" CSV for each of the three source tables.
def _write_year_split(table_df, year, test_cols, table_name, out_dir):
    """Write one table's training and test CSVs for a single year.

    Args:
        table_df: Source DataFrame; must contain a ``year`` column.
        year: Value of ``year`` selecting the rows to export.
        test_cols: Columns kept in the test CSV.
        table_name: Label embedded in the output file names.
        out_dir: ``pathlib.Path`` of an existing directory to write into.

    Raises:
        KeyError: If ``year`` or any of ``test_cols`` is missing from
            ``table_df``.
    """
    training = table_df[table_df["year"] == year]
    # Select the test columns before writing anything, so a missing column
    # fails without leaving a lone training file for this table.
    test = training[test_cols]
    training.to_csv(out_dir / f"year_{year}_{table_name}_training.csv", index=False)
    test.to_csv(out_dir / f"year_{year}_{table_name}_test.csv", index=False)


def split_yearly_train_test(
    coaches_df, players_teams_df, teams_df, output_dir="yearly_data",
):
    """Save per-year training and test CSVs for the three tables.

    For every distinct value of ``players_teams_df["year"]`` (in ascending
    order) and for each table, two files are written to ``output_dir``:

    * ``year_<year>_<table>_training.csv``: all rows of that year with
      every column.
    * ``year_<year>_<table>_test.csv``: the same rows restricted to the
      identifying columns listed below.

    That is six CSV files per year. Existing files with the same names are
    overwritten. Years that appear only in ``coaches_df`` or ``teams_df``
    are not exported, because the list of years comes from
    ``players_teams_df`` alone.

    Args:
        coaches_df: Coaches table.
        players_teams_df: Player/team table; also defines which years are
            processed.
        teams_df: Teams table.
        output_dir: Directory to write into (``str`` or path-like). It is
            created, including parents, when missing.

    Returns:
        None. Progress is reported on standard output.

    Raises:
        KeyError: If a table lacks ``year`` or one of its test columns.
    """
    # (table label used in file names, source frame, columns kept for "test"),
    # listed in the order the files are written.
    tables = [
        (
            "coaches",
            coaches_df,
            ["coachID", "year", "tmID", "lgID", "stint"],
        ),
        (
            "players_teams",
            players_teams_df,
            ["playerID", "tmID", "year", "lgID", "stint"],
        ),
        (
            "teams",
            teams_df,
            ["tmID", "year", "lgID", "confID", "franchID", "name", "arena"],
        ),
    ]

    # Create output directory if missing
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for year in sorted(players_teams_df["year"].unique()):
        print(f"Year {year}: ")

        for table_name, table_df, test_cols in tables:
            _write_year_split(table_df, year, test_cols, table_name, out_dir)

        print(f"✔ Saved train/test CSVs for year {year}")

    print("\n🎉 All yearly splits saved successfully!")

In [13]:
# Write the per-year training/test CSVs for all three loaded tables.
split_yearly_train_test(
    coaches_df=coaches,
    players_teams_df=players_teams,
    teams_df=teams,
)

Year 1: 
✔ Saved train/test CSVs for year 1
Year 2: 
✔ Saved train/test CSVs for year 2
Year 3: 
✔ Saved train/test CSVs for year 3
Year 4: 
✔ Saved train/test CSVs for year 4
Year 5: 
✔ Saved train/test CSVs for year 5
Year 6: 
✔ Saved train/test CSVs for year 6
Year 7: 
✔ Saved train/test CSVs for year 7
Year 8: 
✔ Saved train/test CSVs for year 8
Year 9: 
✔ Saved train/test CSVs for year 9
Year 10: 
✔ Saved train/test CSVs for year 10

🎉 All yearly splits saved successfully!
