### Clean Data by excluding certain plays

In [29]:
#Imports
import pandas as pd
from pathlib import Path
import os
import numpy as np


In [30]:
# Read in the play-level supplementary data from <working directory>/data
projectRoot = Path.cwd()
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning on load.
dfPlays = pd.read_csv(projectRoot / "data" / "supplementary_data.csv", low_memory=False)

  dfPlays = pd.read_csv(f'{projectRoot}/data/supplementary_data.csv')


In [31]:
# Drop penalty plays: keep only the rows whose penalty_yards value is missing
dfPlaysWithoutPen = dfPlays.loc[lambda plays: plays["penalty_yards"].isna()]
dfPlaysWithoutPen.shape[0]

17549

In [32]:
# Write the penalty-free plays to cleanedData/, creating the folder if needed
Path("cleanedData").mkdir(parents=True, exist_ok=True)
dfPlaysWithoutPen.to_csv(Path("cleanedData", "dfPlaysWithoutPen.csv"), index=False)

### Clean coordinate data

In [33]:
# Tracking-data folders under the current working directory:
# dataPath is read from, savePath is written to.
dataPath, savePath = (f"{Path.cwd()}/{folder}/train" for folder in ("data", "cleanedData"))

In [34]:
def rotateLeft180(df,
                  xCol="x", yCol="y",
                  dirCol="dir", oCol="o",
                  playDirectionCol="play_direction",
                  fieldLength=120, fieldWidth=53.3):
    """Return a copy of ``df`` with every "left" play rotated 180 degrees.

    Rows whose ``playDirectionCol`` value equals "left" (compared
    case-insensitively) are rotated about the centre of a
    ``fieldLength`` x ``fieldWidth`` rectangle; all other rows are untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``xCol``, ``yCol`` and ``playDirectionCol``.
    xCol, yCol : str
        Position columns; rotated rows become ``fieldLength - x`` and
        ``fieldWidth - y``.
    dirCol, oCol : str
        Angle columns in degrees. Each is optional: when present, rotated rows
        become ``(angle + 180) % 360``; when absent, it is skipped.
    playDirectionCol : str
        Column holding the play direction. Rotated rows are relabelled "right".
    fieldLength, fieldWidth : number
        Extent of the coordinate system along x and y (defaults 120 and 53.3).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is not modified.

    Raises
    ------
    KeyError
        If ``xCol``, ``yCol`` or ``playDirectionCol`` is missing from ``df``.
    """
    df = df.copy()

    # Case-insensitive match, the same rule orientSeason uses for its flip mask,
    # so a value such as "Left" is rotated here as well.
    flip = df[playDirectionCol].astype(str).str.lower().eq("left")

    # positions (rotate about the centre of the field)
    df.loc[flip, xCol] = fieldLength - df.loc[flip, xCol]
    df.loc[flip, yCol] = fieldWidth - df.loc[flip, yCol]

    # angles: a half turn adds 180 degrees, wrapped back into [0, 360)
    for angleCol in (dirCol, oCol):
        if angleCol in df.columns:
            df.loc[flip, angleCol] = (df.loc[flip, angleCol] + 180) % 360

    df.loc[flip, playDirectionCol] = "right"
    return df

In [35]:
def orientSeason(inputDir=".", outputDir="cleanedData",
                 inPattern="input_2023_w{week:02d}.csv",
                 outPattern="input_oriented_2023_w{week:02d}.csv",
                 xCol="x", yCol="y", dirCol="dir", oCol="o",
                 playDirectionCol="play_direction",
                 otherInPattern="output_2023_w{week:02d}.csv",
                 otherOutPattern="output_oriented_2023_w{week:02d}.csv",
                 ballLandXCol="ball_land_x",
                 ballLandYCol="ball_land_y"):
    """Rotate every "left" play to "right" in the weekly CSVs for weeks 1-18.

    For each week the main file (``inPattern``) is read from ``inputDir``,
    rows whose ``playDirectionCol`` is "left" (case-insensitive) are rotated
    180 degrees with ``rotateLeft180``, and the result is written to
    ``outputDir`` under ``outPattern``. The ball-landing columns, when present,
    are rotated for the same rows.

    The companion file (``otherInPattern``), when it exists, is rotated for the
    same (game_id, play_id) pairs and written under ``otherOutPattern`` with
    ``playDirectionCol`` set to "right" on every row. It is written even when
    the week has no left plays, so the output set has no gaps.

    Parameters
    ----------
    inputDir, outputDir : str or Path
        Source and destination folders; ``outputDir`` is created if missing.
    inPattern, outPattern, otherInPattern, otherOutPattern : str
        File-name templates formatted with ``week``.
    xCol, yCol, dirCol, oCol, playDirectionCol : str
        Column names forwarded to ``rotateLeft180``.
    ballLandXCol, ballLandYCol : str
        Optional ball-landing coordinate columns of the main file.

    Returns
    -------
    None
        Results are written to disk and progress is printed per week.

    Raises
    ------
    KeyError
        If a file that is read lacks the ``game_id``/``play_id`` columns, or a
        file that is rotated lacks ``xCol``/``yCol``.
    """
    inputDir = Path(inputDir)
    outputDir = Path(outputDir)
    outputDir.mkdir(parents=True, exist_ok=True)

    keyCols = ["game_id", "play_id"]
    # Field extent used for the ball-landing rotation (same values rotateLeft180 defaults to).
    fieldLength, fieldWidth = 120, 53.3

    for week in range(1, 19):
        inPath = inputDir / inPattern.format(week=week)
        if not inPath.exists():
            print(f"[w{week:02d}] SKIP: {inPath} not found.")
            continue

        print(f"[w{week:02d}] Reading {inPath.name} ...")
        df = pd.read_csv(inPath)

        # rows to flip from the main file (left plays)
        hasDirection = playDirectionCol in df.columns
        if hasDirection:
            flipMask = df[playDirectionCol].astype(str).str.lower().eq("left")
        else:
            flipMask = pd.Series(False, index=df.index)
        toFlip = df.loc[flipMask, keyCols].drop_duplicates()
        flipKeys = set(toFlip.itertuples(index=False, name=None))

        if flipMask.any():
            # ball landing coords exist only in the main file; rotate them here
            if ballLandXCol in df.columns:
                df.loc[flipMask, ballLandXCol] = fieldLength - pd.to_numeric(df.loc[flipMask, ballLandXCol])
            if ballLandYCol in df.columns:
                df.loc[flipMask, ballLandYCol] = fieldWidth - pd.to_numeric(df.loc[flipMask, ballLandYCol])
            # Normalise spelling ("Left", "LEFT", ...) so the rows selected by the
            # case-insensitive mask are exactly the rows rotateLeft180 rotates.
            df.loc[flipMask, playDirectionCol] = "left"

        # rotate left -> right in the main file
        beforeLeft = flipMask.sum()
        if hasDirection:
            dfOut = rotateLeft180(df, xCol=xCol, yCol=yCol, dirCol=dirCol, oCol=oCol,
                                  playDirectionCol=playDirectionCol)
            afterLeft = dfOut[playDirectionCol].astype(str).str.lower().eq("left").sum()
        else:
            # No direction column: nothing can be identified as a left play, so
            # write the file through unrotated instead of raising KeyError.
            dfOut = df
            afterLeft = 0

        outPath = outputDir / outPattern.format(week=week)  # e.g., oriented_2023_w06.csv
        dfOut.to_csv(outPath, index=False)
        print(f"[w{week:02d}] Wrote {outPath.name} | left->right rows fixed: {beforeLeft} | remaining left: {afterLeft}")

        # --- Apply same flips to the OTHER file if it exists ---
        otherInPath = inputDir / otherInPattern.format(week=week)
        if not otherInPath.exists():
            print(f"[w{week:02d}] (other) SKIP: {otherInPath} not found.")
            continue

        if flipKeys:
            print(f"[w{week:02d}] Also orienting {otherInPath.name} using toFlip pairs ...")
        else:
            print(f"[w{week:02d}] (other) No left plays to flip for this week.")
        dfOther = pd.read_csv(otherInPath)

        # mark the rows belonging to a flipped (game_id, play_id) pair
        otherMask = pd.Series(
            [pair in flipKeys for pair in zip(dfOther["game_id"], dfOther["play_id"])],
            index=dfOther.index, dtype=bool,
        )

        # synthesize/override play_direction based on toFlip pairs
        if playDirectionCol in dfOther.columns:
            tempCol = None
            usePlayDirCol = playDirectionCol
        else:
            tempCol = f"__{playDirectionCol}__"
            dfOther[tempCol] = "right"
            usePlayDirCol = tempCol
        dfOther.loc[otherMask, usePlayDirCol] = "left"

        # rotate using the column we just prepared
        dfOtherOut = rotateLeft180(dfOther, xCol=xCol, yCol=yCol, dirCol=dirCol, oCol=oCol,
                                   playDirectionCol=usePlayDirCol)

        # every row is oriented right in the output
        dfOtherOut[playDirectionCol] = "right"

        # drop the temp column if we created one
        if tempCol is not None and tempCol in dfOtherOut.columns:
            dfOtherOut = dfOtherOut.drop(columns=[tempCol])

        otherOutPath = outputDir / otherOutPattern.format(week=week)  # e.g., output_oriented_2023_w06.csv
        dfOtherOut.to_csv(otherOutPath, index=False)
        print(f"[w{week:02d}] Wrote {otherOutPath.name} | used {len(flipKeys)} (game_id, play_id) pairs to rotate.")

In [36]:
# Orient all weekly tracking files from dataPath into savePath.
# Column names are left at orientSeason's defaults (x, y, dir, o, play_direction).
orientSeason(
    inputDir=dataPath,
    outputDir=savePath,
    inPattern="input_2023_w{week:02d}.csv",
    outPattern="oriented_2023_w{week:02d}.csv",
)

[w01] Reading input_2023_w01.csv ...
[w01] Wrote oriented_2023_w01.csv | left->right rows fixed: 133495 | remaining left: 0
[w01] Also orienting output_2023_w01.csv using toFlip pairs ...
[w01] Wrote output_oriented_2023_w01.csv | used 388 (game_id, play_id) pairs to rotate.
[w02] Reading input_2023_w02.csv ...
[w02] Wrote oriented_2023_w02.csv | left->right rows fixed: 129919 | remaining left: 0
[w02] Also orienting output_2023_w02.csv using toFlip pairs ...
[w02] Wrote output_oriented_2023_w02.csv | used 393 (game_id, play_id) pairs to rotate.
[w03] Reading input_2023_w03.csv ...
[w03] Wrote oriented_2023_w03.csv | left->right rows fixed: 161661 | remaining left: 0
[w03] Also orienting output_2023_w03.csv using toFlip pairs ...
[w03] Wrote output_oriented_2023_w03.csv | used 486 (game_id, play_id) pairs to rotate.
[w04] Reading input_2023_w04.csv ...
[w04] Wrote oriented_2023_w04.csv | left->right rows fixed: 117773 | remaining left: 0
[w04] Also orienting output_2023_w04.csv using t