In [1]:
import datasets
import numpy as np
import pandas as pd
import os

In [2]:
# Lift pandas' display truncation limits: None means "no limit", so every
# column and every row of a displayed DataFrame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Load the dataset at a fixed revision hash and keep only its "train" split.
dataset = datasets.load_dataset(
    "ReonOhashi/RobocupTrajectoryPrediction_8team",
    revision="939d484d29b31c7f7389e451c830fa98fd942284",
)["train"]

# Hold out 20% of the records; the fixed seed keeps the split reproducible.
# The returned mapping is unpacked in its (train, test) order.
train_raw, test_raw = dataset.train_test_split(test_size=0.2, seed=42).values()

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/36 [00:00<?, ?it/s]

In [4]:
# Number of records in the raw train / test splits, shown as a tuple.
tuple(map(len, (train_raw, test_raw)))

(190079, 47520)

In [None]:
class MinMax:
    """Affine scaler that maps the interval [min, max] onto [0, 1].

    Calling the instance applies ``(x - min) / (max - min)``; ``inverse``
    undoes it. Values outside [min, max] are not clipped, so they map
    outside [0, 1].

    Raises:
        ValueError: if ``min >= max`` at construction time (this includes the
            degenerate ``min == max`` case, which would divide by zero).
    """

    def __init__(self, min, max):
        # Equivalent to rejecting `min >= max`, written from the other side.
        if max <= min:
            raise ValueError("min must be less than max")
        self.min, self.max = min, max

    def __call__(self, x):
        """Scale ``x`` so that ``min`` maps to 0 and ``max`` maps to 1."""
        span = self.max - self.min
        shifted = x - self.min
        return shifted / span

    def inverse(self, x):
        """Map a scaled value back to the original range."""
        span = self.max - self.min
        return x * span + self.min

    def __repr__(self):
        return "MinMax({}, {})".format(self.min, self.max)


def swap_rl(df):
    """Exchange the left and right teams of a frame table, mirroring x.

    The frame is modified in place and also returned:

    * ``l_name`` and ``r_name`` are swapped;
    * ``b_x`` is negated;
    * for players 1..11, ``l{i}_x``/``r{i}_x`` are swapped and negated, and
      ``l{i}_y``/``r{i}_y`` are swapped unchanged.

    NOTE(review): only the columns listed above are touched. If the frames
    carry other side-dependent columns (velocities, angles, scores, ...) they
    are not mirrored here - confirm against the dataset schema.

    Args:
        df: DataFrame holding the columns named above.

    Returns:
        The same ``df`` object, after the swap.
    """
    # Snapshot both sides before overwriting either one. A tuple swap of live
    # column objects is only correct when pandas replaces the column instead
    # of writing into the existing buffer, which is version dependent.
    left_name = df["l_name"].copy()
    right_name = df["r_name"].copy()
    df["l_name"] = right_name
    df["r_name"] = left_name

    df["b_x"] *= -1

    for i in range(1, 12):
        l_x, r_x = f"l{i}_x", f"r{i}_x"
        l_y, r_y = f"l{i}_y", f"r{i}_y"

        # Negation already allocates fresh arrays, so x needs no extra copy.
        new_left_x = -df[r_x].values
        new_right_x = -df[l_x].values
        # .values may be a view of the column; copy so the second assignment
        # cannot observe the first.
        new_left_y = df[r_y].values.copy()
        new_right_y = df[l_y].values.copy()

        df[l_x], df[r_x] = new_left_x, new_right_x
        df[l_y], df[r_y] = new_left_y, new_right_y

    return df

In [None]:
def clean_and_merge_datasets(datas):
    """Turn each record into a DataFrame, filter bad ones, and concatenate.

    A record is discarded when its DataFrame contains any null value or any
    non-finite value in a numeric column. For the records that remain, those
    whose first ``goal_type`` entry is ``"goal_r"`` are passed through
    ``swap_rl``.

    Args:
        datas: iterable of objects accepted by ``pd.DataFrame(...)``; each
            must provide a ``goal_type`` column.

    Returns:
        One DataFrame made by ``pd.concat`` of the kept frames. The per-frame
        indices are not reset, so index labels may repeat.

    Raises:
        ValueError: from ``pd.concat`` when no record survives the filters.
    """
    kept_frames = []
    for record in datas:
        frame = pd.DataFrame(record)
        if not frame.isnull().values.any():
            numeric_values = frame.select_dtypes(include=[np.number]).values
            if np.isfinite(numeric_values).all():
                if frame["goal_type"].iloc[0] == "goal_r":
                    frame = swap_rl(frame)
                kept_frames.append(frame)
    return pd.concat(kept_frames)


def name_onehot(dfs, n_names=10):
    """Append 0/1 indicator columns for the ``l_name`` / ``r_name`` ids.

    For every ``i`` in ``range(n_names)`` two integer columns are added, in
    this order: ``l_name_{i}`` then ``r_name_{i}``. Each holds 1 where the
    corresponding name column equals ``i`` and 0 elsewhere. The frame is
    modified in place and also returned; ``l_name`` / ``r_name`` themselves
    are kept.

    Args:
        dfs: DataFrame with ``l_name`` and ``r_name`` columns that compare
            equal to the integers ``0 .. n_names - 1``.
        n_names: number of distinct ids to encode (default 10, the value that
            was previously hard-coded).

    Returns:
        The same ``dfs`` object with the indicator columns added.
    """
    for i in range(n_names):
        # A single assignment per column: the boolean mask is converted to
        # int directly instead of being stored first and then overwritten.
        dfs[f"l_name_{i}"] = (dfs["l_name"] == i).astype(int)
        dfs[f"r_name_{i}"] = (dfs["r_name"] == i).astype(int)
    return dfs


def drop_unnecessary_columns(dfs):
    """Return a copy of ``dfs`` without the columns listed below.

    The input frame is left untouched; ``DataFrame.drop`` raises ``KeyError``
    if any of the listed columns is absent.
    """
    columns_to_remove = (
        "#",
        "cycle",
        "stopped",
        "playmode",
        "l_name",
        "r_name",
        "goal_type",
        "l_score",
        "r_score",
        "l_pen_score",
        "r_pen_score",
    )
    return dfs.drop(columns=list(columns_to_remove))


def min_max_normalize(dfs, min_max_d=None):
    """Min-max scale every column of ``dfs`` in place.

    Args:
        dfs: DataFrame to scale. Its columns are overwritten with the scaled
            values.
        min_max_d: optional mapping ``column -> scaler`` (a callable such as
            ``MinMax``). When given, these scalers are applied as-is, which
            lets statistics fitted on one frame be reused on another. When
            omitted, one ``MinMax`` per column is fitted from that column's
            own minimum and maximum.

    Returns:
        ``(dfs, min_max_d)``: the same frame object and the scalers used.

    Raises:
        ValueError: from ``MinMax`` when a column being fitted is constant
            (its minimum equals its maximum).
        KeyError: when ``min_max_d`` is given but lacks a column of ``dfs``.
    """
    if min_max_d is None:
        # Series.min()/max() are vectorised; the builtins min()/max() walk the
        # column one Python object at a time, which is very slow on big frames.
        min_max_d = {
            col: MinMax(dfs[col].min(), dfs[col].max()) for col in dfs.columns
        }
    for col in dfs.columns:
        dfs[col] = min_max_d[col](dfs[col])
    return dfs, min_max_d


def revert_min_max_normalize(dfs, min_max_d):
    """Undo per-column scaling in place and return the same frame.

    Every column of ``dfs`` is replaced by ``min_max_d[column].inverse(...)``
    applied to it; a column missing from ``min_max_d`` raises ``KeyError``.
    """
    for column_name in dfs.columns:
        scaler = min_max_d[column_name]
        dfs[column_name] = scaler.inverse(dfs[column_name])
    return dfs


def divide_dataframe(dfs, df_size=50):
    """Cut ``dfs`` into consecutive positional slices of ``df_size`` rows.

    Slices are taken with ``iloc`` in row order; the final slice is shorter
    when ``len(dfs)`` is not a multiple of ``df_size``.

    Returns:
        A list of DataFrame slices (empty when ``dfs`` has no rows).
    """
    chunk_starts = range(0, len(dfs), df_size)
    return [dfs.iloc[start : start + df_size] for start in chunk_starts]


def list_to_numpy(dfs: list) -> tuple[np.ndarray, pd.Index]:
    """Stack equally shaped DataFrames into one float32 array.

    Args:
        dfs: non-empty list of DataFrames. They must all have the same shape
            for the result to be a regular array.

    Returns:
        ``(array, cols)`` where ``array`` has one leading entry per frame
        (shape ``(len(dfs), rows, columns)``) and dtype float32, and ``cols``
        is the column index of the first frame.

    Raises:
        IndexError: if ``dfs`` is empty.
    """
    cols = dfs[0].columns
    # Build the float32 array directly rather than materialising a wider
    # intermediate array and converting it afterwards.
    stacked = np.array([df.values for df in dfs], dtype=np.float32)
    return stacked, cols


def revert_numpy_from_list(dfs: np.ndarray, cols) -> list[pd.DataFrame]:
    """Wrap each entry along the first axis of ``dfs`` in a DataFrame.

    Every resulting frame uses ``cols`` as its column labels.
    """
    frames = []
    for block in dfs:
        frames.append(pd.DataFrame(block, columns=cols))
    return frames


# Training split: clean/merge -> one-hot names -> drop metadata columns ->
# fit and apply min-max scaling. `min_max_d` keeps the fitted per-column
# scalers so the scaling can be inverted later.
train, min_max_d = min_max_normalize(
    drop_unnecessary_columns(name_onehot(clean_and_merge_datasets(train_raw)))
)

# Cut the merged frame into fixed-size windows and stack them as float32.
train, cols = list_to_numpy(divide_dataframe(train))

# Persist the array, its column labels and the fitted scalers.
os.makedirs("datas", exist_ok=True)
np.save("datas/train.npy", train)
np.save("datas/cols.npy", cols)
np.save("datas/min_max_d.npy", min_max_d, allow_pickle=True)

# Read everything back from disk. The scalers dict was stored as a 0-d object
# array, hence .item(). Only load pickled .npy files you created yourself.
train = np.load("datas/train.npy")
cols = np.load("datas/cols.npy", allow_pickle=True)
min_max_d = np.load("datas/min_max_d.npy", allow_pickle=True).item()

In [None]:
# Test split: same cleaning and encoding steps as the training split.
test = test_raw
test = clean_and_merge_datasets(test)
test = name_onehot(test)
test = drop_unnecessary_columns(test)

# Scale with the statistics fitted on the TRAINING split (`min_max_d`) instead
# of re-fitting on the test data. Re-fitting would put train and test on
# different scales and make `min_max_d` unable to invert test-space values.
# Test values outside the training range simply map outside [0, 1].
test = test.apply(lambda column: min_max_d[column.name](column))

test = divide_dataframe(test)

test, _ = list_to_numpy(test)

os.makedirs("datas", exist_ok=True)
np.save("datas/test.npy", test)
test = np.load("datas/test.npy")

KeyboardInterrupt: 