Encapsulate MLOps Stages #44

Merged · 36 commits · Sep 8, 2023
Commits (all by Eve-ning)

Sep 5, 2023:
ced42ed  Create a default .env
0b00661  Create utility file
4e6af8b  Add function to check if all env files are set
76a7a7b  Add function to add or replace env
2fb5644  Replace run.sh with independent entry script
28e599e  Remove pipeline replaced code
1ecfe72  Move all osu-data-docker env vars to local env
b66130a  Remove setting preprocessing vars in pipeline.sh
b7c41ae  Default docker compose to use dot appended compose
559ffb4  Silence any set vars
ca637b0  Make ENV_FILE_PATH compulsory
fa4ece7  Remove redundant script for caching env
bbfc478  Update README.md
896a1a9  Make .env LF
e1aaf83  Remove pipeline cache appending for train
aa9a108  Update pipeline to use simpler train calling syntax
bd60ea9  Update naming
6921435  Create run for train
0c00151  Create run.sh
bae5cd7  Create run.sh
7f29621  Fix unexpected pipeline cache arg
6e98a9e  Update calling syntaxes
a23292d  Fix sed breaking with pathlike forward slashes
e930cd1  Remove old documentation
17bd483  Update doc to env file
de59342  Update env to use sample files

Sep 8, 2023:
e8a058b  Force export entrypoint to make datasets dir
1f1559b  Add temp emb eval
8359114  Update run exec perms
a48d776  Normalize Line Endings for Shell
046ec60  Remove Pipeline Cache check test
91ff3c9  Force all run scripts to cd to script dir
690a9f9  Change calling syntax for runs
6f6cc6b  Lint code
4edc942  Fix incorrect .env pathing
7cf8005  Add help on bad .env path
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
*.sh text eol=lf

8 changes: 0 additions & 8 deletions .github/workflows/pipeline-test.yml
@@ -39,14 +39,6 @@ jobs:
[ -f "$dataset" ] || { echo "Dataset file not found"; exit 1; }
[ -s "$dataset" ] || { echo "Dataset file is empty"; exit 1; }

- name: Check Pipeline Cache exists and is not empty
if: always()
working-directory: src/.pipeline_cache/
run: |
pipeline_cache=cache.env
[ -f "$pipeline_cache" ] || { echo "Pipeline Cache file not found"; exit 1; }
[ -s "$pipeline_cache" ] || { echo "Pipeline Cache file is empty"; exit 1; }

- name: Check Dist exists and is not empty
if: always()
working-directory: src/dist/
34 changes: 34 additions & 0 deletions src/.env
@@ -0,0 +1,34 @@
# DB_URL=https://data.ppy.sh/2023_09_01_performance_mania_top_1000.tar.bz2
# FILES_URL=https://data.ppy.sh/2023_09_01_osu_files.tar.bz2
DB_URL=https://github.com/Eve-ning/opal/raw/master/rsc/sample.tar.bz2
FILES_URL=https://github.com/Eve-ning/opal/raw/master/rsc/sample_files.tar.bz2
MODEL_NAME=2023.9.4
DB_NAME=osu
DB_USERNAME=root
DB_PASSWORD=p@ssw0rd1
DB_HOST=osu.mysql
DB_PORT=3307
SR_MIN=2
SR_MAX=15
ACC_MIN=0.85
ACC_MAX=1.0
MIN_SCORES_PER_MID=0
MIN_SCORES_PER_UID=0
MAX_SVNESS=0.05
MYSQL_PASSWORD=p@ssw0rd1
MYSQL_PORT=3307

OSU_BEATMAP_DIFFICULTY_ATTRIBS=0
OSU_BEATMAP_DIFFICULTY=0
OSU_SCORES=1
OSU_BEATMAP_FAILTIMES=0
OSU_USER_BEATMAP_PLAYCOUNT=0
OSU_BEATMAPS=1
OSU_BEATMAPSETS=1
OSU_USER_STATS=1
SAMPLE_USERS=1
OSU_COUNTS=1
OSU_DIFFICULTY_ATTRIBS=1
OSU_BEATMAP_PERFORMANCE_BLACKLIST=1
DATASET_NAME=sample_dfeb92e3_84947aba.csv
MODEL_PATH=/src/opal/models/2023.9.4/sample_dfeb92e3_84947aba.csv/lightning_logs/version_0/checkpoints/epoch=0-step=1.ckpt
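This committed `src/.env` defaults both archives to the small bundled samples; the commented `data.ppy.sh` lines are the full dumps. A hypothetical way to switch a checkout to the full dumps (assuming GNU sed, and using `|` delimiters so the URL slashes survive, as the "Fix sed breaking with pathlike forward slashes" commit does):

```bash
# Hypothetical: point src/.env at the full 2023-09-01 dumps instead of the samples
sed -i 's|^DB_URL=.*|DB_URL=https://data.ppy.sh/2023_09_01_performance_mania_top_1000.tar.bz2|' src/.env
sed -i 's|^FILES_URL=.*|FILES_URL=https://data.ppy.sh/2023_09_01_osu_files.tar.bz2|' src/.env
```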
21 changes: 21 additions & 0 deletions src/build/run.sh
@@ -0,0 +1,21 @@
cd "$(dirname "$0")" || exit 1

# Check if the .env file path is the first argument and source it
if [ -f "$1" ]; then
ENV_FILE_PATH="$1"
export ENV_FILE_PATH
set -a
source "$ENV_FILE_PATH"
set +a
else
echo "Usage: ./run.sh [ENV_FILE_PATH]"
echo "The ENV_FILE_PATH must be relative to this script."
exit 1
fi

# Load utils for envdotsub and check_env_set
. ../utils.sh
check_env_set docker-compose.yml || exit 1

envdotsub docker-compose.yml
docker compose -f .docker-compose.yml up --build || exit 1
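Each stage's `run.sh` sources `../utils.sh` for `envdotsub` and `check_env_set`, but that file's diff is not shown on this page. A sketch of what it plausibly contains: `envdotsub` is confirmed by the copy deleted from `pipeline.sh` further down, while `check_env_set` is an assumption reconstructed from its call site (fail if any variable referenced in the compose file is unset):

```bash
#!/usr/bin/env bash
# Sketch of src/utils.sh; only envdotsub is confirmed by this diff.

# Substitute exported env vars into a file, writing a dot-prefixed sibling,
# e.g. docker-compose.yml -> .docker-compose.yml
envdotsub() {
  filename=$(basename "$1")
  dir=$(dirname "$1")
  envsubst <"$1" >"$dir/.$filename"
}

# Return non-zero if any ${VAR} referenced in "$1" is unset in the environment.
check_env_set() {
  rc=0
  for var in $(grep -oE '\$\{?[A-Za-z_][A-Za-z_0-9]*\}?' "$1" | tr -d '${}' | sort -u); do
    if [ -z "$(printenv "$var")" ]; then
      echo "Required env var $var (referenced in $1) is not set" >&2
      rc=1
    fi
  done
  return "$rc"
}
```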
70 changes: 70 additions & 0 deletions src/evaluate/embedding_eval.py
@@ -0,0 +1,70 @@
from pathlib import Path

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale

from opal import OpalNet

# %%

df_user = pd.read_csv("users.csv", delimiter="\t")
df_map = pd.read_csv("maps.csv", delimiter="\t")
# %%
ckpt_path = Path(
    "../models/V4/2023_08_01_performance_mania_top_10000_20230819163602.csv"
    "/lightning_logs/version_1/checkpoints/epoch=6-step=56056.ckpt"
)
net = OpalNet.load_from_checkpoint(ckpt_path)

# %%
# Get embedding as array
u_emb_wgt = net.model.u_emb.weight.detach().cpu().numpy()
m_emb_wgt = net.model.m_emb.weight.detach().cpu().numpy()
# %%
pca_components = 6
u_emb_pca = PCA(n_components=pca_components)
u_emb_ld = minmax_scale(u_emb_pca.fit_transform(u_emb_wgt))
explained_u_emb_var = u_emb_pca.explained_variance_ratio_.round(3)
print(f"Explained User Variance: {explained_u_emb_var}")

m_emb_pca = PCA(n_components=pca_components)
m_emb_ld = minmax_scale(m_emb_pca.fit_transform(m_emb_wgt))
explained_m_emb_var = m_emb_pca.explained_variance_ratio_.round(3)
print(f"Explained Map Variance: {explained_m_emb_var}")
# %%
plt.hist(u_emb_wgt[:, 1], bins=100)
plt.show()

# %%
df_uid = (
    pd.concat(
        [
            pd.DataFrame(
                [uid.split("/") for uid in net.uid_le.classes_],
                columns=["user_id", "year"]
            ).astype({'user_id': int}),
            pd.DataFrame(u_emb_ld).reset_index(drop=True).astype(float)
        ],
        # ignore_index=True,
        axis=1
    )
).merge(df_user, on="user_id")
# %%
df_mid = (
    pd.concat(
        [
            pd.DataFrame(
                [mid.split("/") for mid in net.mid_le.classes_],
                columns=["beatmap_id", "speed"]
            ).astype({'beatmap_id': int}),
            pd.DataFrame(m_emb_ld).reset_index(drop=True).astype(float)
        ],
        # ignore_index=True,
        axis=1
    )
).merge(df_map[['beatmap_id', 'filename', 'difficultyrating']], on="beatmap_id")
# %%
pd.DataFrame(u_emb_ld).astype(float)
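With `df_mid` holding the first six PCA components alongside each map's metadata, a natural next cell (not part of this commit; a sketch) is to scatter the first two components coloured by star rating, to eyeball whether the learned embedding orders maps by difficulty:

```python
# Sketch: 2D PCA projection of map embeddings, coloured by star rating.
# Integer columns 0 and 1 are the first two PCA components from m_emb_ld.
plt.scatter(df_mid[0], df_mid[1], c=df_mid["difficultyrating"], s=2, cmap="viridis")
plt.colorbar(label="difficultyrating")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.show()
```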

21 changes: 21 additions & 0 deletions src/evaluate/run.sh
@@ -0,0 +1,21 @@
cd "$(dirname "$0")" || exit 1

# Check if the .env file path is the first argument and source it
if [ -f "$1" ]; then
ENV_FILE_PATH="$1"
export ENV_FILE_PATH
set -a
source "$ENV_FILE_PATH"
set +a
else
echo "Usage: ./run.sh [ENV_FILE_PATH]"
echo "The ENV_FILE_PATH must be relative to this script."
exit 1
fi

# Load utils for envdotsub and check_env_set
. ../utils.sh
check_env_set docker-compose.yml || exit 1

envdotsub docker-compose.yml
docker compose -f .docker-compose.yml up --build || exit 1
16 changes: 10 additions & 6 deletions src/opal/models/README.md
@@ -2,12 +2,16 @@

**See below on how to load the model.**

| Model | R2 | MAE | RMSE | Error Distribution |
|------------|--------|-------|-------|-----------------------------------------------------------------------------------------------------------------------------------|
| V2_2023_01 | 81.48% | 1.18% | 1.71% | |
| V2_2023_04 | 71.88% | 1.14% | 1.68% | |
| V3_2023_05 | 73.76% | 1.09% | 1.62% | |
| V4_2023_08 | 62.07% | 1.10% | 1.64% | ![error](V4/2023_08_01_performance_mania_top_10000_20230819163602.csv/lightning_logs/version_1/evaluation/error_distribution.png) |
| Model | R2 | MAE | RMSE |
|------------|--------|-------|-------|
| V2_2023_01 | 81.48% | 1.18% | 1.71% |
| V2_2023_04 | 71.88% | 1.14% | 1.68% |
| V3_2023_05 | 73.76% | 1.09% | 1.62% |
| V4_2023_08 | 62.07% | 1.10% | 1.64% |
| V4_2023_09 | 60.17% | 1.15% | 1.71% |

**Latest Error Distribution**
![](2023.9.4/2023_09_01_performance_mania_top_10000_20230904212037.csv/lightning_logs/version_20/evaluation/error_distribution.png)

## Limitations

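The README's "See below on how to load the model" points past the truncated portion of this diff. Consistent with `evaluate/embedding_eval.py` above, loading likely amounts to the following sketch (checkpoint path taken from the committed `src/.env`; `/src/...` is the in-container path, not the README's actual snippet):

```python
from pathlib import Path

from opal import OpalNet

# MODEL_PATH as recorded in src/.env for the sample pipeline run
ckpt_path = Path(
    "/src/opal/models/2023.9.4/sample_dfeb92e3_84947aba.csv"
    "/lightning_logs/version_0/checkpoints/epoch=0-step=1.ckpt"
)
net = OpalNet.load_from_checkpoint(ckpt_path)
```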
15 changes: 3 additions & 12 deletions src/opal/train.py
@@ -11,7 +11,7 @@
from opal.score_datamodule import ScoreDataModule


def train(model_name: str, dataset_path: Path, pipeline_run_cache: Path = None):
def train(model_name: str, dataset_path: Path):
""" Trains the OpalNet

Args:
@@ -63,12 +63,6 @@ def train(model_name: str, dataset_path: Path, pipeline_run_cache: Path = None):

    trainer.fit(net, datamodule=dm)

    # This is only used for pipeline runs
    if pipeline_run_cache:
        model_path = Path(trainer.checkpoint_callback.best_model_path)
        with open(pipeline_run_cache, 'a') as f:
            f.write(f'MODEL_PATH={model_path}\n')


if __name__ == '__main__':
    torch.set_float32_matmul_precision('high')
@@ -78,18 +72,15 @@ def train(model_name: str, dataset_path: Path, pipeline_run_cache: Path = None):
                        help='Experiment Name Tag. Can be a tag used before, this will append to the experiment dir.')
    parser.add_argument('--dataset_name', type=str,
                        help='Dataset Name, must be in ../datasets/<DATASET_NAME>')
    parser.add_argument('--pipeline_run_cache', type=str,
                        help='Path to the pipeline run cache file. Optional, used for pipeline runs.')
    args = parser.parse_args()

    if not (args.pipeline_run_cache or args.model_name or args.dataset_name):
    if not (args.model_name or args.dataset_name):
        parser.print_help()
        sys.exit(1)

    MODEL_NAME = args.model_name
    DATASET_PATH = DATASET_DIR / args.dataset_name
    PIPELINE_RUN_CACHE = Path(args.pipeline_run_cache) if args.pipeline_run_cache else None
    assert MODEL_NAME, "Model Name must be provided."
    assert DATASET_PATH.exists(), f"Dataset {DATASET_PATH.as_posix()} does not exist."

    train(MODEL_NAME, DATASET_PATH, PIPELINE_RUN_CACHE)
    train(MODEL_NAME, DATASET_PATH)
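With the pipeline-cache plumbing gone, training takes just the two flags. A hypothetical direct invocation, using the defaults recorded in `src/.env`:

```bash
# The dataset must already exist in src/datasets/ (the preprocess stage's output).
python src/opal/train.py \
  --model_name 2023.9.4 \
  --dataset_name sample_dfeb92e3_84947aba.csv
```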
135 changes: 7 additions & 128 deletions src/pipeline.sh
@@ -1,11 +1,6 @@
#!/usr/bin/env bash

# This script runs the entire pipeline, from preprocessing to publishing.
#
# Usage: ./pipeline.sh [PIPELINE_RUN_ID]
#
# The PIPELINE_RUN_ID is a unique identifier for this pipeline run.
# If not specified, it will be set to the current unix timestamp.

# Dev Info
# On the Docker Compose Substitution:
@@ -16,129 +11,13 @@
# Change directory to current script directory
cd "$(dirname "$(realpath "$0")")" || exit 1

# Substitute environment variables in a file
envdotsub() {
  filename=$(basename "$1")
  dir=$(dirname "$1")
  dotfile=".$filename"
  dotfilepath="$dir/$dotfile"
  envsubst <"$1" >"$dotfilepath"
}

# Preprocesses the Dataset.
# Sets the DATASET_NAME variable in the pipeline run cache.
preprocess() {
  envdotsub preprocess/docker-compose.yml
  sed -i 's|osu-data-docker/docker-compose.yml|osu-data-docker/.docker-compose.yml|g' \
    preprocess/.docker-compose.yml || exit 1
  envdotsub preprocess/osu-data-docker/docker-compose.yml

  # Without -d, this script will hang until the docker compose process is killed
  # To also include compose stop, we'll peek at the dataset file and wait for it to be created
  docker compose \
    --profile files \
    -f preprocess/.docker-compose.yml \
    up --build -d >preprocess.log 2>&1 &

  while [ ! -f "./datasets/$DATASET_NAME" ]; do
    echo "Waiting for dataset to be created... (Showing most recent log)"
    tail -n 3 preprocess.log
    sleep 10
  done

  docker compose \
    --profile files \
    -f preprocess/.docker-compose.yml \
    stop || exit 1

  source "$PIPELINE_RUN_CACHE"
  if [ -z "$DATASET_NAME" ]; then
    echo "DATASET_NAME not returned by preprocess"
    exit 1
  fi
}

# Sets the DATASET_NAME variable in the env file.
(preprocess/run.sh ../.env) || exit 1
# Trains the Model.
# Sets the MODEL_PATH variable in the pipeline run cache.
train() {
  envdotsub train/docker-compose.yml
  docker compose \
    -f train/.docker-compose.yml \
    up --build || exit 1

  source "$PIPELINE_RUN_CACHE"
  if [ -z "$MODEL_PATH" ]; then
    echo "MODEL_PATH not returned by train"
    exit 1
  fi
}

# Sets the MODEL_PATH variable in the env file.
(train/run.sh ../.env) || exit 1
# Evaluates the Model.
evaluate() {
  echo "Evaluating Model"
  envdotsub evaluate/docker-compose.yml
  docker compose \
    -f evaluate/.docker-compose.yml \
    up --build || exit 1
}

# Publishes the Model via PyPI.
publish() {
  echo "Publishing Model"
  envdotsub build/docker-compose.yml
  docker compose \
    -f build/.docker-compose.yml \
    up --build || exit 1
}

make_pipeline_cache() {
  # Create unique pipeline run id
  PIPELINE_RUN_CACHE=.pipeline_cache/${1:-$(date +%s)}.env
  mkdir -p .pipeline_cache
  mkdir -p datasets
  if [ -f "$PIPELINE_RUN_CACHE" ]; then
    echo "Pipeline run cache ${PIPELINE_RUN_CACHE} already exists"
    exit 1
  fi
}

load_env() {
  # Set default values for variables
  export DB_URL=https://github.com/Eve-ning/opal/raw/master/rsc/sample.tar.bz2
  export FILES_URL=https://github.com/Eve-ning/opal/raw/master/rsc/sample_files.tar.bz2
  cat <<EOF >>"$PIPELINE_RUN_CACHE"
PIPELINE_RUN_CACHE="$PIPELINE_RUN_CACHE"
DB_URL="$DB_URL"
FILES_URL="$FILES_URL"
FILES_DIR="/var/lib/osu/osu.files/$(basename "$FILES_URL" .tar.bz2)/"
MODEL_NAME="2023.9.4"
DATASET_NAME="$(basename "$DB_URL" .tar.bz2)_$(date +"%Y%m%d%H%M%S").csv"
DB_NAME="osu"
DB_USERNAME="root"
DB_PASSWORD="p@ssw0rd1"
DB_HOST="osu.mysql"
DB_PORT="3307"
SR_MIN="2"
SR_MAX="15"
ACC_MIN="0.85"
ACC_MAX="1.0"
MIN_SCORES_PER_MID="0"
MIN_SCORES_PER_UID="0"
MAX_SVNESS="0.05"
EOF
  # Source and Export variables
  set -a
  source "$PIPELINE_RUN_CACHE"
  source preprocess/osu-data-docker/.env
  set +a
}

make_pipeline_cache "$1" || exit 1
load_env || exit 1
preprocess || exit 1
train || exit 1
set -a
source "$PIPELINE_RUN_CACHE"
set +a
evaluate || exit 1
publish || exit 1
(evaluate/run.sh ../.env) || exit 1
# Builds & publishes the Model via PyPI.
(build/run.sh ../.env) || exit 1
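Since stages now hand off state only through the shared env file (preprocess writes `DATASET_NAME`, train writes `MODEL_PATH`), any single stage can be re-run in isolation. For example, to redo only training:

```bash
# train/run.sh cd's to its own directory, so ../.env resolves to src/.env
# no matter where this is called from.
cd src || exit 1
train/run.sh ../.env || exit 1
```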
1 change: 1 addition & 0 deletions src/preprocess/4_entrypoint.sh
Expand Up @@ -3,6 +3,7 @@
# Ensure all variables are set
: "${DATASET_NAME:?DATASET_NAME not set}"

mkdir -p ../datasets/
mysql -h osu.mysql -P 3307 -u root -pp@ssw0rd1 -D osu < \
./4_export.sql | \
sed 's/\t/,/g' >../datasets/"${DATASET_NAME}"
2 changes: 1 addition & 1 deletion src/preprocess/docker-compose.yml
@@ -1,6 +1,6 @@
version: "2.20.2"
include:
- osu-data-docker/docker-compose.yml
- osu-data-docker/.docker-compose.yml

services:
1.preprocess: