Encapsulate MLOps Stages #44

Merged · 36 commits · Sep 8, 2023
Commits (all by Eve-ning)

Sep 5, 2023:
ced42ed  Create a default .env
0b00661  Create utility file
4e6af8b  Add function to check if all env files are set
76a7a7b  Add function to add or replace env
2fb5644  Replace run.sh with independent entry script
28e599e  Remove pipeline replaced code
1ecfe72  Move all osu-data-docker env vars to local env
b66130a  Remove setting preprocessing vars in pipeline.sh
b7c41ae  Default docker compose to use dot appended compose
559ffb4  Silence any set vars
ca637b0  Make ENV_FILE_PATH compulsory
fa4ece7  Remove redundant script for caching env
bbfc478  Update README.md
896a1a9  Make .env LF
e1aaf83  Remove pipeline cache appending for train
aa9a108  Update pipeline to use simpler train calling syntax
bd60ea9  Update naming
6921435  Create run for train
0c00151  Create run.sh
bae5cd7  Create run.sh
7f29621  Fix unexpected pipeline cache arg
6e98a9e  Update calling syntaxes
a23292d  Fix sed breaking with pathlike forward slashes
e930cd1  Remove old documentation
17bd483  Update doc to env file
de59342  Update env to use sample files

Sep 8, 2023:
e8a058b  Force export entrypoint to make datasets dir
1f1559b  Add temp emb eval
8359114  Update run exec perms
a48d776  Normalize Line Endings for Shell
046ec60  Remove Pipeline Cache check test
91ff3c9  Force all run scripts to cd to script dir
690a9f9  Change calling syntax for runs
6f6cc6b  Lint code
4edc942  Fix incorrect .env pathing
7cf8005  Add help on bad .env path
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
*.sh text eol=lf

8 changes: 0 additions & 8 deletions .github/workflows/pipeline-test.yml
@@ -39,14 +39,6 @@ jobs:
[ -f "$dataset" ] || { echo "Dataset file not found"; exit 1; }
[ -s "$dataset" ] || { echo "Dataset file is empty"; exit 1; }

- name: Check Pipeline Cache exists and is not empty
if: always()
working-directory: src/.pipeline_cache/
run: |
pipeline_cache=cache.env
[ -f "$pipeline_cache" ] || { echo "Pipeline Cache file not found"; exit 1; }
[ -s "$pipeline_cache" ] || { echo "Pipeline Cache file is empty"; exit 1; }

- name: Check Dist exists and is not empty
if: always()
working-directory: src/dist/
34 changes: 34 additions & 0 deletions src/.env
@@ -0,0 +1,34 @@
# DB_URL=https://data.ppy.sh/2023_09_01_performance_mania_top_1000.tar.bz2
# FILES_URL=https://data.ppy.sh/2023_09_01_osu_files.tar.bz2
DB_URL=https://github.com/Eve-ning/opal/raw/master/rsc/sample.tar.bz2
FILES_URL=https://github.com/Eve-ning/opal/raw/master/rsc/sample_files.tar.bz2
MODEL_NAME=2023.9.4
DB_NAME=osu
DB_USERNAME=root
DB_PASSWORD=p@ssw0rd1
DB_HOST=osu.mysql
DB_PORT=3307
SR_MIN=2
SR_MAX=15
ACC_MIN=0.85
ACC_MAX=1.0
MIN_SCORES_PER_MID=0
MIN_SCORES_PER_UID=0
MAX_SVNESS=0.05
MYSQL_PASSWORD=p@ssw0rd1
MYSQL_PORT=3307

OSU_BEATMAP_DIFFICULTY_ATTRIBS=0
OSU_BEATMAP_DIFFICULTY=0
OSU_SCORES=1
OSU_BEATMAP_FAILTIMES=0
OSU_USER_BEATMAP_PLAYCOUNT=0
OSU_BEATMAPS=1
OSU_BEATMAPSETS=1
OSU_USER_STATS=1
SAMPLE_USERS=1
OSU_COUNTS=1
OSU_DIFFICULTY_ATTRIBS=1
OSU_BEATMAP_PERFORMANCE_BLACKLIST=1
DATASET_NAME=sample_dfeb92e3_84947aba.csv
MODEL_PATH=/src/opal/models/2023.9.4/sample_dfeb92e3_84947aba.csv/lightning_logs/version_0/checkpoints/epoch=0-step=1.ckpt
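This committed `src/.env` defaults both archives to the small bundled samples; the commented `data.ppy.sh` lines are the full dumps. A hypothetical way to switch a checkout to the full dumps (assuming GNU sed, and using `|` delimiters so the URL slashes survive, as the "Fix sed breaking with pathlike forward slashes" commit does):

```bash
# Hypothetical: point src/.env at the full 2023-09-01 dumps instead of the samples
sed -i 's|^DB_URL=.*|DB_URL=https://data.ppy.sh/2023_09_01_performance_mania_top_1000.tar.bz2|' src/.env
sed -i 's|^FILES_URL=.*|FILES_URL=https://data.ppy.sh/2023_09_01_osu_files.tar.bz2|' src/.env
```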
21 changes: 21 additions & 0 deletions src/build/run.sh
@@ -0,0 +1,21 @@
cd "$(dirname "$0")" || exit 1

# Check if the .env file path is the first argument and source it
if [ -f "$1" ]; then
ENV_FILE_PATH="$1"
export ENV_FILE_PATH
set -a
source "$ENV_FILE_PATH"
set +a
else
echo "Usage: ./run.sh [ENV_FILE_PATH]"
echo "The ENV_FILE_PATH must be relative to this script."
exit 1
fi

# Load utils for envdotsub and check_env_set
. ../utils.sh
check_env_set docker-compose.yml || exit 1

envdotsub docker-compose.yml
docker compose -f .docker-compose.yml up --build || exit 1
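Each stage's `run.sh` sources `../utils.sh` for `envdotsub` and `check_env_set`, but that file's diff is not shown on this page. A sketch of what it plausibly contains: `envdotsub` is confirmed by the copy deleted from `pipeline.sh` further down, while `check_env_set` is an assumption reconstructed from its call site (fail if any variable referenced in the compose file is unset):

```bash
#!/usr/bin/env bash
# Sketch of src/utils.sh; only envdotsub is confirmed by this diff.

# Substitute exported env vars into a file, writing a dot-prefixed sibling,
# e.g. docker-compose.yml -> .docker-compose.yml
envdotsub() {
  filename=$(basename "$1")
  dir=$(dirname "$1")
  envsubst <"$1" >"$dir/.$filename"
}

# Return non-zero if any ${VAR} referenced in "$1" is unset in the environment.
check_env_set() {
  rc=0
  for var in $(grep -oE '\$\{?[A-Za-z_][A-Za-z_0-9]*\}?' "$1" | tr -d '${}' | sort -u); do
    if [ -z "$(printenv "$var")" ]; then
      echo "Required env var $var (referenced in $1) is not set" >&2
      rc=1
    fi
  done
  return "$rc"
}
```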
70 changes: 70 additions & 0 deletions src/evaluate/embedding_eval.py
@@ -0,0 +1,70 @@
from pathlib import Path

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale

from opal import OpalNet

# %%

df_user = pd.read_csv("users.csv", delimiter="\t")
df_map = pd.read_csv("maps.csv", delimiter="\t")
# %%
ckpt_path = Path(
    "../models/V4/2023_08_01_performance_mania_top_10000_20230819163602.csv"
    "/lightning_logs/version_1/checkpoints/epoch=6-step=56056.ckpt"
)
net = OpalNet.load_from_checkpoint(ckpt_path)

# %%
# Get embedding as array
u_emb_wgt = net.model.u_emb.weight.detach().cpu().numpy()
m_emb_wgt = net.model.m_emb.weight.detach().cpu().numpy()
# %%
pca_components = 6
u_emb_pca = PCA(n_components=pca_components)
u_emb_ld = minmax_scale(u_emb_pca.fit_transform(u_emb_wgt))
explained_u_emb_var = u_emb_pca.explained_variance_ratio_.round(3)
print(f"Explained User Variance: {explained_u_emb_var}")

m_emb_pca = PCA(n_components=pca_components)
m_emb_ld = minmax_scale(m_emb_pca.fit_transform(m_emb_wgt))
explained_m_emb_var = m_emb_pca.explained_variance_ratio_.round(3)
print(f"Explained Map Variance: {explained_m_emb_var}")
# %%
plt.hist(u_emb_wgt[:, 1], bins=100)
plt.show()

# %%
df_uid = (
    pd.concat(
        [
            pd.DataFrame(
                [uid.split("/") for uid in net.uid_le.classes_],
                columns=["user_id", "year"]
            ).astype({'user_id': int}),
            pd.DataFrame(u_emb_ld).reset_index(drop=True).astype(float)
        ],
        # ignore_index=True,
        axis=1
    )
).merge(df_user, on="user_id")
# %%
df_mid = (
    pd.concat(
        [
            pd.DataFrame(
                [mid.split("/") for mid in net.mid_le.classes_],
                columns=["beatmap_id", "speed"]
            ).astype({'beatmap_id': int}),
            pd.DataFrame(m_emb_ld).reset_index(drop=True).astype(float)
        ],
        # ignore_index=True,
        axis=1
    )
).merge(df_map[['beatmap_id', 'filename', 'difficultyrating']], on="beatmap_id")
# %%
pd.DataFrame(u_emb_ld).astype(float)
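With `df_mid` holding the first six PCA components alongside each map's metadata, a natural next cell (not part of this commit; a sketch) is to scatter the first two components coloured by star rating, to eyeball whether the learned embedding orders maps by difficulty:

```python
# Sketch: 2D PCA projection of map embeddings, coloured by star rating.
# Integer columns 0 and 1 are the first two PCA components from m_emb_ld.
plt.scatter(df_mid[0], df_mid[1], c=df_mid["difficultyrating"], s=2, cmap="viridis")
plt.colorbar(label="difficultyrating")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.show()
```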

21 changes: 21 additions & 0 deletions src/evaluate/run.sh
@@ -0,0 +1,21 @@
cd "$(dirname "$0")" || exit 1

# Check if the .env file path is the first argument and source it
if [ -f "$1" ]; then
ENV_FILE_PATH="$1"
export ENV_FILE_PATH
set -a
source "$ENV_FILE_PATH"
set +a
else
echo "Usage: ./run.sh [ENV_FILE_PATH]"
echo "The ENV_FILE_PATH must be relative to this script."
exit 1
fi

# Load utils for envdotsub and check_env_set
. ../utils.sh
check_env_set docker-compose.yml || exit 1

envdotsub docker-compose.yml
docker compose -f .docker-compose.yml up --build || exit 1
16 changes: 10 additions & 6 deletions src/opal/models/README.md
@@ -2,12 +2,16 @@

**See below on how to load the model.**

| Model | R2 | MAE | RMSE | Error Distribution |
|------------|--------|-------|-------|-----------------------------------------------------------------------------------------------------------------------------------|
| V2_2023_01 | 81.48% | 1.18% | 1.71% | |
| V2_2023_04 | 71.88% | 1.14% | 1.68% | |
| V3_2023_05 | 73.76% | 1.09% | 1.62% | |
| V4_2023_08 | 62.07% | 1.10% | 1.64% | ![error](V4/2023_08_01_performance_mania_top_10000_20230819163602.csv/lightning_logs/version_1/evaluation/error_distribution.png) |
| Model | R2 | MAE | RMSE |
|------------|--------|-------|-------|
| V2_2023_01 | 81.48% | 1.18% | 1.71% |
| V2_2023_04 | 71.88% | 1.14% | 1.68% |
| V3_2023_05 | 73.76% | 1.09% | 1.62% |
| V4_2023_08 | 62.07% | 1.10% | 1.64% |
| V4_2023_09 | 60.17% | 1.15% | 1.71% |

**Latest Error Distribution**
![](2023.9.4/2023_09_01_performance_mania_top_10000_20230904212037.csv/lightning_logs/version_20/evaluation/error_distribution.png)

## Limitations

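The README's "See below on how to load the model" points past the truncated portion of this diff. Consistent with `evaluate/embedding_eval.py` above, loading likely amounts to the following sketch (checkpoint path taken from the committed `src/.env`; `/src/...` is the in-container path, not the README's actual snippet):

```python
from pathlib import Path

from opal import OpalNet

# MODEL_PATH as recorded in src/.env for the sample pipeline run
ckpt_path = Path(
    "/src/opal/models/2023.9.4/sample_dfeb92e3_84947aba.csv"
    "/lightning_logs/version_0/checkpoints/epoch=0-step=1.ckpt"
)
net = OpalNet.load_from_checkpoint(ckpt_path)
```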
15 changes: 3 additions & 12 deletions src/opal/train.py
@@ -11,7 +11,7 @@
from opal.score_datamodule import ScoreDataModule


def train(model_name: str, dataset_path: Path, pipeline_run_cache: Path = None):
def train(model_name: str, dataset_path: Path):
""" Trains the OpalNet

Args:
@@ -63,12 +63,6 @@ def train(model_name: str, dataset_path: Path, pipeline_run_cache: Path = None):

    trainer.fit(net, datamodule=dm)

    # This is only used for pipeline runs
    if pipeline_run_cache:
        model_path = Path(trainer.checkpoint_callback.best_model_path)
        with open(pipeline_run_cache, 'a') as f:
            f.write(f'MODEL_PATH={model_path}\n')


if __name__ == '__main__':
    torch.set_float32_matmul_precision('high')
@@ -78,18 +72,15 @@ def train(model_name: str, dataset_path: Path, pipeline_run_cache: Path = None):
                        help='Experiment Name Tag. Can be a tag used before, this will append to the experiment dir.')
    parser.add_argument('--dataset_name', type=str,
                        help='Dataset Name, must be in ../datasets/<DATASET_NAME>')
    parser.add_argument('--pipeline_run_cache', type=str,
                        help='Path to the pipeline run cache file. Optional, used for pipeline runs.')
    args = parser.parse_args()

    if not (args.pipeline_run_cache or args.model_name or args.dataset_name):
    if not (args.model_name or args.dataset_name):
        parser.print_help()
        sys.exit(1)

    MODEL_NAME = args.model_name
    DATASET_PATH = DATASET_DIR / args.dataset_name
    PIPELINE_RUN_CACHE = Path(args.pipeline_run_cache) if args.pipeline_run_cache else None
    assert MODEL_NAME, "Model Name must be provided."
    assert DATASET_PATH.exists(), f"Dataset {DATASET_PATH.as_posix()} does not exist."

    train(MODEL_NAME, DATASET_PATH, PIPELINE_RUN_CACHE)
    train(MODEL_NAME, DATASET_PATH)
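With the pipeline-cache plumbing gone, training takes just the two flags. A hypothetical direct invocation, using the defaults recorded in `src/.env`:

```bash
# The dataset must already exist in src/datasets/ (the preprocess stage's output).
python src/opal/train.py \
  --model_name 2023.9.4 \
  --dataset_name sample_dfeb92e3_84947aba.csv
```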
135 changes: 7 additions & 128 deletions src/pipeline.sh
@@ -1,11 +1,6 @@
#!/usr/bin/env bash

# This script runs the entire pipeline, from preprocessing to publishing.
#
# Usage: ./pipeline.sh [PIPELINE_RUN_ID]
#
# The PIPELINE_RUN_ID is a unique identifier for this pipeline run.
# If not specified, it will be set to the current unix timestamp.

# Dev Info
# On the Docker Compose Substitution:
@@ -16,129 +11,13 @@
# Change directory to current script directory
cd "$(dirname "$(realpath "$0")")" || exit 1

# Substitute environment variables in a file
envdotsub() {
  filename=$(basename "$1")
  dir=$(dirname "$1")
  dotfile=".$filename"
  dotfilepath="$dir/$dotfile"
  envsubst <"$1" >"$dotfilepath"
}

# Preprocesses the Dataset.
# Sets the DATASET_NAME variable in the pipeline run cache.
preprocess() {
  envdotsub preprocess/docker-compose.yml
  sed -i 's|osu-data-docker/docker-compose.yml|osu-data-docker/.docker-compose.yml|g' \
    preprocess/.docker-compose.yml || exit 1
  envdotsub preprocess/osu-data-docker/docker-compose.yml

  # Without -d, this script will hang until the docker compose process is killed
  # To also include compose stop, we'll peek at the dataset file and wait for it to be created
  docker compose \
    --profile files \
    -f preprocess/.docker-compose.yml \
    up --build -d >preprocess.log 2>&1 &

  while [ ! -f "./datasets/$DATASET_NAME" ]; do
    echo "Waiting for dataset to be created... (Showing most recent log)"
    tail -n 3 preprocess.log
    sleep 10
  done

  docker compose \
    --profile files \
    -f preprocess/.docker-compose.yml \
    stop || exit 1

  source "$PIPELINE_RUN_CACHE"
  if [ -z "$DATASET_NAME" ]; then
    echo "DATASET_NAME not returned by preprocess"
    exit 1
  fi
}

# Sets the DATASET_NAME variable in the env file.
(preprocess/run.sh ../.env) || exit 1
# Trains the Model.
# Sets the MODEL_PATH variable in the pipeline run cache.
train() {
  envdotsub train/docker-compose.yml
  docker compose \
    -f train/.docker-compose.yml \
    up --build || exit 1

  source "$PIPELINE_RUN_CACHE"
  if [ -z "$MODEL_PATH" ]; then
    echo "MODEL_PATH not returned by train"
    exit 1
  fi
}

# Sets the MODEL_PATH variable in the env file.
(train/run.sh ../.env) || exit 1
# Evaluates the Model.
evaluate() {
  echo "Evaluating Model"
  envdotsub evaluate/docker-compose.yml
  docker compose \
    -f evaluate/.docker-compose.yml \
    up --build || exit 1
}

# Publishes the Model via PyPI.
publish() {
  echo "Publishing Model"
  envdotsub build/docker-compose.yml
  docker compose \
    -f build/.docker-compose.yml \
    up --build || exit 1
}

make_pipeline_cache() {
  # Create unique pipeline run id
  PIPELINE_RUN_CACHE=.pipeline_cache/${1:-$(date +%s)}.env
  mkdir -p .pipeline_cache
  mkdir -p datasets
  if [ -f "$PIPELINE_RUN_CACHE" ]; then
    echo "Pipeline run cache ${PIPELINE_RUN_CACHE} already exists"
    exit 1
  fi
}

load_env() {
  # Set default values for variables
  export DB_URL=https://github.com/Eve-ning/opal/raw/master/rsc/sample.tar.bz2
  export FILES_URL=https://github.com/Eve-ning/opal/raw/master/rsc/sample_files.tar.bz2
  cat <<EOF >>"$PIPELINE_RUN_CACHE"
PIPELINE_RUN_CACHE="$PIPELINE_RUN_CACHE"
DB_URL="$DB_URL"
FILES_URL="$FILES_URL"
FILES_DIR="/var/lib/osu/osu.files/$(basename "$FILES_URL" .tar.bz2)/"
MODEL_NAME="2023.9.4"
DATASET_NAME="$(basename "$DB_URL" .tar.bz2)_$(date +"%Y%m%d%H%M%S").csv"
DB_NAME="osu"
DB_USERNAME="root"
DB_PASSWORD="p@ssw0rd1"
DB_HOST="osu.mysql"
DB_PORT="3307"
SR_MIN="2"
SR_MAX="15"
ACC_MIN="0.85"
ACC_MAX="1.0"
MIN_SCORES_PER_MID="0"
MIN_SCORES_PER_UID="0"
MAX_SVNESS="0.05"
EOF
  # Source and Export variables
  set -a
  source "$PIPELINE_RUN_CACHE"
  source preprocess/osu-data-docker/.env
  set +a
}

make_pipeline_cache "$1" || exit 1
load_env || exit 1
preprocess || exit 1
train || exit 1
set -a
source "$PIPELINE_RUN_CACHE"
set +a
evaluate || exit 1
publish || exit 1
(evaluate/run.sh ../.env) || exit 1
# Builds & publishes the Model via PyPI.
(build/run.sh ../.env) || exit 1
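Since stages now hand off state only through the shared env file (preprocess writes `DATASET_NAME`, train writes `MODEL_PATH`), any single stage can be re-run in isolation. For example, to redo only training:

```bash
# train/run.sh cd's to its own directory, so ../.env resolves to src/.env
# no matter where this is called from.
cd src || exit 1
train/run.sh ../.env || exit 1
```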
1 change: 1 addition & 0 deletions src/preprocess/4_entrypoint.sh
Expand Up @@ -3,6 +3,7 @@
# Ensure all variables are set
: "${DATASET_NAME:?DATASET_NAME not set}"

mkdir -p ../datasets/
mysql -h osu.mysql -P 3307 -u root -pp@ssw0rd1 -D osu < \
./4_export.sql | \
sed 's/\t/,/g' >../datasets/"${DATASET_NAME}"
2 changes: 1 addition & 1 deletion src/preprocess/docker-compose.yml
@@ -1,6 +1,6 @@
version: "2.20.2"
include:
- osu-data-docker/docker-compose.yml
- osu-data-docker/.docker-compose.yml

services:
1.preprocess: