diff --git a/.github/workflows/export_constants.py b/.github/workflows/export_constants.py new file mode 100644 index 00000000..6421498a --- /dev/null +++ b/.github/workflows/export_constants.py @@ -0,0 +1,22 @@ +import json + +from chebai.preprocessing.reader import ( + CLS_TOKEN, + EMBEDDING_OFFSET, + MASK_TOKEN_INDEX, + PADDING_TOKEN_INDEX, +) + +# Define the constants you want to export +# Any changes in the key names here should also follow the same change in verify_constants.yml code +constants = { + "EMBEDDING_OFFSET": EMBEDDING_OFFSET, + "CLS_TOKEN": CLS_TOKEN, + "PADDING_TOKEN_INDEX": PADDING_TOKEN_INDEX, + "MASK_TOKEN_INDEX": MASK_TOKEN_INDEX, +} + +if __name__ == "__main__": + # Write constants to a JSON file + with open("constants.json", "w") as f: + json.dump(constants, f) diff --git a/.github/workflows/token_consistency.yaml b/.github/workflows/token_consistency.yaml new file mode 100644 index 00000000..06c3a42e --- /dev/null +++ b/.github/workflows/token_consistency.yaml @@ -0,0 +1,128 @@ +name: Check consistency of tokens.txt file + +# Define the file paths under `paths` to trigger this check only when specific files are modified. +# This script will then execute checks only on files that have changed, rather than all files listed in `paths`. 
+ +# **Note** : To add a new token file for checks, include its path in: +# - `on` -> `push` and `pull_request` sections +# - `jobs` -> `check_tokens` -> `steps` -> Set global variable for multiple tokens.txt paths -> `TOKENS_FILES` + +on: + push: + paths: + - "chebai/preprocessing/bin/smiles_token/tokens.txt" + - "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt" + - "chebai/preprocessing/bin/selfies/tokens.txt" + - "chebai/preprocessing/bin/protein_token/tokens.txt" + - "chebai/preprocessing/bin/graph_properties/tokens.txt" + - "chebai/preprocessing/bin/graph/tokens.txt" + - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt" + - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt" + pull_request: + paths: + - "chebai/preprocessing/bin/smiles_token/tokens.txt" + - "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt" + - "chebai/preprocessing/bin/selfies/tokens.txt" + - "chebai/preprocessing/bin/protein_token/tokens.txt" + - "chebai/preprocessing/bin/graph_properties/tokens.txt" + - "chebai/preprocessing/bin/graph/tokens.txt" + - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt" + - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt" + +jobs: + check_tokens: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Get list of changed files + id: changed_files + run: | + git fetch origin dev + + # Get the list of changed files compared to origin/dev and save them to a file + git diff --name-only origin/dev > changed_files.txt + + # Print the names of changed files on separate lines + echo "Changed files:" + while read -r line; do + echo "Changed File name : $line" + done < changed_files.txt + + - name: Set global variable for multiple tokens.txt paths + run: | + # All token files that needs to checked must be included here too, same as in `paths`. 
+ TOKENS_FILES=( + "chebai/preprocessing/bin/smiles_token/tokens.txt" + "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt" + "chebai/preprocessing/bin/selfies/tokens.txt" + "chebai/preprocessing/bin/protein_token/tokens.txt" + "chebai/preprocessing/bin/graph_properties/tokens.txt" + "chebai/preprocessing/bin/graph/tokens.txt" + "chebai/preprocessing/bin/deepsmiles_token/tokens.txt" + "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt" + ) + echo "TOKENS_FILES=${TOKENS_FILES[*]}" >> $GITHUB_ENV + + - name: Process only changed tokens.txt files + run: | + # Convert the TOKENS_FILES environment variable into an array + TOKENS_FILES=(${TOKENS_FILES}) + + # Iterate over each token file path + for TOKENS_FILE_PATH in "${TOKENS_FILES[@]}"; do + # Check if the current token file path is in the list of changed files + if grep -q "$TOKENS_FILE_PATH" changed_files.txt; then + echo "----------------------- Processing $TOKENS_FILE_PATH -----------------------" + + # Get previous tokens.txt version + git fetch origin dev + git diff origin/dev -- $TOKENS_FILE_PATH > tokens_diff.txt || echo "No previous tokens.txt found for $TOKENS_FILE_PATH" + + # Check for deleted or added lines in tokens.txt + if [ -f tokens_diff.txt ]; then + + # Check for deleted lines (lines starting with '-') + deleted_lines=$(grep '^-' tokens_diff.txt | grep -v '^---' | sed 's/^-//' || true) + if [ -n "$deleted_lines" ]; then + echo "Error: Lines have been deleted from $TOKENS_FILE_PATH." 
+ echo -e "Deleted Lines: \n$deleted_lines" + exit 1 + fi + + # Check for added lines (lines starting with '+') + added_lines=$(grep '^+' tokens_diff.txt | grep -v '^+++' | sed 's/^+//' || true) + if [ -n "$added_lines" ]; then + + # Count how many lines have been added + num_added_lines=$(echo "$added_lines" | wc -l) + + # Get last `n` lines (equal to num_added_lines) of tokens.txt + last_lines=$(tail -n "$num_added_lines" $TOKENS_FILE_PATH) + + # Check if the added lines are at the end of the file + if [ "$added_lines" != "$last_lines" ]; then + + # Find lines that were added but not appended at the end of the file + non_appended_lines=$(diff <(echo "$added_lines") <(echo "$last_lines") | grep '^<' | sed 's/^< //') + + echo "Error: New lines have been added to $TOKENS_FILE_PATH, but they are not at the end of the file." + echo -e "Added lines that are not at the end of the file: \n$non_appended_lines" + exit 1 + fi + fi + + if [ "$added_lines" == "" ]; then + echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and no new lines were added." + else + echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and new lines were correctly appended at the end." + fi + else + echo "No previous version of $TOKENS_FILE_PATH found." + fi + else + echo "$TOKENS_FILE_PATH was not changed, skipping." + fi + done diff --git a/.github/workflows/verify_constants.yml b/.github/workflows/verify_constants.yml new file mode 100644 index 00000000..3246f64d --- /dev/null +++ b/.github/workflows/verify_constants.yml @@ -0,0 +1,116 @@ +name: Verify Constants + +# Define the file paths under `paths` to trigger this check only when specific files are modified. +# This script will then execute checks only on files that have changed, rather than all files listed in `paths`. 
+ +# **Note** : To add a new file for checks, include its path in: +# - `on` -> `push` and `pull_request` sections +# - `jobs` -> `verify-constants` -> `steps` -> Verify constants -> Add a new if else for your file, with check logic inside it. + + +on: + push: + paths: + - "chebai/preprocessing/reader.py" + pull_request: + paths: + - "chebai/preprocessing/reader.py" + +jobs: + verify-constants: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [ +# Only use 3.10 as of now +# "3.9", + "3.10", +# "3.11" + ] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set PYTHONPATH + run: echo "PYTHONPATH=$PWD" >> $GITHUB_ENV + + - name: Get list of changed files + id: changed_files + run: | + git fetch origin dev + + # Get the list of changed files compared to origin/dev and save them to a file + git diff --name-only origin/dev > changed_files.txt + + # Print the names of changed files on separate lines + echo "Changed files:" + while read -r line; do + echo "Changed File name : $line" + done < changed_files.txt + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + # Setting a fix version for torch due to an error with latest version (2.5.1) + # ImportError: cannot import name 'T_co' from 'torch.utils.data.dataset' + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade pip setuptools wheel + python -m pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu + python -m pip install -e . 
+ + - name: Export constants + run: python .github/workflows/export_constants.py + + - name: Load constants into environment variables + id: load_constants + # "E_" is appended as suffix to every constant, to protect overwriting other sys env variables with same name + run: | + constants=$(cat constants.json) + echo "$constants" | jq -r 'to_entries|map("E_\(.key)=\(.value|tostring)")|.[]' >> $GITHUB_ENV + + - name: Print all environment variables + run: printenv + + - name: Verify constants + run: | + file_name="chebai/preprocessing/reader.py" + if grep -q "$file_name" changed_files.txt; then + echo "----------------------- Checking file : $file_name ----------------------- " + + # Define expected values for constants + exp_embedding_offset="10" + exp_cls_token="2" + exp_padding_token_index="0" + exp_mask_token_index="1" + + # Debugging output to check environment variables + echo "Current Environment Variables:" + echo "E_EMBEDDING_OFFSET = $E_EMBEDDING_OFFSET" + echo "Expected: $exp_embedding_offset" + + # Verify constants match expected values + if [ "$E_EMBEDDING_OFFSET" != "$exp_embedding_offset" ]; then + echo "EMBEDDING_OFFSET ($E_EMBEDDING_OFFSET) does not match expected value ($exp_embedding_offset)!" + exit 1 + fi + if [ "$E_CLS_TOKEN" != "$exp_cls_token" ]; then + echo "CLS_TOKEN ($E_CLS_TOKEN) does not match expected value ($exp_cls_token)!" + exit 1 + fi + if [ "$E_PADDING_TOKEN_INDEX" != "$exp_padding_token_index" ]; then + echo "PADDING_TOKEN_INDEX ($E_PADDING_TOKEN_INDEX) does not match expected value ($exp_padding_token_index)!" + exit 1 + fi + if [ "$E_MASK_TOKEN_INDEX" != "$exp_mask_token_index" ]; then + echo "MASK_TOKEN_INDEX ($E_MASK_TOKEN_INDEX) does not match expected value ($exp_mask_token_index)!" + exit 1 + fi + else + echo "$file_name not found in changed_files.txt; skipping check." 
+          fi
diff --git a/.isort.cfg b/.isort.cfg
new file mode 100644
index 00000000..88469734
--- /dev/null
+++ b/.isort.cfg
@@ -0,0 +1,2 @@
+[settings]
+known_third_party = Bio,deepsmiles,fastobo,iterstrat,jsonargparse,lightning,lightning_utilities,lnn,matplotlib,model,molecule,networkx,numpy,owlready2,pandas,pyhornedowl,pysmiles,pytorch_lightning,rdkit,requests,scipy,seaborn,selfies,setuptools,sklearn,torch,torch_geometric,torchmetrics,tqdm,transformers,utils,wandb
diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py
index e220e1e4..e1f17fb9 100644
--- a/chebai/preprocessing/reader.py
+++ b/chebai/preprocessing/reader.py
@@ -8,7 +8,7 @@
 
 from chebai.preprocessing.collate import DefaultCollator, RaggedCollator
 
-EMBEDDING_OFFSET = 10
+EMBEDDING_OFFSET = 10
 PADDING_TOKEN_INDEX = 0
 MASK_TOKEN_INDEX = 1
 CLS_TOKEN = 2