
Commit ab6b1b2

🚧 fix ci runs (mamba and numpy related) (#81)
* 🚧 switch mamba installation - see if snakemake envs are somehow cached
* 🐛 specify python version, move ls
* 🚧 deactivate some workflows, run relative ls command
* try not to cache
* 🚧 test using venv created by codespace with python 3.12 - might be that I need to create it (not sure what changed in runner configurations)
* try to use full snakemake installation
* 🚧 use miniconda for pypi installation test
* try miniconda again - snakemake environment has its own mamba installation - auto-activate environment "test"
* install build dependencies, fix ubuntu first
* 🐛 try to pin mamba below 2.0 (snakemake/snakemake#3108)
* "test" should be activated by default
* 🚧 conda env not activated...
* 🐛 pip does not install in environment
* 🚧 experiment
* 🐛 shell was not initiated
* 🐛 test installing njab separately
* 🐛 order matters!
* try again with new order, add umap-learn explicitly
* 🐛 do not re-install njab
* restrict scipy (trapz missing in lifelines) - latest scipy not supported by lifelines
* 🐛 exclude numpy 2.0 for now
* numpy try two
* swap numpy and njab, adapt other pkgs to what they were before
* add back umap-learn, relax constraints
* 🚧 in-package single requirement: single packages cannot be specified to just ignore their dependencies.
* ➖ remove scipy dependency - leave it to njab to install dependencies in a second step.
* ⬆️ remove support for python 3.8 (end-of-life)
* 🎨 setuptools_scm uses tags to determine version, add tags
* 🐛 tags not fetched without entire history (see actions/checkout#1471)
* 🎨 clean up workflow file
* ✨ add njab after update to requirements - enable more workflows again (using mamba-constrained snakemake environment)
* 🔥 remove comments, ⏪ add back tests
* 🐛 make order explicit (by feature frequency, or bin and bin count)
* 🐛 fix order of example more explicitly.
* 🐛 actually test latest version of pimms, remove comments
* 🐛 runs natively in colab without issues
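The scipy pin mentioned above ("trapz missing in lifelines") traces back to SciPy dropping its deprecated `trapz` alias in favor of `trapezoid`, which older lifelines releases still import. A minimal sketch (not code from this repo) of the kind of import guard that makes such a pin unnecessary:

# Sketch only, not from this repo: tolerate SciPy versions with and
# without the deprecated `trapz` alias (removed in newer SciPy releases).
import numpy as np

try:
    from scipy.integrate import trapezoid  # modern SciPy
except ImportError:
    from scipy.integrate import trapz as trapezoid  # older SciPy

x = np.linspace(0.0, 1.0, 101)
print(trapezoid(x ** 2, x))  # ~1/3, same result on either SciPy version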
1 parent 6f391c0 commit ab6b1b2

File tree: 13 files changed, +96 −290 lines

.github/workflows/ci.yaml

Lines changed: 14 additions & 20 deletions
@@ -21,35 +21,26 @@ jobs:
           "macos-13",
           # "windows-latest" # rrcovNA cannot be build from source on windows-server
         ]
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
       - name: Checkout
         uses: actions/checkout@v4
       - name: Set up Miniconda
-        # ! change action https://github.com/mamba-org/setup-micromamba
         uses: conda-incubator/setup-miniconda@v3
         with:
-          miniforge-variant: Mambaforge
-          # miniforge-version: latest
-          use-mamba: true
-          channel-priority: disabled
           python-version: ${{ matrix.python-version }}
+          channel-priority: strict
           environment-file: snakemake_env.yml
           activate-environment: snakemake
           auto-activate-base: true
-          # auto-update-conda: true
+          auto-update-conda: true
       - name: inspect-conda-environment
         run: |
           conda info
           conda list
           conda env export --from-history --no-builds > environment.yml
           conda env export --no-builds
           conda env export --no-builds > environment_w_versions.yml
-      # - name: test-r-kernel-imports
-      #   run: |
-      #     Rscript -e "library(stringi)"
-      #     Rscript -e "library(stringr)"
-      #     Rscript -e "library(reshape2)"
       - name: Dry-Run demo workflow (integration test)
         run: |
           cd project
@@ -75,8 +66,8 @@ jobs:
           name: ${{ matrix.os }}-${{ matrix.python-version }}-example-workflow-results
           path: |
             project/runs/example/
-            environment.yml
-            environment_w_versions.yml
+            snakemake_env
+            project/.snakemake/conda/

   run-unit-local-pip-installation:
     runs-on: ${{ matrix.os }}
@@ -85,25 +76,28 @@ jobs:
       fail-fast: false
       matrix:
         os: ["ubuntu-latest", "macos-latest", "windows-latest"]
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-tags: true
+          fetch-depth: 0

       - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}

       - name: install pimms
-        run: python -m pip install .
-
+        run: pip install .
+
       - name: Install pytest
-        run: python -m pip install pytest pytest-cov
+        run: pip install pytest pytest-cov

       - name: Run pytest
         run: pytest .

       - name: Install papermill
-        run: python -m pip install papermill ipykernel
+        run: pip install papermill ipykernel

       - name: View papermill help message for notebooks (as scripts)
         run: |
@@ -141,4 +135,4 @@ jobs:
       - uses: pypa/gh-action-pypi-publish@release/v1
         with:
           user: __token__
-          password: ${{ secrets.PYPI_API_TOKEN }}
+          password: ${{ secrets.PYPI_API_TOKEN }}
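The new `fetch-tags: true` / `fetch-depth: 0` checkout settings above relate to the setuptools_scm bullets in the commit message: setuptools_scm reads git tags to determine the package version, and a shallow clone without tags (see actions/checkout#1471) makes it fall back to a default development version. A quick local check, assuming setuptools_scm is installed:

# Sketch: show which version setuptools_scm derives for the current checkout.
# Assumes a clone with full history and tags (fetch-depth: 0, fetch-tags: true);
# on a shallow, tagless clone this falls back to a 0.1.devN-style default.
from setuptools_scm import get_version

print(get_version())  # e.g. a tag-based version, or a .devN+g<sha> variant between tags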

.github/workflows/ci_workflow.yaml

Lines changed: 3 additions & 4 deletions
@@ -1,4 +1,4 @@
-name: run workflow with conda envs
+name: run workflow (v1) with conda envs
 on:
   push:
     branches: [main, dev]
@@ -31,13 +31,12 @@ jobs:
         # ! change action https://github.com/mamba-org/setup-micromamba
         uses: conda-incubator/setup-miniconda@v3
         with:
-          miniforge-variant: Mambaforge
-          use-mamba: true
-          channel-priority: disabled
+          channel-priority: strict
           python-version: ${{ matrix.python-version }}
           environment-file: snakemake_env.yml
           activate-environment: snakemake
           auto-activate-base: true
+          auto-update-conda: true
       - name: inspect-conda-environment
         run: |
           conda info

.github/workflows/test_pkg_on_colab.yaml

Lines changed: 3 additions & 2 deletions
@@ -20,11 +20,12 @@ jobs:
       - name: Install pimms-learn (from branch) and papermill
         if: github.event_name == 'pull_request'
         run: |
-          python3 -m pip install pimms-learn papermill
+          pip install .
+          pip install papermill
       - name: Install pimms-learn (from PyPI) and papermill
         if: github.event_name == 'schedule'
         run: |
-          python3 -m pip install pimms-learn papermill
+          pip install pimms-learn papermill
       - name: Run tutorial
         run: |
           cd project

.github/workflows/workflow_website.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-name: Build workflow website on public Alzheimer dataset (for protein groups)
+name: Build workflow (v2) website on public Alzheimer dataset (for protein groups)
 on:
   pull_request:
     branches: [main, dev]
@@ -73,4 +73,4 @@ jobs:
         uses: peaceiris/actions-gh-pages@v4
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          publish_dir: project/runs/alzheimer_study/_build/
+          publish_dir: project/runs/alzheimer_study/_build/

.readthedocs.yaml

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.8"
+    python: "3.10"
   # You can also specify other tool versions:
   # nodejs: "19"
   # rust: "1.64"
@@ -32,4 +32,4 @@ python:
   - method: pip
     path: .
     extra_requirements:
-      - docs
+      - docs

environment.yml

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ channels:
   - plotly
   # - defaults
 dependencies:
-  - python>=3.8,<=3.12
+  - python>=3.9,<=3.12
   - numpy
   - pandas>=1
   - scipy>=1.6

pimmslearn/imputation.py

Lines changed: 3 additions & 159 deletions
@@ -5,165 +5,18 @@

 """
-from typing import Tuple, Dict
-from sklearn.neighbors import NearestNeighbors
-import scipy
+import logging
+from typing import Dict, Tuple
+
 import numpy as np
 import pandas as pd
-import logging

 logger = logging.getLogger(__name__)


 RANDOMSEED = 123


-def impute_missing(protein_values, mean=None, std=None):
-    """
-    Imputation is based on the mean and standard deviation
-    from the protein_values.
-    If mean and standard deviation (std) are given,
-    missing values are imputed and protein_values are returned imputed.
-    If no mean and std are given, the mean and std are computed from
-    the non-missing protein_values.
-
-    Parameters
-    ----------
-    protein_values: Iterable
-    mean: float
-    std: float
-
-    Returns
-    ------
-    protein_values: pandas.Series
-    """
-    raise NotImplementedError('Will be the main function combining features')
-    # clip by zero?
-
-
-def _select_data(data: pd.DataFrame, threshold: float):
-    """Select (protein-) columns for imputation.
-
-    Based on the threshold representing the minimum proportion of available
-    data per protein, the columns of a `pandas.DataFrame` are selected.
-
-    Parameters
-    ----------
-    data: pandas.DataFrame
-    threshold: float
-        Threshold of percentage of non-missing values to select a column/feature.
-    """
-    columns_to_impute = data.notnull().mean() >= threshold
-    return columns_to_impute
-
-
-def _sparse_coo_array(data: pd.DataFrame):
-    """Return a sparse scipy matrix from dense `pandas.DataFrame` with many
-    missing values.
-    """
-    indices = np.nonzero(~np.isnan(data.to_numpy()))
-    data_selected_sparse = data.to_numpy()
-    data_selected_sparse = scipy.sparse.coo_matrix(
-        (data_selected_sparse[indices], indices),
-        shape=data_selected_sparse.shape)
-    return data_selected_sparse
-
-
-def _get_weighted_mean(distances, data):
-    """Compute weighted mean ignoring
-    identical entries"""
-    mask = distances > 0.0
-    weights = distances[mask] / distances[mask].sum()
-    weighted_sum = data.loc[mask].mul(weights, axis=0)
-    mean_imputed = weighted_sum.sum() / sum(mask)
-    return mean_imputed
-
-
-# define imputation methods
-# could be done in PCA transformed space
-def imputation_KNN(data, alone=True, threshold=0.5):
-    """
-
-    Parameters
-    ----------
-    data: pandas.DataFrame
-    alone: bool  # is not used
-    threshold: float
-        Threshold of missing data by column in interval (0, 1)
-    """
-    mask_selected = _select_data(data=data, threshold=threshold)
-    data_selected = data.loc[:, mask_selected].copy()
-    data_selected_sparse = _sparse_coo_array(data_selected)
-    # impute
-    knn_fitted = NearestNeighbors(n_neighbors=3, algorithm='brute').fit(
-        data_selected_sparse)
-    fit_distances, fit_neighbors = knn_fitted.kneighbors(data_selected_sparse)
-    for i, (distances, ids) in enumerate(zip(fit_distances, fit_neighbors)):
-        mean_imputed = _get_weighted_mean(distances, data_selected.loc[ids])
-        if all(distances == 0.0):
-            logger.warning(f"Did not find any neighbor for int-id: {i}")
-        else:
-            assert i == ids[distances == 0.0], (
-                "None or more then one identical data points "
-                "for ids: {}".format(ids[distances == 0.0])
-            )
-        mask = data_selected.iloc[i].isna()
-        data_selected.loc[i, mask] = mean_imputed.loc[mask]  # SettingWithCopyError
-
-    data.update(data_selected)
-    return data
-
-
-def imputation_normal_distribution(log_intensities: pd.Series,
-                                   mean_shift=1.8,
-                                   std_shrinkage=0.3,
-                                   copy=True):
-    """Impute missing log-transformed intensity values of a single feature.
-    Samples one value for imputation for all samples.
-
-    Parameters
-    ----------
-    log_intensities: pd.Series
-        Series of normally distributed values of a single feature (for all samples/runs).
-        Here usually log-transformed intensities.
-    mean_shift: integer, float
-        Shift the mean of the log_intensities by factors of their standard
-        deviation to the negative.
-    std_shrinkage: float
-        Value greater than zero by which to shrink (or inflate) the
-        standard deviation of the log_intensities.
-    """
-    np.random.seed(RANDOMSEED)
-    if not isinstance(log_intensities, pd.Series):
-        try:
-            log_intensities.Series(log_intensities)
-            logger.warning("Series created of Iterable.")
-        except BaseException:
-            raise ValueError(
-                "Plese provided data which is a pandas.Series or an Iterable")
-    if mean_shift < 0:
-        raise ValueError(
-            "Please specify a positive float as the std.-dev. is non-negative.")
-    if std_shrinkage <= 0:
-        raise ValueError(
-            "Please specify a positive float as shrinkage factor for std.-dev.")
-    if std_shrinkage >= 1:
-        logger.warning("Standard Deviation will increase for imputed values.")
-
-    mean = log_intensities.mean()
-    std = log_intensities.std()
-
-    mean_shifted = mean - (std * mean_shift)
-    std_shrinked = std * std_shrinkage
-
-    if copy:
-        log_intensities = log_intensities.copy(deep=True)
-
-    return log_intensities.where(log_intensities.notna(),
-                                 np.random.normal(mean_shifted, std_shrinked))
-
-
 def impute_shifted_normal(df_wide: pd.DataFrame,
                           mean_shift: float = 1.8,
                           std_shrinkage: float = 0.3,
@@ -224,15 +77,6 @@ def impute_shifted_normal(df_wide: pd.DataFrame,
     return imputed_shifted_normal


-def imputation_mixed_norm_KNN(data):
-    # impute columns with less than 50% missing values with KNN
-    data = imputation_KNN(data, alone=False)  # ToDo: Alone is not used.
-    # impute remaining columns based on the distribution of the protein
-    data = imputation_normal_distribution(
-        data, mean_shift=1.8, std_shrinkage=0.3)
-    return data
-
-
 def compute_moments_shift(observed: pd.Series, imputed: pd.Series,
                           names: Tuple[str, str] = ('observed', 'imputed')) -> Dict[str, float]:
     """Summary of overall shift of mean and std. dev. of predictions for a imputation method."""

pimmslearn/pandas/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,8 @@
 import omegaconf
 import pandas as pd

-from pimmslearn.pandas.calc_errors import calc_errors_per_feat, get_absolute_error
+from pimmslearn.pandas.calc_errors import (calc_errors_per_feat,
+                                           get_absolute_error)

 __all__ = [
     'calc_errors_per_feat',

project/workflow/envs/pimms.yaml

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ channels:
   - plotly
   # - defaults
 dependencies:
-  - python>=3.8,<=3.12
+  - python>=3.9,<=3.12
   - numpy
   - pandas>=1
   - scipy>=1.6
