Merge pull request #28 from joshuawe/pipeline
Add Pipeline - get metrics with one line of code
joshuawe committed Dec 27, 2023
2 parents a1caedd + 37d51a3 commit 7bc618b
Showing 12 changed files with 878 additions and 193 deletions.
8 changes: 8 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,8 @@
{
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.analysis.typeCheckingMode": "basic"
}
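These workspace settings point VS Code's test explorer at the `tests` folder via pytest and enable basic type checking. Outside the editor, a roughly equivalent invocation (a sketch, assuming pytest is installed in the active environment) is:

```python
# Roughly what the VS Code test runner does with these settings
# (sketch; assumes pytest is installed in the active environment).
import pytest

exit_code = pytest.main(["tests"])  # discover and run tests under tests/
```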
42 changes: 18 additions & 24 deletions README.md
@@ -43,7 +43,7 @@ Furthermore, this library presents other useful visualizations, such as **compar
- Classification Report
- Confusion Matrix
- ROC curve (AUROC)
- y_prob histogram
- y_score histogram

- *multi-class classifier*

@@ -61,7 +61,7 @@ Furthermore, this library presents other useful visualizations, such as **compar

| <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/roc_curve_bootstrap.png?raw=true" width="300" alt="Your Image"> | <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/pr_curve.png?raw=true" width="300" alt="Your Image"> | <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/y_prob_histogram.png?raw=true" width="300" alt="Your Image"> |
|:--------------------------------------------------:|:----------------------------------------------------------:|:-------------------------------------------------:|
| ROC Curve (AUROC) with bootstrapping | Precision-Recall Curve | y_prob histogram |
| ROC Curve (AUROC) with bootstrapping | Precision-Recall Curve | y_score histogram |


| <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/multiclass/histogram_4_classes.png?raw=true" width="300" alt="Your Image"> | <img src="https://github.com/joshuawe/plots_and_graphs/blob/main/images/multiclass/roc_curves_multiclass.png?raw=true" width="300" alt=""> | <img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7" width="300" height="300" alt=""> |
@@ -95,34 +95,28 @@ pip install -e .

# Usage

Example usage of results from a binary classifier for a calibration curve.
Get all classification metrics with **ONE** line of code. Here, for a binary classifier:

```python
import matplotlib.pyplot as plt
import numpy as np
import plotsandgraphs as pandg
# ...
pandg.pipeline.binary_classifier(y_true, y_score)
```

# create some predictions of a hypothetical binary classifier
n_samples = 1000
y_true = np.random.choice([0,1], n_samples, p=[0.4, 0.6]) # the true class labels 0 or 1, with class imbalance 40:60

y_prob = np.zeros(y_true.shape) # a model's probability of class 1 predictions
y_prob[y_true==1] = np.random.beta(1, 0.6, y_prob[y_true==1].shape)
y_prob[y_true==0] = np.random.beta(0.5, 1, y_prob[y_true==0].shape)

# show prob distribution
fig_hist = pandg.binary_classifier.plot_y_prob_histogram(y_prob, y_true, save_fig_path=None)

# create calibration curve
fig_auroc = pandg.binary_classifier.plot_calibration_curve(y_prob, y_true, save_fig_path=None)
Or with some more configs:
```Python
configs = {
'roc': {'n_bootstraps': 10000},
'pr': {'figsize': (8,10)}
}
pandg.pipeline.binary_classifier(y_true, y_score, save_fig_path='results/metrics', file_type='png', plot_kwargs=configs)
```
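Presumably, each key in `plot_kwargs` names one of the generated plots (`'roc'`, `'pr'`, …) and its value is forwarded as keyword arguments to the corresponding plotting function, so options like `n_bootstraps` or `figsize` can be tuned per plot without calling the underlying functions directly.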

For multiclass classification:

# --- OPTIONAL: Customize figure ---
# get axis of figure and change title
axes = fig_auroc.get_axes()
ax0 = axes[0]
ax0.set_title('New Title for Calibration Plot')
fig_auroc.show()
```Python
# with multiclass data y_true (one-hot encoded) and y_score
pandg.pipeline.multiclass_classifier(y_true, y_score)
```
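A minimal sketch of preparing inputs for this call (the shapes are an assumption inferred from the comment above: one-hot `y_true` and per-class probabilities `y_score`, both `(n_samples, n_classes)`):

```python
import numpy as np
import plotsandgraphs as pandg

# Hypothetical synthetic data for a 4-class problem
rng = np.random.default_rng(0)
n_samples, n_classes = 1000, 4

labels = rng.integers(0, n_classes, size=n_samples)  # integer class labels
y_true = np.eye(n_classes)[labels]                   # one-hot encode -> (n_samples, n_classes)

logits = rng.normal(size=(n_samples, n_classes))
logits[np.arange(n_samples), labels] += 2.0          # bias scores toward the true class
y_score = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # row-wise softmax

pandg.pipeline.multiclass_classifier(y_true, y_score)
```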

# Requirements
331 changes: 331 additions & 0 deletions notebooks/pipeline.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions plotsandgraphs/__init__.py
@@ -1,3 +1,4 @@
from . import binary_classifier
from . import compare_distributions
from . import multiclass_classifier
from . import pipeline
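This line is what makes `pandg.pipeline.binary_classifier(...)` from the README resolvable: importing the submodule in `__init__.py` exposes it as an attribute of the top-level `plotsandgraphs` package.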
63 changes: 44 additions & 19 deletions plotsandgraphs/binary_classifier.py
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import Optional
from typing import Optional, Any, Union
import matplotlib.pyplot as plt
from matplotlib.colors import to_rgba
from matplotlib.figure import Figure
@@ -45,7 +45,9 @@ def plot_accuracy(y_true, y_pred, name="", save_fig_path=None) -> Figure:
return fig


def plot_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, save_fig_path=None) -> Figure:
def plot_confusion_matrix(
y_true: np.ndarray, y_pred: np.ndarray, save_fig_path=None
) -> Figure:
import matplotlib.colors as colors

# Compute the confusion matrix
@@ -54,7 +56,9 @@ def plot_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, save_fig_path=
cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

# Create the ConfusionMatrixDisplay instance and plot it
cmd = ConfusionMatrixDisplay(cm, display_labels=["class 0\nnegative", "class 1\npositive"])
cmd = ConfusionMatrixDisplay(
cm, display_labels=["class 0\nnegative", "class 1\npositive"]
)
fig, ax = plt.subplots(figsize=(4, 4))
cmd.plot(
cmap="YlOrRd",
@@ -144,6 +148,11 @@ def plot_classification_report(
ax : Matplotlib.pyplot.Axe
Axe object from matplotlib
"""
print(
"Warning: plot_classification_report is not experiencing a bug and is, hence, currently skipped."
)
return

import matplotlib as mpl
import matplotlib.colors as colors
import seaborn as sns
@@ -153,7 +162,11 @@
cmap = "YlOrRd"

clf_report = classification_report(y_true, y_pred, output_dict=True, **kwargs)
keys_to_plot = [key for key in clf_report.keys() if key not in ("accuracy", "macro avg", "weighted avg")]
keys_to_plot = [
key
for key in clf_report.keys()
if key not in ("accuracy", "macro avg", "weighted avg")
]
df = pd.DataFrame(clf_report, columns=keys_to_plot).T
# the following line ensures that dataframe are sorted from the majority classes to the minority classes
df.sort_values(by=["support"], inplace=True)
@@ -322,7 +335,9 @@ def plot_roc_curve(
auc_upper = np.quantile(bootstrap_aucs, CI_upper)
auc_lower = np.quantile(bootstrap_aucs, CI_lower)
label = f"{confidence_interval:.0%} CI: [{auc_lower:.2f}, {auc_upper:.2f}]"
plt.fill_between(base_fpr, tprs_lower, tprs_upper, alpha=0.3, label=label, zorder=2)
plt.fill_between(
base_fpr, tprs_lower, tprs_upper, alpha=0.3, label=label, zorder=2
)

if highlight_roc_area is True:
print(
@@ -354,16 +369,18 @@ def plot_roc_curve(
return fig


def plot_calibration_curve(y_prob: np.ndarray, y_true: np.ndarray, n_bins=10, save_fig_path=None) -> Figure:
def plot_calibration_curve(
y_true: np.ndarray, y_score: np.ndarray, n_bins=10, save_fig_path=None
) -> Figure:
"""
Creates calibration plot for a binary classifier and calculates the ECE.
Parameters
----------
y_prob : np.ndarray
The output probabilities of the classifier. Between 0 and 1.
y_true : np.ndarray
The actual labels of the data. Either 0 or 1.
y_score : np.ndarray
The output probabilities of the classifier. Between 0 and 1.
n_bins : int
The number of bins to use for the calibration curve.
save_fig_path : str, optional
@@ -376,13 +393,15 @@ def plot_calibration_curve(y_prob: np.ndarray, y_true: np.ndarray, n_bins=10, sa
ece : float
The expected calibration error.
"""
prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy="uniform")
prob_true, prob_pred = calibration_curve(
y_true, y_score, n_bins=n_bins, strategy="uniform"
)

# Find the number of samples in each bin
bin_counts = np.histogram(y_prob, bins=n_bins, range=(0, 1))[0]
bin_counts = np.histogram(y_score, bins=n_bins, range=(0, 1))[0]

# Calculate the weighted absolute difference (ECE)
ece = np.abs(prob_pred - prob_true) * (bin_counts / len(y_prob))
ece = np.abs(prob_pred - prob_true) * (bin_counts / len(y_score))
ece = ece.sum().round(2)
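# (Equivalently: ECE = sum over bins of (n_b / N) * |acc_b - conf_b|, i.e. the
# count-weighted mean absolute gap between the observed frequency of positives
# and the mean predicted probability per bin.)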

fig = plt.figure(figsize=(5, 5))
@@ -449,16 +468,18 @@ def plot_calibration_curve(y_prob: np.ndarray, y_true: np.ndarray, n_bins=10, sa
return fig


def plot_y_prob_histogram(y_prob: np.ndarray, y_true: Optional[np.ndarray] = None, save_fig_path=None) -> Figure:
def plot_y_score_histogram(
y_true: Union[np.ndarray[Any, Any], None], y_score: np.ndarray[Any, Any], save_fig_path=None
) -> Figure:
"""
Provides a histogram for the predicted probabilities of a binary classifier. If ```y_true``` is provided, it divides the ```y_prob``` values into the two classes and plots them jointly into the same plot with different colors.
Provides a histogram for the predicted probabilities of a binary classifier. If ```y_true``` is provided, it divides the ```y_score``` values into the two classes and plots them jointly into the same plot with different colors.
Parameters
----------
y_prob : np.ndarray
The output probabilities of the classifier. Between 0 and 1.
y_true : Optional[np.ndarray], optional
The true class labels, by default None
y_score : np.ndarray
The output probabilities of the classifier. Between 0 and 1.
save_fig_path : _type_, optional
Path where to save figure, by default None
@@ -471,13 +492,15 @@ def plot_y_prob_histogram(y_prob: np.ndarray, y_true: Optional[np.ndarray] = Non
ax = fig.add_subplot(111)

if y_true is None:
ax.hist(y_prob, bins=10, alpha=0.9, edgecolor="midnightblue", linewidth=2, rwidth=1)
ax.hist(
y_score, bins=10, alpha=0.9, edgecolor="midnightblue", linewidth=2, rwidth=1
)
# same histogram as above, but with border lines
# ax.hist(y_prob, bins=10, alpha=0.5, edgecolor='black', linewidth=1.2)
else:
alpha = 0.6
ax.hist(
y_prob[y_true == 0],
y_score[y_true == 0],
bins=10,
alpha=alpha,
edgecolor="midnightblue",
@@ -486,7 +509,7 @@ def plot_y_prob_histogram(y_prob: np.ndarray, y_true: Optional[np.ndarray] = Non
label="$\\hat{y} = 0$",
)
ax.hist(
y_prob[y_true == 1],
y_score[y_true == 1],
bins=10,
alpha=alpha,
edgecolor="darkred",
@@ -577,6 +600,8 @@ def plot_pr_curve(

# Save the figure if save_fig_path is specified
if save_fig_path:
plt.savefig(save_fig_path, bbox_inches="tight")
path = Path(save_fig_path)
path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(save_fig_path, bbox_inches="tight")

return fig
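This save pattern (create the parent directory first, then save via the `Figure` object rather than the current pyplot figure) now recurs across the plotting functions; as a sketch, it could live in one standalone helper — `save_figure` is an illustrative name, not part of the library:

```python
from pathlib import Path
from matplotlib.figure import Figure

def save_figure(fig: Figure, save_fig_path: str) -> None:
    """Hypothetical helper mirroring the save logic above."""
    path = Path(save_fig_path)
    path.parent.mkdir(parents=True, exist_ok=True)  # ensure the target folder exists
    fig.savefig(path, bbox_inches="tight")          # save this figure, not plt's current one
```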
42 changes: 25 additions & 17 deletions plotsandgraphs/multiclass_classifier.py
@@ -20,7 +20,12 @@
from sklearn.utils import resample
from tqdm import tqdm

from plotsandgraphs.utils import bootstrap, set_black_title_boxes, scale_ax_bbox, get_cmap
from plotsandgraphs.utils import (
bootstrap,
set_black_title_boxes,
scale_ax_bbox,
get_cmap,
)


def plot_roc_curve(
@@ -31,8 +36,8 @@ def plot_roc_curve(
n_bootstraps: int = 1,
figsize: Optional[Tuple[float, float]] = None,
class_labels: Optional[List[str]] = None,
split_plots: bool = True,
save_fig_path:Optional[Union[str, Tuple[str, str]]] = None,
split_plots: bool = False,
save_fig_path: Optional[Union[str, Tuple[str, str]]] = None,
) -> Tuple[Figure, Union[Figure, None]]:
"""
Creates two plots.
@@ -59,7 +64,7 @@
class_labels : List[str], optional
The labels of the classes. By default None.
split_plots : bool, optional
Whether to split the plots into two separate figures. By default True.
Whether to split the plots into two separate figures. By default False.
save_fig_path : Optional[Union[str, Tuple[str, str]]], optional
Path to folder where the figure(s) should be saved. If None then plot is not saved, by default None. If `split_plots` is False, then a single str is required. If True, then a tuple of strings (Pos 1 Roc curves comparison, Pos 2 AUROCs comparison). E.g. `save_fig_path=('figures/roc_curves.png', 'figures/aurocs_comparison.png')`.
@@ -190,9 +195,6 @@ def roc_metric_function(y_true, y_score):

# create the subplot tiles (and black boxes)
set_black_title_boxes(axes.flat[:num_classes], class_labels)




# ---------- AUROC overview plot comparing classes ----------
# Make an AUROC overview plot comparing the aurocs per class and combined
@@ -260,18 +262,20 @@ def auroc_metric_function(y_true, y_score, average, multi_class):
fig_aurocs.savefig(path, bbox_inches="tight")
# save roc curves plot
if save_fig_path is not None:
path = save_fig_path[0] if split_plots is True else save_fig_path
path = Path(path)
path = save_fig_path[0] if split_plots is True else save_fig_path # type: ignore
path = Path(path) # type: ignore
path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(path, bbox_inches="tight")
return fig, fig_aurocs


def plot_y_prob_histogram(
y_true: np.ndarray, y_score: Optional[np.ndarray] = None, save_fig_path: Optional[str]=None
def plot_y_score_histogram(
y_true: np.ndarray,
y_score: Optional[np.ndarray] = None,
save_fig_path: Optional[str] = None,
) -> Figure:
"""
Histogram plot that is intended to show the distribution of the predicted probabilities for different classes, where the different classes (y_true==0 and y_true==1) are plotted in different colors.
Limitations: Does not work for samples that can be part of multiple classes (e.g. multilabel classification).
Parameters
@@ -288,15 +292,19 @@ def plot_y_prob_histogram(
Figure
The figure of the histogram plot.
"""

num_classes = y_true.shape[-1]
class_labels = [f"Class {i}" for i in range(num_classes)]

cmap, colors = get_cmap("roma", n_colors=2) # 2 colors for y==0 and y==1 per class

# Aiming for a square plot
plot_cols = np.ceil(np.sqrt(num_classes)).astype(int) # Number of plots in a row # noqa
plot_rows = np.ceil(num_classes / plot_cols).astype(int) # Number of plots in a column # noqa
plot_cols = np.ceil(np.sqrt(num_classes)).astype(
int
) # Number of plots in a row # noqa
plot_rows = np.ceil(num_classes / plot_cols).astype(
int
) # Number of plots in a column # noqa
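# e.g. num_classes = 4 -> 2x2 grid; num_classes = 5 -> 3 columns x 2 rows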
fig, axes = plt.subplots(
nrows=plot_rows,
ncols=plot_cols,
