make necessary changes to spurious_correlation function and add unit test
01PrathamS · Oct 25, 2023 · 8312f7e · 8312f7e
1 parent f6af4d4
commit 8312f7e
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 16 deletions.
diff --git a/cleanlab/datalab/datalab.py b/cleanlab/datalab/datalab.py
@@ -40,6 +40,7 @@
 )
 from cleanlab.datalab.internal.issue_finder import IssueFinder
 from cleanlab.datalab.internal.serialize import _Serializer
+from cleanlab.datalab.internal.spurious_correlation import SpuriousCorrelations
 
 if TYPE_CHECKING:  # pragma: no cover
     import numpy.typing as npt
@@ -314,7 +315,7 @@ def _spurious_correlations(self) -> pd.DataFrame:
         A DataFrame where each row corresponds to image_property ('dark', 'grayscale')
         and overall datascore for that image_property
         """
-        return self.SpuriousCorrelations.spurious_correlation    
+        return SpuriousCorrelations.spurious_correlations(self)
 
     def report(
         self,

diff --git a/cleanlab/datalab/internal/spurious_correlation.py b/cleanlab/datalab/internal/spurious_correlation.py
@@ -13,7 +13,7 @@ def __init__(self, data: Datalab) -> None:
         self.issue_summary: pd.DataFrame = data.issue_summary
 
     def spurious_correlations(self) -> pd.DataFrame:
-        baseline_accuracy = np.bincount(self.labels).argmax() / len(self.labels)
+        baseline_accuracy = np.bincount(self.labels).max() / len(self.labels)
         property_scores = {}
         image_properties_of_interest = ['outlier','near_duplicate','non_iid','low_information','dark','blurry','light','grayscale','odd_aspect_ratio','odd_size']
         image_properties = [i for i in image_properties_of_interest if i in self.issue_summary['issue_type'].tolist()]

diff --git a/tests/datalab/test_datalab.py b/tests/datalab/test_datalab.py
@@ -21,11 +21,13 @@
 import pickle
 import timeit
 from pathlib import Path
+from PIL import Image
 from unittest.mock import MagicMock, Mock, patch
 
 import numpy as np
 import pandas as pd
 import pytest
+from datasets import Dataset
 from datasets.dataset_dict import DatasetDict
 from scipy.sparse import csr_matrix
 from sklearn.neighbors import NearestNeighbors
@@ -1114,22 +1116,27 @@ def test_find_imbalance_issues_no_args(self, imbalance_labels):
         class_imbalance_summary = lab.get_issue_summary("class_imbalance")
         assert class_imbalance_summary["num_issues"].values[0] > 0
 
-class TestSpuriousCorrelations: 
+class TestSpuriousCorrelation:
+    def create_data(self):
+        images = [Image.new("RGB", (32, 32), (25, 25, 25))] * 5 + \
+                 [Image.new("RGB", (32, 32), (255, 255, 255))] * 5
 
-    def test_spurious_correlations(self): 
-        dark_scores = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
-        labels = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
-        issue_summary = pd.DataFrame({
-            'issue_type': ['dark'],
-            'num_issues': 10,
-        })
+        rand_images = (np.random.rand(40, 32, 32, 3) * 255).astype(np.uint8)
+        images = images + [Image.fromarray(img) for img in rand_images]
 
+        labels = np.array([0] * 5 + [1] * 5 + [2] * 40)
         data = {
-            'issues': pd.DataFrame({'dark_score': dark_scores}),
-            'labels': labels,
-            'issue_summary': issue_summary,
+            "image": images,
+            "label": labels
         }
-
-        pass
-
+        dataset = Dataset.from_dict(data)
+        lab = Dataset(data=dataset, image_key="image", label_name="label")
+        features = np.array([np.array(img).flatten() for img in images])
+        return lab, features
+
+    def test_spurious_correlation(self):
+        imagelab, features = self.create_data()
+        imagelab.find_issues(features=features)
+        corrs = imagelab._spurious_correlation()
+        assert corrs