Skip to content

Commit

Permalink
make necessary changes to spurious_correlation function and add unit …
Browse files Browse the repository at this point in the history
…test
  • Loading branch information
01PrathamS committed Oct 25, 2023
1 parent f6af4d4 commit 8312f7e
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 16 deletions.
3 changes: 2 additions & 1 deletion cleanlab/datalab/datalab.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
)
from cleanlab.datalab.internal.issue_finder import IssueFinder
from cleanlab.datalab.internal.serialize import _Serializer
from cleanlab.datalab.internal.spurious_correlation import SpuriousCorrelations

if TYPE_CHECKING: # pragma: no cover
import numpy.typing as npt
Expand Down Expand Up @@ -314,7 +315,7 @@ def _spurious_correlations(self) -> pd.DataFrame:
A DataFrame where each row corresponds to image_property ('dark', 'grayscale')
and overall datascore for that image_property
"""
return self.SpuriousCorrelations.spurious_correlation
return SpuriousCorrelations.spurious_correlations(self)

def report(
self,
Expand Down
2 changes: 1 addition & 1 deletion cleanlab/datalab/internal/spurious_correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def __init__(self, data: Datalab) -> None:
self.issue_summary: pd.DataFrame = data.issue_summary

def spurious_correlations(self) -> pd.DataFrame:
baseline_accuracy = np.bincount(self.labels).argmax() / len(self.labels)
baseline_accuracy = np.bincount(self.labels).max() / len(self.labels)
property_scores = {}
image_properties_of_interest = ['outlier','near_duplicate','non_iid','low_information','dark','blurry','light','grayscale','odd_aspect_ratio','odd_size']
image_properties = [i for i in image_properties_of_interest if i in self.issue_summary['issue_type'].tolist()]
Expand Down
35 changes: 21 additions & 14 deletions tests/datalab/test_datalab.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@
import pickle
import timeit
from pathlib import Path
from PIL import Image
from unittest.mock import MagicMock, Mock, patch

import numpy as np
import pandas as pd
import pytest
from datasets import Dataset
from datasets.dataset_dict import DatasetDict
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
Expand Down Expand Up @@ -1114,22 +1116,27 @@ def test_find_imbalance_issues_no_args(self, imbalance_labels):
class_imbalance_summary = lab.get_issue_summary("class_imbalance")
assert class_imbalance_summary["num_issues"].values[0] > 0

class TestSpuriousCorrelation:
    """Smoke test for ``Datalab._spurious_correlations`` on synthetic images."""

    def create_data(self):
        """Build a small image dataset wrapped in a Datalab, plus pixel features.

        The dataset holds 50 RGB images of size 32x32:
        5 uniformly dark images (label 0), 5 uniformly white images (label 1)
        and 40 random-noise images (label 2).

        Returns:
            A ``(lab, features)`` tuple where ``lab`` is the Datalab built on
            the dataset and ``features`` is a 2-D array with one flattened
            pixel vector per image.
        """
        # Imported locally so this helper works regardless of which names the
        # test module imports at the top.
        # NOTE(review): drop this if Datalab is already imported at module level.
        from cleanlab.datalab.datalab import Datalab

        dark_images = [Image.new("RGB", (32, 32), (25, 25, 25))] * 5
        light_images = [Image.new("RGB", (32, 32), (255, 255, 255))] * 5

        # Seeded generator: keeps the noise images (and thus the test) reproducible.
        rng = np.random.default_rng(0)
        noise_pixels = (rng.random((40, 32, 32, 3)) * 255).astype(np.uint8)
        noise_images = [Image.fromarray(pixels) for pixels in noise_pixels]

        images = dark_images + light_images + noise_images
        labels = np.array([0] * 5 + [1] * 5 + [2] * 40)

        dataset = Dataset.from_dict({"image": images, "label": labels})
        # The object under test is a Datalab; `Dataset` does not accept the
        # `data` / `image_key` / `label_name` keywords used here.
        lab = Datalab(data=dataset, image_key="image", label_name="label")
        features = np.array([np.array(img).flatten() for img in images])
        return lab, features

    def test_spurious_correlation(self):
        """`_spurious_correlations` returns a non-empty DataFrame after find_issues."""
        imagelab, features = self.create_data()
        imagelab.find_issues(features=features)
        corrs = imagelab._spurious_correlations()
        # A bare `assert corrs` raises ValueError for a DataFrame (ambiguous
        # truth value), so check the type and emptiness explicitly.
        assert isinstance(corrs, pd.DataFrame)
        assert not corrs.empty

0 comments on commit 8312f7e

Please sign in to comment.