From fb68b553f2988cc454615a7c1edecc0dc23ce21c Mon Sep 17 00:00:00 2001 From: Nadav-Barak Date: Sat, 3 Jun 2023 17:00:34 +0300 Subject: [PATCH 1/4] added default_label argument and functionality + isort formatting --- requirements-dev.txt | 3 + skllm/models/gpt_zero_shot_clf.py | 101 ++++++++++++++++++++++++-- skllm/openai/base_gpt.py | 22 +++--- skllm/openai/credentials.py | 1 + skllm/openai/embeddings.py | 5 +- skllm/openai/mixin.py | 2 + skllm/preprocessing/gpt_vectorizer.py | 16 ++-- tests/__init__.py | 3 +- tests/test_chatgpt.py | 11 ++- tests/test_gpt_zero_shot_clf.py | 73 ++++++++++++++++--- 10 files changed, 195 insertions(+), 42 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 3164f42..5afd37d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,3 +4,6 @@ isort ruff docformatter interrogate +numpy +pandas +pytest \ No newline at end of file diff --git a/skllm/models/gpt_zero_shot_clf.py b/skllm/models/gpt_zero_shot_clf.py index fe64cfb..560143c 100644 --- a/skllm/models/gpt_zero_shot_clf.py +++ b/skllm/models/gpt_zero_shot_clf.py @@ -19,18 +19,47 @@ class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin, _OAIMixin): + """Base class for zero-shot classifiers. + + Parameters + ---------- + openai_key : Optional[str] , default : None + Your OpenAI API key. If None, the key will be read from the SKLLM_CONFIG_OPENAI_KEY environment variable. + openai_org : Optional[str] , default : None + Your OpenAI organization. If None, the organization will be read from the SKLLM_CONFIG_OPENAI_ORG + environment variable. + openai_model : str , default : "gpt-3.5-turbo" + The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of + available models. + default_label : Optional[str] , default : None + The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random + label will be chosen based on probabilities from the training set. Is set to None by default. + random_state : int, default 42 + A seed to set for pseudo-random functions, primarily random selection. + """ + def __init__( self, openai_key: Optional[str] = None, openai_org: Optional[str] = None, openai_model: str = "gpt-3.5-turbo", + default_label: Optional[str] = None, + random_state: int = 42, ): self._set_keys(openai_key, openai_org) self.openai_model = openai_model + self.default_label = default_label + random.seed(random_state) + np.random.seed(random_state) def _to_np(self, X): return _to_numpy(X) + @abstractmethod + def _get_default_label(self): + """ Returns the default label based on the default_label argument. """ + raise NotImplementedError() + def fit( self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], @@ -77,13 +106,34 @@ def _get_chat_completion(self, x): class ZeroShotGPTClassifier(_BaseZeroShotGPTClassifier): + """Zero-shot classifier for multiclass classification. + + Parameters + ---------- + openai_key : Optional[str] , default : None + Your OpenAI API key. If None, the key will be read from the SKLLM_CONFIG_OPENAI_KEY environment variable. + openai_org : Optional[str] , default : None + Your OpenAI organization. If None, the organization will be read from the SKLLM_CONFIG_OPENAI_ORG + environment variable. + openai_model : str , default : "gpt-3.5-turbo" + The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of + available models. + default_label : Optional[str] , default : None + The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random + label will be chosen based on probabilities from the training set. Is set to None by default. + random_state : int, default 42 + A seed to set for pseudo-random functions, primarily random selection. + """ + def __init__( self, openai_key: Optional[str] = None, openai_org: Optional[str] = None, openai_model: str = "gpt-3.5-turbo", + default_label: Optional[str] = None, + random_state: int = 42, ): - super().__init__(openai_key, openai_org, openai_model) + super().__init__(openai_key, openai_org, openai_model, default_label, random_state) def _extract_labels(self, y: Any) -> List[str]: if isinstance(y, (pd.Series, np.ndarray)): @@ -95,6 +145,13 @@ def _extract_labels(self, y: Any) -> List[str]: def _get_prompt(self, x) -> str: return build_zero_shot_prompt_slc(x, repr(self.classes_)) + def _get_default_label(self): + """ Returns the default label based on the default_label argument. """ + if self.default_label == "Random": + return random.choices(self.classes_, self.probabilities_)[0] + else: + return self.default_label + def _predict_single(self, x): completion = self._get_chat_completion(x) try: @@ -116,7 +173,7 @@ def _predict_single(self, x): if label not in self.classes_: label = label.replace("'", "").replace('"', "") if label not in self.classes_: # try again - label = random.choices(self.classes_, self.probabilities_)[0] + label = self._get_default_label() return label def fit( @@ -129,14 +186,36 @@ def fit( class MultiLabelZeroShotGPTClassifier(_BaseZeroShotGPTClassifier): + """Zero-shot classifier for multilabel classification. + + Parameters + ---------- + openai_key : Optional[str] , default : None + Your OpenAI API key. If None, the key will be read from the SKLLM_CONFIG_OPENAI_KEY environment variable. + openai_org : Optional[str] , default : None + Your OpenAI organization. If None, the organization will be read from the SKLLM_CONFIG_OPENAI_ORG + environment variable. + openai_model : str , default : "gpt-3.5-turbo" + The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of + available models. + default_label : Optional[str] , default : None + The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random + label will be chosen based on probabilities from the training set. Is set to None by default. + max_labels : int , default : 3 + The maximum number of labels to predict for each sample. + random_state : int, default 42 + A seed to set for pseudo-random functions, primarily random selection. + """ def __init__( self, openai_key: Optional[str] = None, openai_org: Optional[str] = None, openai_model: str = "gpt-3.5-turbo", + default_label: Optional[str] = None, max_labels: int = 3, + random_state: int = 42, ): - super().__init__(openai_key, openai_org, openai_model) + super().__init__(openai_key, openai_org, openai_model, default_label, random_state) if max_labels < 2: raise ValueError("max_labels should be at least 2") self.max_labels = max_labels @@ -152,6 +231,18 @@ def _extract_labels(self, y) -> List[str]: def _get_prompt(self, x) -> str: return build_zero_shot_prompt_mlc(x, repr(self.classes_), self.max_labels) + def _get_default_label(self): + """ Returns the default label based on the default_label argument. """ + result = [] + if self.default_label == "Random": + for cls, probability in zip(self.classes_, self.probabilities_): + coin_flip = random.choices([0,1], [1-probability, probability])[0] + if coin_flip == 1: + result.append(cls) + else: + result = self.default_label + return result + def _predict_single(self, x): completion = self._get_chat_completion(x) try: @@ -165,8 +256,8 @@ def _predict_single(self, x): if len(labels) > self.max_labels: labels = labels[: self.max_labels - 1] - elif len(labels) < 1: - labels = [random.choices(self.classes_, self.probabilities_)[0]] + elif len(labels) == 0: + labels = self._get_default_label() return labels def fit( diff --git a/skllm/openai/base_gpt.py b/skllm/openai/base_gpt.py index 49c3570..8741a7d 100644 --- a/skllm/openai/base_gpt.py +++ b/skllm/openai/base_gpt.py @@ -1,18 +1,16 @@ +from typing import Any, List, Optional, Union + import numpy as np -from numpy import ndarray import pandas as pd -from skllm.utils import to_numpy as _to_numpy -from typing import Any, Optional, Union, List -from skllm.openai.mixin import OpenAIMixin as _OAIMixin +from numpy import ndarray +from sklearn.base import BaseEstimator as _BaseEstimator +from sklearn.base import TransformerMixin as _TransformerMixin from tqdm import tqdm -from sklearn.base import ( - BaseEstimator as _BaseEstimator, - TransformerMixin as _TransformerMixin, -) -from skllm.openai.chatgpt import ( - construct_message, - get_chat_completion, -) + +from skllm.openai.chatgpt import construct_message, get_chat_completion +from skllm.openai.mixin import OpenAIMixin as _OAIMixin +from skllm.utils import to_numpy as _to_numpy + class BaseZeroShotGPTTransformer(_BaseEstimator, _TransformerMixin, _OAIMixin): diff --git a/skllm/openai/credentials.py b/skllm/openai/credentials.py index 6abdc42..733274a 100644 --- a/skllm/openai/credentials.py +++ b/skllm/openai/credentials.py @@ -1,5 +1,6 @@ import openai + def set_credentials(key: str, org: str): openai.api_key = key openai.organization = org diff --git a/skllm/openai/embeddings.py b/skllm/openai/embeddings.py index cd45b8a..b06f3e8 100644 --- a/skllm/openai/embeddings.py +++ b/skllm/openai/embeddings.py @@ -1,7 +1,10 @@ -import openai from time import sleep + +import openai + from skllm.openai.credentials import set_credentials + def get_embedding( text, key: str, org: str, model="text-embedding-ada-002", max_retries=3 ): diff --git a/skllm/openai/mixin.py b/skllm/openai/mixin.py index c446207..1ab07cd 100644 --- a/skllm/openai/mixin.py +++ b/skllm/openai/mixin.py @@ -1,6 +1,8 @@ from typing import Optional + from skllm.config import SKLLMConfig as _Config + class OpenAIMixin: def _set_keys(self, key: Optional[str] = None, org: Optional[str] = None) -> None: diff --git a/skllm/preprocessing/gpt_vectorizer.py b/skllm/preprocessing/gpt_vectorizer.py index 3bf72a5..18c17b6 100644 --- a/skllm/preprocessing/gpt_vectorizer.py +++ b/skllm/preprocessing/gpt_vectorizer.py @@ -1,14 +1,14 @@ -from sklearn.base import ( - BaseEstimator as _BaseEstimator, - TransformerMixin as _TransformerMixin, -) -from typing import Any, Optional, Union, List -from tqdm import tqdm +from typing import Any, List, Optional, Union + import numpy as np -from numpy import ndarray import pandas as pd -from skllm.openai.mixin import OpenAIMixin as _OAIMixin +from numpy import ndarray +from sklearn.base import BaseEstimator as _BaseEstimator +from sklearn.base import TransformerMixin as _TransformerMixin +from tqdm import tqdm + from skllm.openai.embeddings import get_embedding as _get_embedding +from skllm.openai.mixin import OpenAIMixin as _OAIMixin from skllm.utils import to_numpy as _to_numpy diff --git a/tests/__init__.py b/tests/__init__.py index 19f714a..369b152 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,2 +1 @@ -from . import test_chatgpt -from . import test_gpt_zero_shot_clf +from . import test_chatgpt, test_gpt_zero_shot_clf diff --git a/tests/test_chatgpt.py b/tests/test_chatgpt.py index 12706cf..41ff1b2 100644 --- a/tests/test_chatgpt.py +++ b/tests/test_chatgpt.py @@ -1,6 +1,12 @@ import unittest from unittest.mock import patch -from skllm.openai.chatgpt import construct_message, get_chat_completion, extract_json_key + +from skllm.openai.chatgpt import ( + construct_message, + extract_json_key, + get_chat_completion, +) + class TestChatGPT(unittest.TestCase): @@ -16,7 +22,8 @@ def test_get_chat_completion(self, mock_create, mock_set_credentials): result = get_chat_completion(messages, key, org, model) self.assertTrue(mock_set_credentials.call_count <= 1, "set_credentials should be called at most once") - self.assertEqual(mock_create.call_count, 2, "ChatCompletion.create should be called twice due to an exception on the first call") + self.assertEqual(mock_create.call_count, 2, "ChatCompletion.create should be called twice due to an exception " + "on the first call") self.assertEqual(result, "success") def test_construct_message(self): diff --git a/tests/test_gpt_zero_shot_clf.py b/tests/test_gpt_zero_shot_clf.py index b48da9c..f9a45a7 100644 --- a/tests/test_gpt_zero_shot_clf.py +++ b/tests/test_gpt_zero_shot_clf.py @@ -1,42 +1,91 @@ -import unittest import json -from unittest.mock import patch, MagicMock +import unittest +from unittest.mock import MagicMock, patch + import numpy as np -from skllm.models.gpt_zero_shot_clf import ZeroShotGPTClassifier, MultiLabelZeroShotGPTClassifier + +from skllm.models.gpt_zero_shot_clf import ( + MultiLabelZeroShotGPTClassifier, + ZeroShotGPTClassifier, +) + class TestZeroShotGPTClassifier(unittest.TestCase): - @patch("skllm.models.gpt_zero_shot_clf.get_chat_completion", return_value=MagicMock()) - def test_fit_predict(self, mock_get_chat_completion): + def get_mock_clf_model(self): clf = ZeroShotGPTClassifier(openai_key="mock_key", openai_org="mock_org") # Mock keys X = np.array(["text1", "text2", "text3"]) y = np.array(["class1", "class2", "class1"]) clf.fit(X, y) self.assertEqual(set(clf.classes_), set(["class1", "class2"])) - self.assertEqual(clf.probabilities_, [2/3, 1/3]) + self.assertEqual(clf.probabilities_, [2 / 3, 1 / 3]) + return clf + @patch("skllm.models.gpt_zero_shot_clf.get_chat_completion", return_value=MagicMock()) + def test_fit_predict(self, mock_get_chat_completion): + clf = self.get_mock_clf_model() mock_get_chat_completion.return_value.choices[0].message = {"content": json.dumps({"label": "class1"})} - predictions = clf.predict(X) + predictions = clf.predict(["text1", "text2", "text3"]) self.assertEqual(predictions, ["class1", "class1", "class1"]) + @patch("skllm.models.gpt_zero_shot_clf.get_chat_completion", return_value=MagicMock()) + def test_fit_predict_unknown_label_set_default(self, mock_get_chat_completion): + clf = self.get_mock_clf_model() + mock_get_chat_completion.return_value.choices[0].message = {"content": json.dumps({"label": "new_class"})} + + predictions = clf.predict(["text1", "text2", "text3"]) + self.assertEqual(predictions, [None, None, None]) + + clf.default_label = "default_class" + predictions = clf.predict(["text1", "text2", "text3"]) + self.assertEqual(predictions, ["default_class", "default_class", "default_class"]) + + clf.default_label = "Random" + predictions = clf.predict(["text1", "text2", "text3"]) + self.assertEqual(predictions, ["class1", "class1", "class1"]) + + + class TestMultiLabelZeroShotGPTClassifier(unittest.TestCase): - @patch("skllm.models.gpt_zero_shot_clf.get_chat_completion", return_value=MagicMock()) - def test_fit_predict(self, mock_get_chat_completion): + def get_mock_clf_model(self): clf = MultiLabelZeroShotGPTClassifier(openai_key="mock_key", openai_org="mock_org") # Mock keys X = np.array(["text1", "text2", "text3"]) - y = [["class1", "class2"], ["class1", "class2"], ["class1", "class2"]] # Adjusted y to ensure [0.5, 0.5] probability + y = [["class1", "class2"], ["class1", "class2"], + ["class1", "class2"]] # Adjusted y to ensure [0.5, 0.5] probability clf.fit(X, y) self.assertEqual(set(clf.classes_), set(["class1", "class2"])) self.assertEqual(clf.probabilities_, [0.5, 0.5]) + return clf - mock_get_chat_completion.return_value.choices[0].message = {"content": json.dumps({"label": ["class1", "class2"]})} - predictions = clf.predict(X) + @patch("skllm.models.gpt_zero_shot_clf.get_chat_completion", return_value=MagicMock()) + def test_fit_predict(self, mock_get_chat_completion): + clf = self.get_mock_clf_model() + mock_get_chat_completion.return_value.choices[0].message = { + "content": json.dumps({"label": ["class1", "class2"]})} + predictions = clf.predict(["text1", "text2", "text3"]) self.assertEqual(predictions, [["class1", "class2"], ["class1", "class2"], ["class1", "class2"]]) + @patch("skllm.models.gpt_zero_shot_clf.get_chat_completion", return_value=MagicMock()) + def test_fit_predict_unknown_label_set_default(self, mock_get_chat_completion): + clf = self.get_mock_clf_model() + mock_get_chat_completion.return_value.choices[0].message = {"content": json.dumps({"label": "new_class"})} + + predictions = clf.predict(["text1", "text2", "text3"]) + self.assertEqual(predictions, [None, None, None]) + + clf.default_label = "default_class" + predictions = clf.predict(["text1", "text2", "text3"]) + self.assertEqual(predictions, ["default_class", "default_class", "default_class"]) + + clf.default_label = "Random" + predictions = clf.predict(["text1", "text2", "text3"]) + self.assertEqual(predictions, [['class1'], [], ['class1', 'class2']]) + + if __name__ == '__main__': unittest.main() From aba29304f308aca2850e0c42db19fe1bdfeca029 Mon Sep 17 00:00:00 2001 From: Nadav-Barak Date: Sat, 3 Jun 2023 17:10:20 +0300 Subject: [PATCH 2/4] formatting --- requirements-dev.txt | 1 - tests/test_gpt_zero_shot_clf.py | 1 - 2 files changed, 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 5afd37d..c2de4e7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,4 +6,3 @@ docformatter interrogate numpy pandas -pytest \ No newline at end of file diff --git a/tests/test_gpt_zero_shot_clf.py b/tests/test_gpt_zero_shot_clf.py index f9a45a7..e8cda08 100644 --- a/tests/test_gpt_zero_shot_clf.py +++ b/tests/test_gpt_zero_shot_clf.py @@ -47,7 +47,6 @@ def test_fit_predict_unknown_label_set_default(self, mock_get_chat_completion): self.assertEqual(predictions, ["class1", "class1", "class1"]) - class TestMultiLabelZeroShotGPTClassifier(unittest.TestCase): def get_mock_clf_model(self): From 21484d4c7205d16a203dbaf1d7f988062e9998ee Mon Sep 17 00:00:00 2001 From: Nadav-Barak Date: Sun, 4 Jun 2023 09:45:04 +0300 Subject: [PATCH 3/4] CR Comments - changed default to 'Random' --- skllm/models/gpt_zero_shot_clf.py | 30 ++++++++++++++++-------------- tests/test_gpt_zero_shot_clf.py | 16 ++++++++-------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/skllm/models/gpt_zero_shot_clf.py b/skllm/models/gpt_zero_shot_clf.py index 560143c..cd9e079 100644 --- a/skllm/models/gpt_zero_shot_clf.py +++ b/skllm/models/gpt_zero_shot_clf.py @@ -31,9 +31,9 @@ class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin, _OAIMixin) openai_model : str , default : "gpt-3.5-turbo" The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of available models. - default_label : Optional[str] , default : None + default_label : Optional[Union[List[str], str]] , default : 'Random' The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random - label will be chosen based on probabilities from the training set. Is set to None by default. + label will be chosen based on probabilities from the training set. random_state : int, default 42 A seed to set for pseudo-random functions, primarily random selection. """ @@ -43,7 +43,7 @@ def __init__( openai_key: Optional[str] = None, openai_org: Optional[str] = None, openai_model: str = "gpt-3.5-turbo", - default_label: Optional[str] = None, + default_label: Optional[Union[List[str], str]] = 'Random', random_state: int = 42, ): self._set_keys(openai_key, openai_org) @@ -118,9 +118,9 @@ class ZeroShotGPTClassifier(_BaseZeroShotGPTClassifier): openai_model : str , default : "gpt-3.5-turbo" The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of available models. - default_label : Optional[str] , default : None + default_label : Optional[str] , default : 'Random' The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random - label will be chosen based on probabilities from the training set. Is set to None by default. + label will be chosen based on probabilities from the training set. random_state : int, default 42 A seed to set for pseudo-random functions, primarily random selection. """ @@ -130,7 +130,7 @@ def __init__( openai_key: Optional[str] = None, openai_org: Optional[str] = None, openai_model: str = "gpt-3.5-turbo", - default_label: Optional[str] = None, + default_label: Optional[str] = 'Random', random_state: int = 42, ): super().__init__(openai_key, openai_org, openai_model, default_label, random_state) @@ -198,9 +198,9 @@ class MultiLabelZeroShotGPTClassifier(_BaseZeroShotGPTClassifier): openai_model : str , default : "gpt-3.5-turbo" The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of available models. - default_label : Optional[str] , default : None + default_label : Optional[Union[List[str], str]] , default : 'Random' The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random - label will be chosen based on probabilities from the training set. Is set to None by default. + label will be chosen based on probabilities from the training set. max_labels : int , default : 3 The maximum number of labels to predict for each sample. random_state : int, default 42 @@ -211,13 +211,15 @@ def __init__( openai_key: Optional[str] = None, openai_org: Optional[str] = None, openai_model: str = "gpt-3.5-turbo", - default_label: Optional[str] = None, + default_label: Optional[Union[List[str], str]] = 'Random', max_labels: int = 3, random_state: int = 42, ): super().__init__(openai_key, openai_org, openai_model, default_label, random_state) if max_labels < 2: raise ValueError("max_labels should be at least 2") + if isinstance(default_label, str) and default_label != "Random": + raise ValueError("default_label should be a list of strings or 'Random'") self.max_labels = max_labels def _extract_labels(self, y) -> List[str]: @@ -234,13 +236,14 @@ def _get_prompt(self, x) -> str: def _get_default_label(self): """ Returns the default label based on the default_label argument. """ result = [] - if self.default_label == "Random": + if isinstance(self.default_label, str) and self.default_label == "Random": for cls, probability in zip(self.classes_, self.probabilities_): coin_flip = random.choices([0,1], [1-probability, probability])[0] if coin_flip == 1: result.append(cls) else: result = self.default_label + return result def _predict_single(self, x): @@ -253,11 +256,10 @@ def _predict_single(self, x): labels = [] labels = list(filter(lambda l: l in self.classes_, labels)) - - if len(labels) > self.max_labels: - labels = labels[: self.max_labels - 1] - elif len(labels) == 0: + if len(labels) == 0: labels = self._get_default_label() + if labels is not None and len(labels) > self.max_labels: + labels = random.choices(labels, k=self.max_labels) return labels def fit( diff --git a/tests/test_gpt_zero_shot_clf.py b/tests/test_gpt_zero_shot_clf.py index e8cda08..1667db1 100644 --- a/tests/test_gpt_zero_shot_clf.py +++ b/tests/test_gpt_zero_shot_clf.py @@ -36,15 +36,15 @@ def test_fit_predict_unknown_label_set_default(self, mock_get_chat_completion): mock_get_chat_completion.return_value.choices[0].message = {"content": json.dumps({"label": "new_class"})} predictions = clf.predict(["text1", "text2", "text3"]) - self.assertEqual(predictions, [None, None, None]) + self.assertEqual(predictions, ["class1", "class1", "class1"]) # Random choice clf.default_label = "default_class" predictions = clf.predict(["text1", "text2", "text3"]) self.assertEqual(predictions, ["default_class", "default_class", "default_class"]) - clf.default_label = "Random" + clf.default_label = None predictions = clf.predict(["text1", "text2", "text3"]) - self.assertEqual(predictions, ["class1", "class1", "class1"]) + self.assertEqual(predictions, [None, None, None]) class TestMultiLabelZeroShotGPTClassifier(unittest.TestCase): @@ -75,15 +75,15 @@ def test_fit_predict_unknown_label_set_default(self, mock_get_chat_completion): mock_get_chat_completion.return_value.choices[0].message = {"content": json.dumps({"label": "new_class"})} predictions = clf.predict(["text1", "text2", "text3"]) - self.assertEqual(predictions, [None, None, None]) + self.assertEqual(predictions, [['class1'], [], ['class1', 'class2']]) # Random choice - clf.default_label = "default_class" + clf.default_label = ["default_class"] predictions = clf.predict(["text1", "text2", "text3"]) - self.assertEqual(predictions, ["default_class", "default_class", "default_class"]) + self.assertEqual(predictions, [["default_class"], ["default_class"], ["default_class"]]) - clf.default_label = "Random" + clf.default_label = None predictions = clf.predict(["text1", "text2", "text3"]) - self.assertEqual(predictions, [['class1'], [], ['class1', 'class2']]) + self.assertEqual(predictions, [None, None, None]) if __name__ == '__main__': From be7b50f3ecfb146ea54264b4d8636706e3132f34 Mon Sep 17 00:00:00 2001 From: Nadav-Barak Date: Sun, 4 Jun 2023 15:40:26 +0300 Subject: [PATCH 4/4] CR Comments v2 --- skllm/models/gpt_zero_shot_clf.py | 23 ++++++----------------- tests/test_gpt_zero_shot_clf.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/skllm/models/gpt_zero_shot_clf.py b/skllm/models/gpt_zero_shot_clf.py index cd9e079..a948c77 100644 --- a/skllm/models/gpt_zero_shot_clf.py +++ b/skllm/models/gpt_zero_shot_clf.py @@ -1,7 +1,7 @@ import random from abc import ABC, abstractmethod from collections import Counter -from typing import Any, List, Optional, Union +from typing import Any, List, Optional, Union, Literal import numpy as np import pandas as pd @@ -34,8 +34,6 @@ class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin, _OAIMixin) default_label : Optional[Union[List[str], str]] , default : 'Random' The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random label will be chosen based on probabilities from the training set. - random_state : int, default 42 - A seed to set for pseudo-random functions, primarily random selection. """ def __init__( @@ -44,13 +42,10 @@ def __init__( openai_org: Optional[str] = None, openai_model: str = "gpt-3.5-turbo", default_label: Optional[Union[List[str], str]] = 'Random', - random_state: int = 42, ): self._set_keys(openai_key, openai_org) self.openai_model = openai_model self.default_label = default_label - random.seed(random_state) - np.random.seed(random_state) def _to_np(self, X): return _to_numpy(X) @@ -121,8 +116,6 @@ class ZeroShotGPTClassifier(_BaseZeroShotGPTClassifier): default_label : Optional[str] , default : 'Random' The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random label will be chosen based on probabilities from the training set. - random_state : int, default 42 - A seed to set for pseudo-random functions, primarily random selection. """ def __init__( @@ -131,9 +124,8 @@ def __init__( openai_org: Optional[str] = None, openai_model: str = "gpt-3.5-turbo", default_label: Optional[str] = 'Random', - random_state: int = 42, ): - super().__init__(openai_key, openai_org, openai_model, default_label, random_state) + super().__init__(openai_key, openai_org, openai_model, default_label) def _extract_labels(self, y: Any) -> List[str]: if isinstance(y, (pd.Series, np.ndarray)): @@ -198,24 +190,21 @@ class MultiLabelZeroShotGPTClassifier(_BaseZeroShotGPTClassifier): openai_model : str , default : "gpt-3.5-turbo" The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of available models. - default_label : Optional[Union[List[str], str]] , default : 'Random' + default_label : Optional[Union[List[str], Literal['Random']] , default : 'Random' The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random label will be chosen based on probabilities from the training set. max_labels : int , default : 3 The maximum number of labels to predict for each sample. - random_state : int, default 42 - A seed to set for pseudo-random functions, primarily random selection. """ def __init__( self, openai_key: Optional[str] = None, openai_org: Optional[str] = None, openai_model: str = "gpt-3.5-turbo", - default_label: Optional[Union[List[str], str]] = 'Random', + default_label: Optional[Union[List[str], Literal['Random']]] = 'Random', max_labels: int = 3, - random_state: int = 42, ): - super().__init__(openai_key, openai_org, openai_model, default_label, random_state) + super().__init__(openai_key, openai_org, openai_model, default_label) if max_labels < 2: raise ValueError("max_labels should be at least 2") if isinstance(default_label, str) and default_label != "Random": @@ -259,7 +248,7 @@ def _predict_single(self, x): if len(labels) == 0: labels = self._get_default_label() if labels is not None and len(labels) > self.max_labels: - labels = random.choices(labels, k=self.max_labels) + labels = labels[:self.max_labels - 1] return labels def fit( diff --git a/tests/test_gpt_zero_shot_clf.py b/tests/test_gpt_zero_shot_clf.py index 1667db1..78c2e45 100644 --- a/tests/test_gpt_zero_shot_clf.py +++ b/tests/test_gpt_zero_shot_clf.py @@ -1,4 +1,5 @@ import json +import random import unittest from unittest.mock import MagicMock, patch @@ -36,7 +37,9 @@ def test_fit_predict_unknown_label_set_default(self, mock_get_chat_completion): mock_get_chat_completion.return_value.choices[0].message = {"content": json.dumps({"label": "new_class"})} predictions = clf.predict(["text1", "text2", "text3"]) - self.assertEqual(predictions, ["class1", "class1", "class1"]) # Random choice + random.seed(42) + np.random.seed(42) + self.assertEqual(predictions, ["class2", "class1", "class1"]) # Random choice clf.default_label = "default_class" predictions = clf.predict(["text1", "text2", "text3"]) @@ -50,7 +53,8 @@ def test_fit_predict_unknown_label_set_default(self, mock_get_chat_completion): class TestMultiLabelZeroShotGPTClassifier(unittest.TestCase): def get_mock_clf_model(self): - clf = MultiLabelZeroShotGPTClassifier(openai_key="mock_key", openai_org="mock_org") # Mock keys + clf = MultiLabelZeroShotGPTClassifier(openai_key="mock_key", openai_org="mock_org", + default_label='Random') # Mock keys X = np.array(["text1", "text2", "text3"]) y = [["class1", "class2"], ["class1", "class2"], ["class1", "class2"]] # Adjusted y to ensure [0.5, 0.5] probability @@ -75,6 +79,8 @@ def test_fit_predict_unknown_label_set_default(self, mock_get_chat_completion): mock_get_chat_completion.return_value.choices[0].message = {"content": json.dumps({"label": "new_class"})} predictions = clf.predict(["text1", "text2", "text3"]) + random.seed(42) + np.random.seed(42) self.assertEqual(predictions, [['class1'], [], ['class1', 'class2']]) # Random choice clf.default_label = ["default_class"]