BeastByteAI · OKUA1 · Jun 4, 2023 · Jun 3, 2023 · Jun 3, 2023 · Jun 4, 2023
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -4,3 +4,5 @@ isort
 ruff
 docformatter
 interrogate
+numpy
+pandas
diff --git a/skllm/models/gpt_zero_shot_clf.py b/skllm/models/gpt_zero_shot_clf.py
@@ -1,7 +1,7 @@
 import random
 from abc import ABC, abstractmethod
 from collections import Counter
-from typing import Any, List, Optional, Union
+from typing import Any, List, Optional, Union, Literal
 
 import numpy as np
 import pandas as pd
@@ -19,18 +19,42 @@
 
 
 class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin, _OAIMixin):
+    """Base class for zero-shot classifiers.
+
+    Parameters
+    ----------
+    openai_key : Optional[str] , default : None
+        Your OpenAI API key. If None, the key will be read from the SKLLM_CONFIG_OPENAI_KEY environment variable.
+    openai_org : Optional[str] , default : None
+        Your OpenAI organization. If None, the organization will be read from the SKLLM_CONFIG_OPENAI_ORG
+         environment variable.
+    openai_model : str , default : "gpt-3.5-turbo"
+        The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of
+        available models.
+    default_label : Optional[Union[List[str], str]] , default : 'Random'
+        The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random
+        label will be chosen based on probabilities from the training set.
+    """
+
     def __init__(
         self,
         openai_key: Optional[str] = None,
         openai_org: Optional[str] = None,
         openai_model: str = "gpt-3.5-turbo",
+        default_label: Optional[Union[List[str], str]] = 'Random',
     ):
         self._set_keys(openai_key, openai_org)
         self.openai_model = openai_model
+        self.default_label = default_label
 
     def _to_np(self, X):
         return _to_numpy(X)
 
+    @abstractmethod
+    def _get_default_label(self):
+        """ Returns the default label based on the default_label argument. """
+        raise NotImplementedError()
+
     def fit(
         self,
         X: Optional[Union[np.ndarray, pd.Series, List[str]]],
@@ -77,13 +101,31 @@ def _get_chat_completion(self, x):
 
 
 class ZeroShotGPTClassifier(_BaseZeroShotGPTClassifier):
+    """Zero-shot classifier for multiclass classification.
+
+    Parameters
+    ----------
+    openai_key : Optional[str] , default : None
+        Your OpenAI API key. If None, the key will be read from the SKLLM_CONFIG_OPENAI_KEY environment variable.
+    openai_org : Optional[str] , default : None
+        Your OpenAI organization. If None, the organization will be read from the SKLLM_CONFIG_OPENAI_ORG
+         environment variable.
+    openai_model : str , default : "gpt-3.5-turbo"
+        The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of
+        available models.
+    default_label : Optional[str] , default : 'Random'
+        The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random
+        label will be chosen based on probabilities from the training set.
+    """
+
     def __init__(
         self,
         openai_key: Optional[str] = None,
         openai_org: Optional[str] = None,
         openai_model: str = "gpt-3.5-turbo",
+        default_label: Optional[str] = 'Random',
     ):
-        super().__init__(openai_key, openai_org, openai_model)
+        super().__init__(openai_key, openai_org, openai_model, default_label)
 
     def _extract_labels(self, y: Any) -> List[str]:
         if isinstance(y, (pd.Series, np.ndarray)):
@@ -95,6 +137,13 @@ def _extract_labels(self, y: Any) -> List[str]:
     def _get_prompt(self, x) -> str:
         return build_zero_shot_prompt_slc(x, repr(self.classes_))
 
+    def _get_default_label(self):
+        """ Returns the default label based on the default_label argument. """
+        if self.default_label == "Random":
+            return random.choices(self.classes_, self.probabilities_)[0]
+        else:
+            return self.default_label
+
     def _predict_single(self, x):
         completion = self._get_chat_completion(x)
         try:
@@ -116,7 +165,7 @@ def _predict_single(self, x):
         if label not in self.classes_:
             label = label.replace("'", "").replace('"', "")
             if label not in self.classes_:  # try again
-                label = random.choices(self.classes_, self.probabilities_)[0]
+                label = self._get_default_label()
         return label
 
     def fit(
@@ -129,16 +178,37 @@ def fit(
 
 
 class MultiLabelZeroShotGPTClassifier(_BaseZeroShotGPTClassifier):
+    """Zero-shot classifier for multilabel classification.
+
+    Parameters
+    ----------
+    openai_key : Optional[str] , default : None
+        Your OpenAI API key. If None, the key will be read from the SKLLM_CONFIG_OPENAI_KEY environment variable.
+    openai_org : Optional[str] , default : None
+        Your OpenAI organization. If None, the organization will be read from the SKLLM_CONFIG_OPENAI_ORG
+         environment variable.
+    openai_model : str , default : "gpt-3.5-turbo"
+        The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of
+        available models.
+    default_label : Optional[Union[List[str], Literal['Random']] , default : 'Random'
+        The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random
+        label will be chosen based on probabilities from the training set.
+    max_labels : int , default : 3
+        The maximum number of labels to predict for each sample.
+    """
     def __init__(
         self,
         openai_key: Optional[str] = None,
         openai_org: Optional[str] = None,
         openai_model: str = "gpt-3.5-turbo",
+        default_label: Optional[Union[List[str], Literal['Random']]] = 'Random',
         max_labels: int = 3,
     ):
-        super().__init__(openai_key, openai_org, openai_model)
+        super().__init__(openai_key, openai_org, openai_model, default_label)
         if max_labels < 2:
             raise ValueError("max_labels should be at least 2")
+        if isinstance(default_label, str) and default_label != "Random":
+            raise ValueError("default_label should be a list of strings or 'Random'")
         self.max_labels = max_labels
 
     def _extract_labels(self, y) -> List[str]:
@@ -152,6 +222,19 @@ def _extract_labels(self, y) -> List[str]:
     def _get_prompt(self, x) -> str:
         return build_zero_shot_prompt_mlc(x, repr(self.classes_), self.max_labels)
 
+    def _get_default_label(self):
+        """ Returns the default label based on the default_label argument. """
+        result = []
+        if isinstance(self.default_label, str) and self.default_label == "Random":
+            for cls, probability in zip(self.classes_, self.probabilities_):
+                coin_flip = random.choices([0,1], [1-probability, probability])[0]
+                if coin_flip == 1:
+                    result.append(cls)
+        else:
+            result = self.default_label
+
+        return result
+
     def _predict_single(self, x):
         completion = self._get_chat_completion(x)
         try:
@@ -162,11 +245,10 @@ def _predict_single(self, x):
             labels = []
 
         labels = list(filter(lambda l: l in self.classes_, labels))
-
-        if len(labels) > self.max_labels:
-            labels = labels[: self.max_labels - 1]
-        elif len(labels) < 1:
-            labels = [random.choices(self.classes_, self.probabilities_)[0]]
+        if len(labels) == 0:
+            labels = self._get_default_label()
+        if labels is not None and len(labels) > self.max_labels:
+            labels = labels[:self.max_labels - 1]
         return labels
 
     def fit(

diff --git a/skllm/openai/base_gpt.py b/skllm/openai/base_gpt.py
@@ -1,18 +1,16 @@
+from typing import Any, List, Optional, Union
+
 import numpy as np
-from numpy import ndarray
 import pandas as pd
-from skllm.utils import to_numpy as _to_numpy
-from typing import Any, Optional, Union, List
-from skllm.openai.mixin import OpenAIMixin as _OAIMixin
+from numpy import ndarray
+from sklearn.base import BaseEstimator as _BaseEstimator
+from sklearn.base import TransformerMixin as _TransformerMixin
 from tqdm import tqdm
-from sklearn.base import (
-    BaseEstimator as _BaseEstimator,
-    TransformerMixin as _TransformerMixin,
-)
-from skllm.openai.chatgpt import (
-    construct_message,
-    get_chat_completion,
-)
+
+from skllm.openai.chatgpt import construct_message, get_chat_completion
+from skllm.openai.mixin import OpenAIMixin as _OAIMixin
+from skllm.utils import to_numpy as _to_numpy
+
 
 class BaseZeroShotGPTTransformer(_BaseEstimator, _TransformerMixin, _OAIMixin): 
 

diff --git a/skllm/openai/credentials.py b/skllm/openai/credentials.py
@@ -1,5 +1,6 @@
 import openai
 
+
 def set_credentials(key: str, org: str):
     openai.api_key = key
     openai.organization = org
diff --git a/skllm/openai/embeddings.py b/skllm/openai/embeddings.py
@@ -1,7 +1,10 @@
-import openai
 from time import sleep
+
+import openai
+
 from skllm.openai.credentials import set_credentials
 
+
 def get_embedding(
     text, key: str, org: str, model="text-embedding-ada-002", max_retries=3
 ):

diff --git a/skllm/openai/mixin.py b/skllm/openai/mixin.py
@@ -1,6 +1,8 @@
 from typing import Optional
+
 from skllm.config import SKLLMConfig as _Config
 
+
 class OpenAIMixin:
 
     def _set_keys(self, key: Optional[str] = None, org: Optional[str] = None) -> None:

diff --git a/skllm/preprocessing/gpt_vectorizer.py b/skllm/preprocessing/gpt_vectorizer.py
@@ -1,14 +1,14 @@
-from sklearn.base import (
-    BaseEstimator as _BaseEstimator,
-    TransformerMixin as _TransformerMixin,
-)
-from typing import Any, Optional, Union, List
-from tqdm import tqdm
+from typing import Any, List, Optional, Union
+
 import numpy as np
-from numpy import ndarray
 import pandas as pd
-from skllm.openai.mixin import OpenAIMixin as _OAIMixin
+from numpy import ndarray
+from sklearn.base import BaseEstimator as _BaseEstimator
+from sklearn.base import TransformerMixin as _TransformerMixin
+from tqdm import tqdm
+
 from skllm.openai.embeddings import get_embedding as _get_embedding
+from skllm.openai.mixin import OpenAIMixin as _OAIMixin
 from skllm.utils import to_numpy as _to_numpy
 
 

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1,2 +1 @@
-from . import test_chatgpt
-from . import test_gpt_zero_shot_clf
+from . import test_chatgpt, test_gpt_zero_shot_clf
diff --git a/tests/test_chatgpt.py b/tests/test_chatgpt.py
@@ -1,6 +1,12 @@
 import unittest
 from unittest.mock import patch
-from skllm.openai.chatgpt import construct_message, get_chat_completion, extract_json_key
+
+from skllm.openai.chatgpt import (
+    construct_message,
+    extract_json_key,
+    get_chat_completion,
+)
+
 
 class TestChatGPT(unittest.TestCase):
 
@@ -16,7 +22,8 @@ def test_get_chat_completion(self, mock_create, mock_set_credentials):
         result = get_chat_completion(messages, key, org, model)
 
         self.assertTrue(mock_set_credentials.call_count <= 1, "set_credentials should be called at most once")
-        self.assertEqual(mock_create.call_count, 2, "ChatCompletion.create should be called twice due to an exception on the first call")
+        self.assertEqual(mock_create.call_count, 2, "ChatCompletion.create should be called twice due to an exception "
+                                                    "on the first call")
         self.assertEqual(result, "success")
 
     def test_construct_message(self):
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,5 @@ isort @@
     ruff
     docformatter
     interrogate
+    numpy
+    pandas