From 16f54687495ee69541c120533b5884443bdf0dc1 Mon Sep 17 00:00:00 2001 From: Ogundepo Odunayo Date: Thu, 1 Jun 2023 00:23:41 -0400 Subject: [PATCH 1/4] add docstrings for openai and preprocessing modules --- skllm/gpt4all_client.py | 22 ++++++++++- skllm/models/gpt_zero_shot_clf.py | 25 ++++++++++++ skllm/openai/chatgpt.py | 51 +++++++++++++++++++++++-- skllm/openai/credentials.py | 12 +++++- skllm/openai/embeddings.py | 25 +++++++++++- skllm/openai/mixin.py | 21 +++++++++- skllm/preprocessing/gpt_summarizer.py | 30 ++++++++++++++- skllm/preprocessing/gpt_translator.py | 2 +- skllm/preprocessing/gpt_vectorizer.py | 55 ++++++++++++++++++++++++++- skllm/utils.py | 32 ++++++++++++++-- 10 files changed, 260 insertions(+), 15 deletions(-) diff --git a/skllm/gpt4all_client.py b/skllm/gpt4all_client.py index e8ef0ae..943dc94 100644 --- a/skllm/gpt4all_client.py +++ b/skllm/gpt4all_client.py @@ -1,3 +1,5 @@ +from typing import Dict + try: from gpt4all import GPT4All except (ImportError, ModuleNotFoundError): @@ -6,18 +8,34 @@ _loaded_models = {} -def get_chat_completion(messages, model="ggml-gpt4all-j-v1.3-groovy"): +def get_chat_completion(messages: str, model: str="ggml-gpt4all-j-v1.3-groovy") -> Dict: + """ + Get a chat completion from GPT4All which Format list of message dictionaries into a prompt and call model + generate on prompt + + Parameters + ---------- + messages : str + The messages to use as a prompt for the chat completion. + model : str + The model to use for the chat completion. Defaults to "ggml-gpt4all-j-v1.3-groovy". 
+ + Returns + ------- + completion : Dict + """ if GPT4All is None: raise ImportError( "gpt4all is not installed, try `pip install scikit-llm[gpt4all]`" ) if model not in _loaded_models.keys(): _loaded_models[model] = GPT4All(model) + return _loaded_models[model].chat_completion( messages, verbose=False, streaming=False, temp=1e-10 ) -def unload_models(): +def unload_models() -> None: global _loaded_models _loaded_models = {} diff --git a/skllm/models/gpt_zero_shot_clf.py b/skllm/models/gpt_zero_shot_clf.py index fe64cfb..05cd03e 100644 --- a/skllm/models/gpt_zero_shot_clf.py +++ b/skllm/models/gpt_zero_shot_clf.py @@ -19,6 +19,18 @@ class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin, _OAIMixin): + """ + A base class for zero-shot classification using GPT-3. + + Initialization Parameters + ---------- + openai_key : str, optional + The OPEN AI key to use. Defaults to None. + openai_org : str, optional + The OPEN AI organization to use. Defaults to None. + openai_model : str, optional + The OPEN AI model to use. Defaults to "gpt-3.5-turbo". + """ def __init__( self, openai_key: Optional[str] = None, @@ -29,6 +41,19 @@ def __init__( self.openai_model = openai_model def _to_np(self, X): + """ + Convert X to a numpy array. + + Parameters + ---------- + X : Any + The input data to convert to a numpy array. + + Returns + ------- + np.ndarray + The input data as a numpy array. + """ return _to_numpy(X) def fit( diff --git a/skllm/openai/chatgpt.py b/skllm/openai/chatgpt.py index dc1d862..576ee3c 100644 --- a/skllm/openai/chatgpt.py +++ b/skllm/openai/chatgpt.py @@ -1,5 +1,6 @@ import json from time import sleep +from typing import Any import openai @@ -7,13 +8,47 @@ from skllm.utils import find_json_in_string -def construct_message(role, content): +def construct_message(role: str, content: str) -> dict: + """ + Construct a message for the OpenAI API. + + Parameters + ---------- + role : str + The role of the message. 
Must be one of "system", "user", or "assistant". + content : str + The content of the message. + + Returns + ------- + message : dict + """ if role not in ("system", "user", "assistant"): raise ValueError("Invalid role") return {"role": role, "content": content} -def get_chat_completion(messages, key, org, model="gpt-3.5-turbo", max_retries=3): +def get_chat_completion(messages: str, key: str, org: str, model: str="gpt-3.5-turbo", max_retries: int=3): + """ + Get a chat completion from the OpenAI API. + + Parameters + ---------- + messages : str + input messages to use. + key : str + The OPEN AI key to use. + org : str + The OPEN AI organization to use. + model : str, optional + The OPEN AI model to use. Defaults to "gpt-3.5-turbo". + max_retries : int, optional + The maximum number of retries to use. Defaults to 3. + + Returns + ------- + completion : dict + """ set_credentials(key, org) error_msg = None error_type = None @@ -33,7 +68,17 @@ def get_chat_completion(messages, key, org, model="gpt-3.5-turbo", max_retries=3 ) -def extract_json_key(json_, key): +def extract_json_key(json_: str, key: str) -> Any: + """ + Extracts a key from a JSON string. + + Parameters + ---------- + json_ : str + The JSON string to extract the key from. + key : str + The key to extract. + """ try: json_ = json_.replace("\n", "") json_ = find_json_in_string(json_) diff --git a/skllm/openai/credentials.py b/skllm/openai/credentials.py index 6abdc42..959c20f 100644 --- a/skllm/openai/credentials.py +++ b/skllm/openai/credentials.py @@ -1,5 +1,15 @@ import openai -def set_credentials(key: str, org: str): +def set_credentials(key: str, org: str) -> None: + """ + Set the OpenAI key and organization. + + Parameters + ---------- + key : str + The OpenAI key to use. + org : str + The OpenAI organization to use. 
+ """ openai.api_key = key openai.organization = org diff --git a/skllm/openai/embeddings.py b/skllm/openai/embeddings.py index cd45b8a..3cf1fa7 100644 --- a/skllm/openai/embeddings.py +++ b/skllm/openai/embeddings.py @@ -3,8 +3,29 @@ from skllm.openai.credentials import set_credentials def get_embedding( - text, key: str, org: str, model="text-embedding-ada-002", max_retries=3 -): + text: str, key: str, org: str, model: str="text-embedding-ada-002", max_retries: int=3 +): + """ + Encode a string and return the GPT embedding for a string. + + Parameters + ---------- + text : str + The string to encode. + key : str + The OPEN AI key to use. + org : str + The OPEN AI organization to use. + model : str, optional + The OPEN AI model to use. Defaults to "text-embedding-ada-002". + max_retries : int, optional + The maximum number of retries to use. Defaults to 3. + + Returns + ------- + emb : list + The GPT embedding for the string. + """ set_credentials(key, org) text = text.replace("\n", " ") error_msg = None diff --git a/skllm/openai/mixin.py b/skllm/openai/mixin.py index c446207..b5dd5e9 100644 --- a/skllm/openai/mixin.py +++ b/skllm/openai/mixin.py @@ -2,12 +2,24 @@ from skllm.config import SKLLMConfig as _Config class OpenAIMixin: - + """ + A mixin class that provides OpenAI key and organization to other classes. + """ def _set_keys(self, key: Optional[str] = None, org: Optional[str] = None) -> None: + """ + Set the OpenAI key and organization. + """ self.openai_key = key self.openai_org = org def _get_openai_key(self) -> str: + """ + Get the OpenAI key from the class or the config file. + + Returns + ------- + openai_key: str + """ key = self.openai_key if key is None: key = _Config.get_openai_key() @@ -16,6 +28,13 @@ def _get_openai_key(self) -> str: return key def _get_openai_org(self) -> str: + """ + Get the OpenAI organization from the class or the config file. 
+ + Returns + ------- + openai_org: str + """ key = self.openai_org if key is None: key = _Config.get_openai_org() diff --git a/skllm/preprocessing/gpt_summarizer.py b/skllm/preprocessing/gpt_summarizer.py index 8804275..2260a1a 100644 --- a/skllm/preprocessing/gpt_summarizer.py +++ b/skllm/preprocessing/gpt_summarizer.py @@ -5,6 +5,21 @@ class GPTSummarizer(_BaseGPT): + """ + A text summarizer. + + Parameters + ---------- + openai_key : str, optional + The OPEN AI key to use. Defaults to None. + openai_org : str, optional + The OPEN AI organization to use. Defaults to None. + openai_model : str, optional + The OPEN AI model to use. Defaults to "gpt-3.5-turbo". + max_words : int, optional + The maximum number of words to use in the summary. Defaults to 15. + + """ system_msg = "You are a text summarizer." default_output = "Summary is unavailable." @@ -18,6 +33,19 @@ def __init__( self._set_keys(openai_key, openai_org) self.openai_model = openai_model self.max_words = max_words + - def _get_prompt(self, X) -> str: + def _get_prompt(self, X: str) -> str: + """ + Generates the prompt for the given input. 
+ + Parameters + ---------- + X : str + sample to summarize + + Returns + ------- + str + """ return build_summary_prompt(X, self.max_words) diff --git a/skllm/preprocessing/gpt_translator.py b/skllm/preprocessing/gpt_translator.py index 35e4d68..368ef58 100644 --- a/skllm/preprocessing/gpt_translator.py +++ b/skllm/preprocessing/gpt_translator.py @@ -16,7 +16,7 @@ def __init__( openai_org: Optional[str] = None, openai_model: str = "gpt-3.5-turbo", output_language: str = "English", - ): + ) -> None: self._set_keys(openai_key, openai_org) self.openai_model = openai_model self.output_language = output_language diff --git a/skllm/preprocessing/gpt_vectorizer.py b/skllm/preprocessing/gpt_vectorizer.py index 3bf72a5..ff0d54b 100644 --- a/skllm/preprocessing/gpt_vectorizer.py +++ b/skllm/preprocessing/gpt_vectorizer.py @@ -13,6 +13,18 @@ class GPTVectorizer(_BaseEstimator, _TransformerMixin, _OAIMixin): + """ + A class that uses OPEN AI embedding model that converts text to GPT embeddings. + + Parameters + ---------- + openai_embedding_model : str + The OPEN AI embedding model to use. Defaults to "text-embedding-ada-002". + openai_key : str, optional + The OPEN AI key to use. Defaults to None. + openai_org : str, optional + The OPEN AI organization to use. Defaults to None. + """ def __init__( self, openai_embedding_model: str = "text-embedding-ada-002", @@ -22,10 +34,37 @@ def __init__( self.openai_embedding_model = openai_embedding_model self._set_keys(openai_key, openai_org) - def fit(self, X: Any = None, y: Any = None, **kwargs): + def fit(self, X: Any = None, y: Any = None, **kwargs) -> "GPTVectorizer": + """ + Fit the GPTVectorizer to the data. 
+ This is modelled to function as the sklearn fit method + + Parameters + ---------- + X : Any, optional + y : Any, optional + kwargs : dict, optional + + Returns + ------- + self : GPTVectorizer + """ return self def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]]) -> ndarray: + """ + Transform a list of strings into a list of GPT embeddings. + This is modelled to function as the sklearn transform meethod + + Parameters + ---------- + X : np.ndarray, pd.Series, or list + The list of strings to transform into GPT embeddings. + + Returns + ------- + embeddings : np.ndarray + """ X = _to_numpy(X) embeddings = [] for i in tqdm(range(len(X))): @@ -36,4 +75,18 @@ def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]]) -> nda return embeddings def fit_transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], y=None, **fit_params) -> ndarray: + """ + Fit and transform a list of strings into a list of GPT embeddings. + This is modelled to function as the sklearn fit_transform method + + Parameters + ---------- + X : np.ndarray, pd.Series, or list + The list of strings to transform into GPT embeddings. + y : Any, optional + + Returns + ------- + embeddings : np.ndarray + """ return self.fit(X, y).transform(X) diff --git a/skllm/utils.py b/skllm/utils.py index 49b2c03..22a39a1 100644 --- a/skllm/utils.py +++ b/skllm/utils.py @@ -1,8 +1,21 @@ import numpy as np import pandas as pd +from typing import Any -def to_numpy(X): +def to_numpy(X: Any) -> np.ndarray: + """ + Convert a pandas Series or list to a numpy array. + + Parameters + ---------- + X : pd.Series or list + The data to convert to a numpy array. + + Returns + ------- + X : np.ndarray + """ if isinstance(X, pd.Series): X = X.to_numpy().astype(object) elif isinstance(X, list): @@ -12,11 +25,24 @@ def to_numpy(X): return X -def find_json_in_string(string): +def find_json_in_string(string: str) -> str: + """ + Find the first JSON object in a string. 
+ + Parameters + ---------- + string : str + The string to search for a JSON object. + + Returns + ------- + json_string : str + """ + start = string.find("{") end = string.rfind("}") if start != -1 and end != -1: json_string = string[start : end + 1] else: - json_string = {} + json_string = "{}" return json_string From 5a12ef6aa20e7fdedd3359841899ed4cf433e849 Mon Sep 17 00:00:00 2001 From: Ogundepo Odunayo Date: Thu, 1 Jun 2023 21:24:28 -0400 Subject: [PATCH 2/4] Update class methods --- skllm/models/gpt_zero_shot_clf.py | 94 +++++++++++++++++++++++++-- skllm/openai/base_gpt.py | 54 ++++++++++++++- skllm/openai/chatgpt.py | 2 +- skllm/openai/credentials.py | 2 +- skllm/openai/embeddings.py | 2 +- skllm/openai/mixin.py | 2 +- skllm/preprocessing/gpt_summarizer.py | 4 +- skllm/preprocessing/gpt_vectorizer.py | 6 +- 8 files changed, 153 insertions(+), 13 deletions(-) diff --git a/skllm/models/gpt_zero_shot_clf.py b/skllm/models/gpt_zero_shot_clf.py index 05cd03e..ed5e1c5 100644 --- a/skllm/models/gpt_zero_shot_clf.py +++ b/skllm/models/gpt_zero_shot_clf.py @@ -27,7 +27,7 @@ class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin, _OAIMixin) openai_key : str, optional The OPEN AI key to use. Defaults to None. openai_org : str, optional - The OPEN AI organization to use. Defaults to None. + The OPEN AI organization ID to use. Defaults to None. openai_model : str, optional The OPEN AI model to use. Defaults to "gpt-3.5-turbo". """ @@ -66,6 +66,18 @@ def fit( return self def predict(self, X: Union[np.ndarray, pd.Series, List[str]]): + """ + Predict the class of each input. + + Parameters + ---------- + X : Union[np.ndarray, pd.Series, List[str]] + The input data to predict the class of. 
+ + Returns + ------- + List[str] + """ X = self._to_np(X) predictions = [] for i in tqdm(range(len(X))): @@ -76,7 +88,7 @@ def predict(self, X: Union[np.ndarray, pd.Series, List[str]]): def _extract_labels(self, y: Any) -> List[str]: pass - def _get_unique_targets(self, y): + def _get_unique_targets(self, y:Any): labels = self._extract_labels(y) counts = Counter(labels) @@ -102,6 +114,9 @@ def _get_chat_completion(self, x): class ZeroShotGPTClassifier(_BaseZeroShotGPTClassifier): + """ + A zero-shot classifier using GPT-3. + """ def __init__( self, openai_key: Optional[str] = None, @@ -111,6 +126,17 @@ def __init__( super().__init__(openai_key, openai_org, openai_model) def _extract_labels(self, y: Any) -> List[str]: + """ + Return the class labels as a list. + + Parameters + ---------- + y : Any + + Returns + ------- + List[str] + """ if isinstance(y, (pd.Series, np.ndarray)): labels = y.tolist() else: @@ -120,7 +146,19 @@ def _extract_labels(self, y: Any) -> List[str]: def _get_prompt(self, x) -> str: return build_zero_shot_prompt_slc(x, repr(self.classes_)) - def _predict_single(self, x): + def _predict_single(self, x: str) -> str: + """ + Predict the class for a single input. + + Parameters + ---------- + x : str + The input to predict the class of. + + Returns + ------- + str + """ completion = self._get_chat_completion(x) try: if self.openai_model.startswith("gpt4all::"): @@ -154,6 +192,20 @@ def fit( class MultiLabelZeroShotGPTClassifier(_BaseZeroShotGPTClassifier): + """ + A zero-shot classifier using GPT-3 for multi-label classification. + + Initialization Parameters + ---------- + openai_key : str, optional + The OPEN AI key to use. Defaults to None. + openai_org : str, optional + The OPEN AI organization ID to use. Defaults to None. + openai_model : str, optional + The OPEN AI model to use. Defaults to "gpt-3.5-turbo". + max_labels : int, optional + The maximum number of labels to predict. Defaults to 3. 
+ """ def __init__( self, openai_key: Optional[str] = None, @@ -167,6 +219,17 @@ def __init__( self.max_labels = max_labels def _extract_labels(self, y) -> List[str]: + """ + Extract the labels into a list. + + Parameters + ---------- + y : Any + + Returns + ------- + List[str] + """ labels = [] for l in y: for j in l: @@ -177,7 +240,19 @@ def _extract_labels(self, y) -> List[str]: def _get_prompt(self, x) -> str: return build_zero_shot_prompt_mlc(x, repr(self.classes_), self.max_labels) - def _predict_single(self, x): + def _predict_single(self, x) -> List[str]: + """ + Predict the class for a single input. + + Parameters + ---------- + x : str + The input to predict the class of. + + Returns + ------- + str + """ completion = self._get_chat_completion(x) try: labels = extract_json_key(completion.choices[0].message["content"], "label") @@ -199,4 +274,15 @@ def fit( X: Optional[Union[np.ndarray, pd.Series, List[str]]], y: List[List[str]], ): + """ + Calls the parent fit method. + + Parameters + ---------- + X : Optional[Union[np.ndarray, pd.Series, List[str]]] + The input data. + y : List[List[str]] + The labels. + + """ return super().fit(X, y) diff --git a/skllm/openai/base_gpt.py b/skllm/openai/base_gpt.py index 49c3570..65ee50b 100644 --- a/skllm/openai/base_gpt.py +++ b/skllm/openai/base_gpt.py @@ -20,6 +20,19 @@ class BaseZeroShotGPTTransformer(_BaseEstimator, _TransformerMixin, _OAIMixin): default_output = "Output is unavailable" def _get_chat_completion(self, X): + """ + Get the chat completion for the given input using open ai API. 
+ + Parameters + ---------- + X : str + Input string + + Returns + ------- + str + + """ prompt = self._get_prompt(X) msgs = [] msgs.append(construct_message("system", self.system_msg)) @@ -33,10 +46,37 @@ def _get_chat_completion(self, X): print(f"Skipping a sample due to the following error: {str(e)}") return self.default_output - def fit(self, X: Any = None, y: Any = None, **kwargs: Any): + def fit(self, X: Any = None, y: Any = None, **kwargs: Any) -> "_BaseZeroShotGPTTransformer": + """ + This method is modelled to function as the sklearn fit method + This method does not do anything and is only present to make the + class compatible with sklearn pipelines. + + Parameters + ---------- + X : Any, optional + y : Any, optional + kwargs : dict, optional + + Returns + ------- + self : BaseZeroShotGPTTransformer + """ + return self def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], **kwargs: Any) -> ndarray: + """ + Convert a list of strings using the open ai API and a predefined prompt. + + Parameters + ---------- + X : Union[np.ndarray, pd.Series, List[str]] + + Returns + ------- + ndarray + """ X = _to_numpy(X) transformed = [] for i in tqdm(range(len(X))): @@ -47,4 +87,16 @@ def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], **kwar return transformed def fit_transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], y=None, **fit_params) -> ndarray: + """ + Fit and transform a list of strings using the transform method. 
+ This is modelled to function as the sklearn fit_transform method + + Parameters + ---------- + X : np.ndarray, pd.Series, or list + + Returns + ------- + ndarray + """ return self.fit(X, y).transform(X) \ No newline at end of file diff --git a/skllm/openai/chatgpt.py b/skllm/openai/chatgpt.py index 576ee3c..99e5c55 100644 --- a/skllm/openai/chatgpt.py +++ b/skllm/openai/chatgpt.py @@ -39,7 +39,7 @@ def get_chat_completion(messages: str, key: str, org: str, model: str="gpt-3.5-t key : str The OPEN AI key to use. org : str - The OPEN AI organization to use. + The OPEN AI organization ID to use. model : str, optional The OPEN AI model to use. Defaults to "gpt-3.5-turbo". max_retries : int, optional diff --git a/skllm/openai/credentials.py b/skllm/openai/credentials.py index 959c20f..ac11815 100644 --- a/skllm/openai/credentials.py +++ b/skllm/openai/credentials.py @@ -9,7 +9,7 @@ def set_credentials(key: str, org: str) -> None: key : str The OpenAI key to use. org : str - The OpenAI organization to use. + The OPEN AI organization ID to use. """ openai.api_key = key openai.organization = org diff --git a/skllm/openai/embeddings.py b/skllm/openai/embeddings.py index 3cf1fa7..91c6f42 100644 --- a/skllm/openai/embeddings.py +++ b/skllm/openai/embeddings.py @@ -15,7 +15,7 @@ def get_embedding( key : str The OPEN AI key to use. org : str - The OPEN AI organization to use. + The OPEN AI organization ID to use. model : str, optional The OPEN AI model to use. Defaults to "text-embedding-ada-002". max_retries : int, optional diff --git a/skllm/openai/mixin.py b/skllm/openai/mixin.py index b5dd5e9..0877327 100644 --- a/skllm/openai/mixin.py +++ b/skllm/openai/mixin.py @@ -29,7 +29,7 @@ def _get_openai_key(self) -> str: def _get_openai_org(self) -> str: """ - Get the OpenAI organization from the class or the config file. + Get the OpenAI organization ID from the class or the config file. 
Returns ------- diff --git a/skllm/preprocessing/gpt_summarizer.py b/skllm/preprocessing/gpt_summarizer.py index 2260a1a..e5b22c3 100644 --- a/skllm/preprocessing/gpt_summarizer.py +++ b/skllm/preprocessing/gpt_summarizer.py @@ -13,7 +13,7 @@ class GPTSummarizer(_BaseGPT): openai_key : str, optional The OPEN AI key to use. Defaults to None. openai_org : str, optional - The OPEN AI organization to use. Defaults to None. + The OPEN AI organization ID to use. Defaults to None. openai_model : str, optional The OPEN AI model to use. Defaults to "gpt-3.5-turbo". max_words : int, optional @@ -33,7 +33,7 @@ def __init__( self._set_keys(openai_key, openai_org) self.openai_model = openai_model self.max_words = max_words - + def _get_prompt(self, X: str) -> str: """ diff --git a/skllm/preprocessing/gpt_vectorizer.py b/skllm/preprocessing/gpt_vectorizer.py index ff0d54b..5013bdc 100644 --- a/skllm/preprocessing/gpt_vectorizer.py +++ b/skllm/preprocessing/gpt_vectorizer.py @@ -23,7 +23,7 @@ class GPTVectorizer(_BaseEstimator, _TransformerMixin, _OAIMixin): openai_key : str, optional The OPEN AI key to use. Defaults to None. openai_org : str, optional - The OPEN AI organization to use. Defaults to None. + The OPEN AI organization ID to use. Defaults to None. """ def __init__( self, @@ -37,7 +37,9 @@ def __init__( def fit(self, X: Any = None, y: Any = None, **kwargs) -> "GPTVectorizer": """ Fit the GPTVectorizer to the data. - This is modelled to function as the sklearn fit method + This is modelled to function as the sklearn fit method. + This method does not do anything and is only present to make the + GPTVectorizer compatible with sklearn pipelines. 
Parameters ---------- From 2e829317eae88918eaa3ad443f0b720d46475024 Mon Sep 17 00:00:00 2001 From: Ogundepo Odunayo Date: Sun, 4 Jun 2023 13:29:43 -0400 Subject: [PATCH 3/4] fix review issues --- skllm/gpt4all_client.py | 4 ++-- skllm/models/gpt_zero_shot_clf.py | 14 +++++++++++++- skllm/preprocessing/gpt_vectorizer.py | 12 +++++++----- skllm/utils.py | 2 +- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/skllm/gpt4all_client.py b/skllm/gpt4all_client.py index 943dc94..ed699b3 100644 --- a/skllm/gpt4all_client.py +++ b/skllm/gpt4all_client.py @@ -8,14 +8,14 @@ _loaded_models = {} -def get_chat_completion(messages: str, model: str="ggml-gpt4all-j-v1.3-groovy") -> Dict: +def get_chat_completion(messages: Dict, model: str="ggml-gpt4all-j-v1.3-groovy") -> Dict: """ Get a chat completion from GPT4All which Format list of message dictionaries into a prompt and call model generate on prompt Parameters ---------- - messages : str + messages : Dict The messages to use as a prompt for the chat completion. model : str The model to use for the chat completion. Defaults to "ggml-gpt4all-j-v1.3-groovy". diff --git a/skllm/models/gpt_zero_shot_clf.py b/skllm/models/gpt_zero_shot_clf.py index 264e5c5..c1e9c08 100644 --- a/skllm/models/gpt_zero_shot_clf.py +++ b/skllm/models/gpt_zero_shot_clf.py @@ -73,6 +73,18 @@ def fit( X: Optional[Union[np.ndarray, pd.Series, List[str]]], y: Union[np.ndarray, pd.Series, List[str], List[List[str]]], ): + """ + Fits the model by storing the training data and extracting the unique targets. + + Parameters + ---------- + X : Optional[Union[np.ndarray, pd.Series, List[str]]] + The input array data to fit the model to. + + y : Union[np.ndarray, pd.Series, List[str], List[List[str]]] + The target array data to fit the model to. 
+ + """ X = self._to_np(X) self.classes_, self.probabilities_ = self._get_unique_targets(y) return self @@ -307,7 +319,7 @@ def fit( Parameters ---------- X : Optional[Union[np.ndarray, pd.Series, List[str]]] - The input data. + Input array data y : List[List[str]] The labels. diff --git a/skllm/preprocessing/gpt_vectorizer.py b/skllm/preprocessing/gpt_vectorizer.py index 14d89c6..0de909b 100644 --- a/skllm/preprocessing/gpt_vectorizer.py +++ b/skllm/preprocessing/gpt_vectorizer.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any, List, Optional, Union import numpy as np @@ -34,7 +36,7 @@ def __init__( self.openai_embedding_model = openai_embedding_model self._set_keys(openai_key, openai_org) - def fit(self, X: Any = None, y: Any = None, **kwargs) -> "GPTVectorizer": + def fit(self, X: Any = None, y: Any = None, **kwargs) -> GPTVectorizer: """ Fit the GPTVectorizer to the data. This is modelled to function as the sklearn fit method. @@ -60,8 +62,8 @@ def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]]) -> nda Parameters ---------- - X : np.ndarray, pd.Series, or list - The list of strings to transform into GPT embeddings. + X : Optional[Union[np.ndarray, pd.Series, List[str]]] + The input array of strings to transform into GPT embeddings. Returns ------- @@ -83,8 +85,8 @@ def fit_transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], y= Parameters ---------- - X : np.ndarray, pd.Series, or list - The list of strings to transform into GPT embeddings. + X : Optional[Union[np.ndarray, pd.Series, List[str]]] + The input array of strings to transform into GPT embeddings. y : Any, optional Returns diff --git a/skllm/utils.py b/skllm/utils.py index 22a39a1..f17979b 100644 --- a/skllm/utils.py +++ b/skllm/utils.py @@ -27,7 +27,7 @@ def to_numpy(X: Any) -> np.ndarray: def find_json_in_string(string: str) -> str: """ - Find the first JSON object in a string. + Find the JSON object in a string. 
Parameters ---------- From aa0ba8469bc029f5fedda9be775a7e45eed05d5c Mon Sep 17 00:00:00 2001 From: Ogundepo Odunayo Date: Sun, 4 Jun 2023 14:43:27 -0400 Subject: [PATCH 4/4] Fix more reviews --- skllm/completions.py | 5 ++++- skllm/gpt4all_client.py | 3 +-- skllm/models/gpt_few_shot_clf.py | 3 +-- skllm/models/gpt_zero_shot_clf.py | 16 +++++++++++----- skllm/openai/base_gpt.py | 14 +++++++------- skllm/openai/chatgpt.py | 18 +++++++++++++----- skllm/openai/embeddings.py | 4 ++-- skllm/preprocessing/gpt_vectorizer.py | 10 ++++------ skllm/utils.py | 6 +++--- 9 files changed, 46 insertions(+), 33 deletions(-) diff --git a/skllm/completions.py b/skllm/completions.py index 02e07df..dc64911 100644 --- a/skllm/completions.py +++ b/skllm/completions.py @@ -3,8 +3,11 @@ def get_chat_completion( - messages, openai_key=None, openai_org=None, model="gpt-3.5-turbo", max_retries=3 + messages: dict, openai_key: str=None, openai_org: str=None, model: str="gpt-3.5-turbo", max_retries: int=3 ): + """ + Gets a chat completion from the OpenAI API. + """ if model.startswith("gpt4all::"): return _g4a_get_chat_completion(messages, model[9:]) else: diff --git a/skllm/gpt4all_client.py b/skllm/gpt4all_client.py index ed699b3..2331463 100644 --- a/skllm/gpt4all_client.py +++ b/skllm/gpt4all_client.py @@ -10,8 +10,7 @@ def get_chat_completion(messages: Dict, model: str="ggml-gpt4all-j-v1.3-groovy") -> Dict: """ - Get a chat completion from GPT4All which Format list of message dictionaries into a prompt and call model - generate on prompt + Gets a chat completion from GPT4All Parameters ---------- diff --git a/skllm/models/gpt_few_shot_clf.py b/skllm/models/gpt_few_shot_clf.py index b552f38..31a6747 100644 --- a/skllm/models/gpt_few_shot_clf.py +++ b/skllm/models/gpt_few_shot_clf.py @@ -25,8 +25,7 @@ def fit( X: Union[np.ndarray, pd.Series, List[str]], y: Union[np.ndarray, pd.Series, List[str]], ): - """Fits the model by storing the training data and extracting the - unique targets. 
+ """Fits the model to the given data. Parameters ---------- diff --git a/skllm/models/gpt_zero_shot_clf.py b/skllm/models/gpt_zero_shot_clf.py index c1e9c08..fd8a75c 100644 --- a/skllm/models/gpt_zero_shot_clf.py +++ b/skllm/models/gpt_zero_shot_clf.py @@ -49,7 +49,7 @@ def __init__( def _to_np(self, X): """ - Convert X to a numpy array. + Converts X to a numpy array. Parameters ---------- @@ -74,7 +74,7 @@ def fit( y: Union[np.ndarray, pd.Series, List[str], List[List[str]]], ): """ - Fits the model by storing the training data and extracting the unique targets. + Extracts the unique targets from y. Parameters ---------- @@ -91,7 +91,7 @@ def fit( def predict(self, X: Union[np.ndarray, pd.Series, List[str]]): """ - Predict the class of each input. + Predicts the class of each input. Parameters ---------- @@ -193,6 +193,9 @@ def _get_default_label(self): return self.default_label def _predict_single(self, x): + """ + Predicts the labels for a single sample. + """ completion = self._get_chat_completion(x) try: label = str( @@ -256,7 +259,7 @@ def __init__( def _extract_labels(self, y) -> List[str]: """ - Extract the labels into a list. + Extracts the labels into a list. Parameters ---------- @@ -290,6 +293,9 @@ def _get_default_label(self): return result def _predict_single(self, x): + """ + Predicts the labels for a single sample. + """ completion = self._get_chat_completion(x) try: labels = extract_json_key(completion["choices"][0]["message"]["content"], "label") @@ -314,7 +320,7 @@ def fit( y: List[List[str]], ): """ - Calls the parent fit method. + Calls the parent fit method on input data. 
Parameters ---------- diff --git a/skllm/openai/base_gpt.py b/skllm/openai/base_gpt.py index 0499941..3fcc6c8 100644 --- a/skllm/openai/base_gpt.py +++ b/skllm/openai/base_gpt.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any, List, Optional, Union import numpy as np @@ -19,7 +21,7 @@ class BaseZeroShotGPTTransformer(_BaseEstimator, _TransformerMixin, _OAIMixin): def _get_chat_completion(self, X): """ - Get the chat completion for the given input using open ai API. + Gets the chat completion for the given input using open ai API. Parameters ---------- @@ -44,11 +46,9 @@ def _get_chat_completion(self, X): print(f"Skipping a sample due to the following error: {str(e)}") return self.default_output - def fit(self, X: Any = None, y: Any = None, **kwargs: Any) -> "_BaseZeroShotGPTTransformer": + def fit(self, X: Any = None, y: Any = None, **kwargs: Any) -> BaseZeroShotGPTTransformer: """ - This method is modelled to function as the sklearn fit method - This method does not do anything and is only present to make the - class compatible with sklearn pipelines. + Fits the model to the data. Parameters ---------- @@ -65,7 +65,7 @@ class compatible with sklearn pipelines. def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], **kwargs: Any) -> ndarray: """ - Convert a list of strings using the open ai API and a predefined prompt. + Converts a list of strings using the open ai API and a predefined prompt. Parameters ---------- @@ -86,7 +86,7 @@ def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], **kwar def fit_transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], y=None, **fit_params) -> ndarray: """ - Fit and transform a list of strings using the transform method. + Fits and transforms a list of strings using the transform method. 
This is modelled to function as the sklearn fit_transform method Parameters diff --git a/skllm/openai/chatgpt.py b/skllm/openai/chatgpt.py index bef20ac..7ea93a7 100644 --- a/skllm/openai/chatgpt.py +++ b/skllm/openai/chatgpt.py @@ -10,7 +10,7 @@ def construct_message(role: str, content: str) -> dict: """ - Construct a message for the OpenAI API. + Constructs a message for the OpenAI API. Parameters ---------- @@ -28,13 +28,13 @@ def construct_message(role: str, content: str) -> dict: return {"role": role, "content": content} -def get_chat_completion(messages: str, key: str, org: str, model: str="gpt-3.5-turbo", max_retries: int=3): +def get_chat_completion(messages: dict, key: str, org: str, model: str="gpt-3.5-turbo", max_retries: int=3): """ - Get a chat completion from the OpenAI API. + Gets a chat completion from the OpenAI API. Parameters ---------- - messages : str + messages : dict input messages to use. key : str The OPEN AI key to use. @@ -69,7 +69,15 @@ def get_chat_completion(messages: str, key: str, org: str, model: str="gpt-3.5-t -def extract_json_key(json_, key): +def extract_json_key(json_: str, key: str): + """ + Extracts JSON key from a string. + + json_ : str + The JSON string to extract the key from. + key : str + The key to extract. + """ original_json = json_ for i in range(2): try: diff --git a/skllm/openai/embeddings.py b/skllm/openai/embeddings.py index e150f00..6712bda 100644 --- a/skllm/openai/embeddings.py +++ b/skllm/openai/embeddings.py @@ -9,7 +9,7 @@ def get_embedding( text: str, key: str, org: str, model: str="text-embedding-ada-002", max_retries: int=3 ): """ - Encode a string and return the GPT embedding for a string. + Encodes a string and returns its embedding. Parameters ---------- @@ -20,7 +20,7 @@ def get_embedding( org : str The OPEN AI organization ID to use. model : str, optional - The OPEN AI model to use. Defaults to "text-embedding-ada-002". + The model to use. Defaults to "text-embedding-ada-002". 
max_retries : int, optional The maximum number of retries to use. Defaults to 3. diff --git a/skllm/preprocessing/gpt_vectorizer.py b/skllm/preprocessing/gpt_vectorizer.py index 0de909b..fe4537b 100644 --- a/skllm/preprocessing/gpt_vectorizer.py +++ b/skllm/preprocessing/gpt_vectorizer.py @@ -38,10 +38,8 @@ def __init__( def fit(self, X: Any = None, y: Any = None, **kwargs) -> GPTVectorizer: """ - Fit the GPTVectorizer to the data. + Fits the GPTVectorizer to the data. This is modelled to function as the sklearn fit method. - This method does not do anything and is only present to make the - GPTVectorizer compatible with sklearn pipelines. Parameters ---------- @@ -57,8 +55,8 @@ def fit(self, X: Any = None, y: Any = None, **kwargs) -> GPTVectorizer: def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]]) -> ndarray: """ - Transform a list of strings into a list of GPT embeddings. - This is modelled to function as the sklearn transform meethod + Transforms a list of strings into a list of GPT embeddings. + This is modelled to function as the sklearn transform method Parameters ---------- @@ -80,7 +78,7 @@ def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]]) -> nda def fit_transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], y=None, **fit_params) -> ndarray: """ - Fit and transform a list of strings into a list of GPT embeddings. + Fits and transforms a list of strings into a list of GPT embeddings. This is modelled to function as the sklearn fit_transform method Parameters diff --git a/skllm/utils.py b/skllm/utils.py index f17979b..eca2289 100644 --- a/skllm/utils.py +++ b/skllm/utils.py @@ -5,11 +5,11 @@ def to_numpy(X: Any) -> np.ndarray: """ - Convert a pandas Series or list to a numpy array. + Converts a pandas Series or list to a numpy array. Parameters ---------- - X : pd.Series or list + X : Any The data to convert to a numpy array. 
Returns @@ -27,7 +27,7 @@ def to_numpy(X: Any) -> np.ndarray: def find_json_in_string(string: str) -> str: """ - Find the JSON object in a string. + Finds the JSON object in a string. Parameters ----------