diff --git a/README.md b/README.md index cb0c38f..4ddb0ed 100644 --- a/README.md +++ b/README.md @@ -202,12 +202,6 @@ Note: as the model is not being re-trained, but uses the training data during in ### Dynamic Few-Shot Text Classification -*To use this feature, you need to install `annoy` library:* - -```bash -pip install scikit-llm[annoy] -``` - `DynamicFewShotGPTClassifier` dynamically selects N samples per class to include in the prompt. This allows the few-shot classifier to scale to datasets that are too large for the standard context window of LLMs. *How does it work?* @@ -227,6 +221,20 @@ clf.fit(X, y) labels = clf.predict(X) ``` +By default, the classifier uses the k-nearest neighbors algorithm from scikit-learn, which might be slow for large datasets. In that case, it is possible to switch to [annoy](https://github.com/spotify/annoy): + +```bash +pip install scikit-llm[annoy] +``` + +```python +from skllm.memory._annoy import AnnoyMemoryIndex +from skllm.memory.base import IndexConstructor + +index = IndexConstructor(AnnoyMemoryIndex) +clf = DynamicFewShotGPTClassifier(memory_index=index) +``` + ### Text Classification with Google PaLM 2 At the moment 3 PaLM based models are available in test mode: diff --git a/skllm/memory/__init__.py b/skllm/memory/__init__.py index ad533bf..e2f357d 100644 --- a/skllm/memory/__init__.py +++ b/skllm/memory/__init__.py @@ -1 +1 @@ -from skllm.memory._annoy import AnnoyMemoryIndex +from skllm.memory._sklearn_nn import SklearnMemoryIndex diff --git a/skllm/memory/_annoy.py b/skllm/memory/_annoy.py index c6820d4..d66bf33 100644 --- a/skllm/memory/_annoy.py +++ b/skllm/memory/_annoy.py @@ -23,35 +23,43 @@ class AnnoyMemoryIndex(_BaseMemoryIndex): metric to use, by default "euclidean" """ - def __init__(self, dim: int, metric: str = "euclidean", **kwargs: Any) -> None: + def __init__(self, dim: int = -1, metric: str = "euclidean", **kwargs: Any) -> None: if AnnoyIndex is None: raise ImportError( - "Annoy is not installed. 
Please install annoy by running `pip install scikit-llm[annoy]`." + "Annoy is not installed. Please install annoy by running `pip install" + " scikit-llm[annoy]`." ) - self._index = AnnoyIndex(dim, metric) self.metric = metric self.dim = dim self.built = False + self._index = None + self._counter = 0 - def add(self, id: int, vector: ndarray) -> None: + def add(self, vector: ndarray) -> None: """Adds a vector to the index. Parameters ---------- - id : Any - identifier for the vector vector : ndarray vector to add to the index """ if self.built: raise RuntimeError("Cannot add vectors after index is built.") + if self.dim < 0: + raise ValueError("Dimensionality must be positive.") + if not self._index: + self._index = AnnoyIndex(self.dim, self.metric) + id = self._counter self._index.add_item(id, vector) + self._counter += 1 def build(self) -> None: """Builds the index. No new vectors can be added after building. """ + if self.dim < 0: + raise ValueError("Dimensionality must be positive.") self._index.build(-1) self.built = True @@ -70,6 +78,7 @@ def retrieve(self, vectors: ndarray, k: int) -> List[List[int]]: List ids of retrieved nearest neighbors """ + print("ANNOY RETRIEVE") if not self.built: raise RuntimeError("Cannot retrieve vectors before the index is built.") return [ diff --git a/skllm/memory/_sklearn_nn.py b/skllm/memory/_sklearn_nn.py new file mode 100644 index 0000000..d993566 --- /dev/null +++ b/skllm/memory/_sklearn_nn.py @@ -0,0 +1,66 @@ +from typing import Any, List + +import numpy as np +from sklearn.neighbors import NearestNeighbors + +from skllm.memory.base import _BaseMemoryIndex + + +class SklearnMemoryIndex(_BaseMemoryIndex): + """Memory index using Sklearn's NearestNeighbors. 
+ + Parameters + ---------- + dim : int + dimensionality of the vectors + metric : str, optional + metric to use, by default "euclidean" + """ + + def __init__(self, dim: int = -1, metric: str = "euclidean", **kwargs: Any) -> None: + self._index = NearestNeighbors(metric=metric, **kwargs) + self.metric = metric + self.dim = dim + self.built = False + self.data = [] + + def add(self, vector: np.ndarray) -> None: + """Adds a vector to the index. + + Parameters + ---------- + vector : np.ndarray + vector to add to the index + """ + if self.built: + raise RuntimeError("Cannot add vectors after index is built.") + self.data.append(vector) + + def build(self) -> None: + """Builds the index. + + No new vectors can be added after building. + """ + data_matrix = np.array(self.data) + self._index.fit(data_matrix) + self.built = True + + def retrieve(self, vectors: np.ndarray, k: int) -> List[List[int]]: + """Retrieves the k nearest neighbors for each vector. + + Parameters + ---------- + vectors : np.ndarray + vectors to retrieve nearest neighbors for + k : int + number of nearest neighbors to retrieve + + Returns + ------- + List + ids of retrieved nearest neighbors + """ + if not self.built: + raise RuntimeError("Cannot retrieve vectors before the index is built.") + _, indices = self._index.kneighbors(vectors, n_neighbors=k) + return indices.tolist() diff --git a/skllm/memory/base.py b/skllm/memory/base.py index 248eec2..d3e4aaf 100644 --- a/skllm/memory/base.py +++ b/skllm/memory/base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, List +from typing import Any, List, Type from numpy import ndarray @@ -43,3 +43,12 @@ def build(self) -> None: All build parameters should be passed to the constructor. 
""" pass + + +class IndexConstructor: + def __init__(self, index: Type[_BaseMemoryIndex], **kwargs: Any) -> None: + self.index = index + self.kwargs = kwargs + + def __call__(self) -> _BaseMemoryIndex: + return self.index(**self.kwargs) diff --git a/skllm/models/gpt/gpt_dyn_few_shot_clf.py b/skllm/models/gpt/gpt_dyn_few_shot_clf.py index 1ccc3c8..7a1bd32 100644 --- a/skllm/models/gpt/gpt_dyn_few_shot_clf.py +++ b/skllm/models/gpt/gpt_dyn_few_shot_clf.py @@ -3,7 +3,8 @@ import numpy as np import pandas as pd -from skllm.memory import AnnoyMemoryIndex +from skllm.memory import SklearnMemoryIndex +from skllm.memory.base import IndexConstructor from skllm.models._base import _BaseZeroShotGPTClassifier from skllm.preprocessing import GPTVectorizer from skllm.prompts.builders import build_few_shot_prompt_slc @@ -35,6 +36,8 @@ class DynamicFewShotGPTClassifier(_BaseZeroShotGPTClassifier): default_label : Optional[Union[List[str], str]] , default : 'Random' The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random label will be chosen based on probabilities from the training set. + memory_index : Optional[IndexConstructor], default : None + The memory index constructor to use. If None, a SklearnMemoryIndex will be used. 
""" def __init__( @@ -44,9 +47,11 @@ def __init__( openai_org: str | None = None, openai_model: str = "gpt-3.5-turbo", default_label: str | None = "Random", + memory_index: IndexConstructor | None = None, ): super().__init__(openai_key, openai_org, openai_model, default_label) self.n_examples = n_examples + self.memory_index = memory_index def fit( self, @@ -79,9 +84,13 @@ def fit( partition = X[y == cls] self.data_[cls]["partition"] = partition embeddings = self.embedding_model_.transform(partition) - index = AnnoyMemoryIndex(embeddings.shape[1]) - for i, embedding in enumerate(embeddings): - index.add(i, embedding) + if self.memory_index is not None: + index = self.memory_index() + index.dim = embeddings.shape[1] + else: + index = SklearnMemoryIndex(embeddings.shape[1]) + for embedding in embeddings: + index.add(embedding) index.build() self.data_[cls]["index"] = index