Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -202,12 +202,6 @@ Note: as the model is not being re-trained, but uses the training data during in

### Dynamic Few-Shot Text Classification

*To use this feature, you need to install `annoy` library:*

```bash
pip install scikit-llm[annoy]
```

`DynamicFewShotGPTClassifier` dynamically selects N samples per class to include in the prompt. This allows the few-shot classifier to scale to datasets that are too large for the standard context window of LLMs.

*How does it work?*
Expand All @@ -227,6 +221,20 @@ clf.fit(X, y)
labels = clf.predict(X)
```

By default, the classifier uses the `kneighbors` algorithm from scikit-learn, which might be slow for large datasets. In that case, you can switch to [annoy](https://github.com/spotify/annoy):

```bash
pip install scikit-llm[annoy]
```

```python
from skllm.memory._annoy import AnnoyMemoryIndex
from skllm.memory.base import IndexConstructor

index = IndexConstructor(AnnoyMemoryIndex)
clf = DynamicFewShotGPTClassifier(memory_index=index)
```

### Text Classification with Google PaLM 2

At the moment 3 PaLM based models are available in test mode:
Expand Down
2 changes: 1 addition & 1 deletion skllm/memory/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from skllm.memory._annoy import AnnoyMemoryIndex
from skllm.memory._sklearn_nn import SklearnMemoryIndex
21 changes: 15 additions & 6 deletions skllm/memory/_annoy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,35 +23,43 @@ class AnnoyMemoryIndex(_BaseMemoryIndex):
metric to use, by default "euclidean"
"""

def __init__(self, dim: int, metric: str = "euclidean", **kwargs: Any) -> None:
def __init__(self, dim: int = -1, metric: str = "euclidean", **kwargs: Any) -> None:
if AnnoyIndex is None:
raise ImportError(
"Annoy is not installed. Please install annoy by running `pip install scikit-llm[annoy]`."
"Annoy is not installed. Please install annoy by running `pip install"
" scikit-llm[annoy]`."
)
self._index = AnnoyIndex(dim, metric)
self.metric = metric
self.dim = dim
self.built = False
self._index = None
self._counter = 0

def add(self, id: int, vector: ndarray) -> None:
def add(self, vector: ndarray) -> None:
"""Adds a vector to the index.

Parameters
----------
id : Any
identifier for the vector
vector : ndarray
vector to add to the index
"""
if self.built:
raise RuntimeError("Cannot add vectors after index is built.")
if self.dim < 0:
raise ValueError("Dimensionality must be positive.")
if not self._index:
self._index = AnnoyIndex(self.dim, self.metric)
id = self._counter
self._index.add_item(id, vector)
self._counter += 1

def build(self) -> None:
    """Builds the index.

    No new vectors can be added after building.

    Raises
    ------
    ValueError
        if the dimensionality is still negative (i.e. it was never set)
    RuntimeError
        if no vectors have been added, so there is no underlying index yet
    """
    if self.dim < 0:
        raise ValueError("Dimensionality must be positive.")
    if self._index is None:
        # The underlying index is only created on the first `add` call, so
        # without any vectors there is nothing to build. Fail with a clear
        # message instead of an AttributeError on `None`.
        raise RuntimeError("Cannot build an index without any vectors.")
    self._index.build(-1)
    self.built = True

Expand All @@ -70,6 +78,7 @@ def retrieve(self, vectors: ndarray, k: int) -> List[List[int]]:
List
ids of retrieved nearest neighbors
"""
print("ANNOY RETRIEVE")
if not self.built:
raise RuntimeError("Cannot retrieve vectors before the index is built.")
return [
Expand Down
66 changes: 66 additions & 0 deletions skllm/memory/_sklearn_nn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from typing import Any, List

import numpy as np
from sklearn.neighbors import NearestNeighbors

from skllm.memory.base import _BaseMemoryIndex


class SklearnMemoryIndex(_BaseMemoryIndex):
    """Nearest-neighbour memory index backed by sklearn's NearestNeighbors.

    Vectors passed to `add` are buffered in a plain list; `build` turns the
    buffer into a matrix and fits the underlying estimator in one go.

    Parameters
    ----------
    dim : int
        dimensionality of the vectors; stored on the instance, not otherwise
        used by this class
    metric : str, optional
        metric to use, by default "euclidean"
    **kwargs : Any
        extra keyword arguments forwarded to `NearestNeighbors`
    """

    def __init__(self, dim: int = -1, metric: str = "euclidean", **kwargs: Any) -> None:
        self._index = NearestNeighbors(metric=metric, **kwargs)
        self.data = []
        self.built = False
        self.dim = dim
        self.metric = metric

    def add(self, vector: np.ndarray) -> None:
        """Appends a vector to the buffer of vectors to be indexed.

        Parameters
        ----------
        vector : np.ndarray
            vector to add to the index

        Raises
        ------
        RuntimeError
            if the index has already been built
        """
        if not self.built:
            self.data.append(vector)
            return
        raise RuntimeError("Cannot add vectors after index is built.")

    def build(self) -> None:
        """Fits the underlying estimator on all buffered vectors.

        No new vectors can be added after building.
        """
        self._index.fit(np.array(self.data))
        self.built = True

    def retrieve(self, vectors: np.ndarray, k: int) -> List[List[int]]:
        """Looks up the k nearest neighbors of every query vector.

        Parameters
        ----------
        vectors : np.ndarray
            vectors to retrieve nearest neighbors for
        k : int
            number of nearest neighbors to retrieve

        Returns
        -------
        List
            ids of retrieved nearest neighbors, one list per query vector

        Raises
        ------
        RuntimeError
            if the index has not been built yet
        """
        if self.built:
            # kneighbors returns (distances, indices); only the indices are needed.
            result = self._index.kneighbors(vectors, n_neighbors=k)
            neighbor_ids = result[1]
            return neighbor_ids.tolist()
        raise RuntimeError("Cannot retrieve vectors before the index is built.")
11 changes: 10 additions & 1 deletion skllm/memory/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Any, List
from typing import Any, List, Type

from numpy import ndarray

Expand Down Expand Up @@ -43,3 +43,12 @@ def build(self) -> None:
All build parameters should be passed to the constructor.
"""
pass


class IndexConstructor:
    """Deferred factory for memory indices.

    Holds an index class together with keyword arguments and instantiates
    the class with those arguments every time the object is called.

    Parameters
    ----------
    index : Type[_BaseMemoryIndex]
        index class to instantiate
    **kwargs : Any
        keyword arguments passed to the index class on each call
    """

    def __init__(self, index: Type[_BaseMemoryIndex], **kwargs: Any) -> None:
        self.kwargs = kwargs
        self.index = index

    def __call__(self) -> _BaseMemoryIndex:
        """Creates and returns a new index instance from the stored class and arguments."""
        index_cls, options = self.index, self.kwargs
        return index_cls(**options)
17 changes: 13 additions & 4 deletions skllm/models/gpt/gpt_dyn_few_shot_clf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import numpy as np
import pandas as pd

from skllm.memory import AnnoyMemoryIndex
from skllm.memory import SklearnMemoryIndex
from skllm.memory.base import IndexConstructor
from skllm.models._base import _BaseZeroShotGPTClassifier
from skllm.preprocessing import GPTVectorizer
from skllm.prompts.builders import build_few_shot_prompt_slc
Expand Down Expand Up @@ -35,6 +36,8 @@ class DynamicFewShotGPTClassifier(_BaseZeroShotGPTClassifier):
default_label : Optional[Union[List[str], str]] , default : 'Random'
The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random
label will be chosen based on probabilities from the training set.
memory_index : Optional[IndexConstructor], default : None
The memory index constructor to use. If None, a SklearnMemoryIndex will be used.
"""

def __init__(
Expand All @@ -44,9 +47,11 @@ def __init__(
openai_org: str | None = None,
openai_model: str = "gpt-3.5-turbo",
default_label: str | None = "Random",
memory_index: IndexConstructor | None = None,
):
super().__init__(openai_key, openai_org, openai_model, default_label)
self.n_examples = n_examples
self.memory_index = memory_index

def fit(
self,
Expand Down Expand Up @@ -79,9 +84,13 @@ def fit(
partition = X[y == cls]
self.data_[cls]["partition"] = partition
embeddings = self.embedding_model_.transform(partition)
index = AnnoyMemoryIndex(embeddings.shape[1])
for i, embedding in enumerate(embeddings):
index.add(i, embedding)
if self.memory_index is not None:
index = self.memory_index()
index.dim = embeddings.shape[1]
else:
index = SklearnMemoryIndex(embeddings.shape[1])
for embedding in embeddings:
index.add(embedding)
index.build()
self.data_[cls]["index"] = index

Expand Down