Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 68 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,15 @@ Note: as the model is not being re-trained, but uses the training data during in

### Dynamic Few-Shot Text Classification

_To use this feature, you need to install `annoy` library:_

```bash
pip install scikit-llm[annoy]
```

`DynamicFewShotGPTClassifier` dynamically selects N samples per class to include in the prompt. This allows the few-shot classifier to scale to datasets that are too large for the standard context window of LLMs.

*How does it work?*
_How does it work?_

During fitting, the whole dataset is partitioned by class, vectorized, and stored.

Expand Down Expand Up @@ -288,6 +294,67 @@ clf.fit(X_train, y_train_encoded)
yh = clf.predict(X_test)
```

### LLM Fine-Tuning

At the moment the following scenarios are supported for tuning:

- **Text classification**: the model is fine-tuned to predict a single label per sample. The following estimators are supported:
- `skllm.models.palm.PaLMClassifier`
- `skllm.models.gpt.GPTClassifier`
- **Text to text**: the model is fine-tuned on arbitrary text input-output pairs. The following estimators are supported:
- `skllm.models.palm.PaLM`
- `skllm.models.gpt.GPT`

Example 1: Fine-tuning a PaLM model for text classification

```python
from skllm.models.palm import PaLMClassifier
clf = PaLMClassifier(n_update_steps=100)
clf.fit(X_train, y_train) # y_train is a list of labels
labels = clf.predict(X_test)
```

Example 2: Fine-tuning a PaLM model for text to text tasks

```python
from skllm.models.palm import PaLM
clf = PaLM(n_update_steps=100)
clf.fit(X_train, y_train) # y_train is any desired output text
labels = clf.predict(X_test)
```

_Note:_ Tuning PaLM models requires a Vertex AI account. Please refer to our [official guide on Medium](https://medium.com/@iryna230520/fine-tune-google-palm-2-with-scikit-llm-d41b0aa673a5) for more details.

Example 3: Fine-tuning a GPT model for text classification

```python
from skllm.models.gpt import GPTClassifier

clf = GPTClassifier(
base_model = "gpt-3.5-turbo-0613",
n_epochs = None, # int or None. When None, will be determined automatically by OpenAI
default_label = "Random", # optional
)

clf.fit(X_train, y_train) # y_train is a list of labels
labels = clf.predict(X_test)
```

Example 4: Fine-tuning a GPT model for text to text tasks

```python
from skllm.models.gpt import GPT

clf = GPT(
base_model = "gpt-3.5-turbo-0613",
n_epochs = None, # int or None. When None, will be determined automatically by OpenAI
system_msg = "You are a text processing model."
)

clf.fit(X_train, y_train) # y_train is any desired output text
labels = clf.predict(X_test)
```

### Text Summarization

GPT excels at performing summarization tasks. Therefore, we provide `GPTSummarizer`, which can be used either as a stand-alone estimator or as a preprocessor (in this case we can make an analogy with a dimensionality reduction preprocessor).
Expand Down
2 changes: 2 additions & 0 deletions skllm/models/gpt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
ZeroShotGPTClassifier,
MultiLabelZeroShotGPTClassifier,
)

from skllm.models.gpt.gpt import GPTClassifier, GPT
174 changes: 174 additions & 0 deletions skllm/models/gpt/gpt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
from typing import Optional, Union, List
import pandas as pd
from skllm.models._base import _BaseZeroShotGPTClassifier
from skllm.prompts.builders import build_zero_shot_prompt_slc
from skllm.openai.credentials import set_credentials
from skllm.openai.tuning import create_tuning_job, await_results, delete_file
import numpy as np
import json
import uuid


def _build_clf_example(
    x: str, y: str, system_msg="You are a text classification model."
):
    """Serialize one training sample as a JSON-encoded chat transcript.

    Parameters
    ----------
    x : str
        content of the "user" message
    y : str
        content of the "assistant" message
    system_msg : str, optional
        content of the "system" message

    Returns
    -------
    str
        JSON string of the form ``{"messages": [...]}`` holding the system,
        user and assistant messages, in that order
    """
    turns = (("system", system_msg), ("user", x), ("assistant", y))
    messages = [{"role": role, "content": content} for role, content in turns]
    return json.dumps({"messages": messages})


class _Tunable:
    """Mixin that adds OpenAI fine-tuning to an estimator.

    The host class is expected to provide ``_get_prompt``, ``_get_openai_key``,
    ``_get_openai_org`` and the attributes ``base_model``, ``n_epochs`` and
    ``custom_suffix``; all of them are read by ``_tune``.
    """

    # System message placed at the start of every training sample.
    system_msg = "You are a text classification model."

    def _build_label(self, label: str):
        """Return the assistant-side target text: the label wrapped in JSON."""
        payload = {"label": label}
        return json.dumps(payload)

    def _tune(self, X, y):
        """Write the samples to a JSONL file, run a tuning job and store its model.

        Parameters
        ----------
        X : iterable of str
            training inputs; each one is passed through ``self._get_prompt``
        y : iterable of str
            training targets; each one is passed through ``self._build_label``
        """
        # A random file name avoids clashes with other files in the directory.
        filename = f"skllm_{uuid.uuid4()}.jsonl"
        with open(filename, "w+") as out:
            for text, target in zip(X, y):
                record = _build_clf_example(
                    self._get_prompt(text),
                    self._build_label(target),
                    self.system_msg,
                )
                # one JSON document per line (JSONL)
                out.write(record + "\n")

        set_credentials(self._get_openai_key(), self._get_openai_org())
        created = create_tuning_job(
            self.base_model,
            filename,
            self.n_epochs,
            self.custom_suffix,
        )
        job_id = created["id"]
        print(f"Created new tuning job. JOB_ID = {job_id}")

        finished = await_results(job_id)
        # From now on the estimator uses the tuned model.
        self.openai_model = finished["fine_tuned_model"]
        delete_file(finished["training_file"])
        print(
            f"Finished training. Number of trained tokens: {finished['trained_tokens']}."
        )


class GPTClassifier(_BaseZeroShotGPTClassifier, _Tunable):
    """Single-label text classifier backed by a fine-tuned GPT model."""

    supported_models = ["gpt-3.5-turbo-0613"]

    def __init__(
        self,
        base_model: str = "gpt-3.5-turbo-0613",
        default_label: Optional[str] = "Random",
        openai_key: Optional[str] = None,
        openai_org: Optional[str] = None,
        n_epochs: Optional[int] = None,
        custom_suffix: Optional[str] = "skllm",
    ):
        """Store the tuning configuration and initialise the base classifier.

        Raises
        ------
        ValueError
            if ``base_model`` is not listed in ``supported_models``
        """
        self.base_model = base_model
        self.n_epochs = n_epochs
        self.custom_suffix = custom_suffix

        is_supported = base_model in self.supported_models
        if not is_supported:
            message = (
                f"Model {base_model} is not supported. Supported models are"
                f" {self.supported_models}"
            )
            raise ValueError(message)

        base_kwargs = dict(
            # placeholder; ``_tune`` assigns the real model name after tuning
            openai_model="undefined",
            default_label=default_label,
            openai_key=openai_key,
            openai_org=openai_org,
        )
        super().__init__(**base_kwargs)

    def _get_prompt(self, x: str) -> str:
        """Build the single-label zero-shot prompt for ``x`` over the fitted classes."""
        labels_repr = repr(self.classes_)
        return build_zero_shot_prompt_slc(x, labels_repr)

    def fit(
        self,
        X: Union[np.ndarray, pd.Series, List[str]],
        y: Union[np.ndarray, pd.Series, List[str]],
    ):
        """Fit the base classifier, then fine-tune the model on the same data.

        Parameters
        ----------
        X : Union[np.ndarray, pd.Series, List[str]]
            training data
        y : Union[np.ndarray, pd.Series, List[str]]
            training labels

        Returns
        -------
        GPTClassifier
            self
        """
        X, y = self._to_np(X), self._to_np(y)
        # The base fit runs first; the prompts built during tuning read
        # ``self.classes_``.
        super().fit(X, y)
        self._tune(X, y)
        return self


# similarly to PaLM, this is not a classifier, but a quick way to re-use the code
# the hierarchy of classes will be reworked in the next releases
class GPT(_BaseZeroShotGPTClassifier, _Tunable):
    """GPT model that can be fine-tuned on arbitrary input-output text pairs.

    It derives from the zero-shot classifier base only to re-use its code;
    it is not a classifier itself.
    """

    supported_models = ["gpt-3.5-turbo-0613"]

    def __init__(
        self,
        base_model: str = "gpt-3.5-turbo-0613",
        openai_key: Optional[str] = None,
        openai_org: Optional[str] = None,
        n_epochs: Optional[int] = None,
        custom_suffix: Optional[str] = "skllm",
        system_msg: Optional[str] = "You are a text processing model.",
    ):
        """Store the tuning configuration and initialise the base class.

        Raises
        ------
        ValueError
            if ``base_model`` is not listed in ``supported_models``
        """
        self.base_model = base_model
        self.n_epochs = n_epochs
        self.custom_suffix = custom_suffix
        self.system_msg = system_msg

        is_supported = base_model in self.supported_models
        if not is_supported:
            message = (
                f"Model {base_model} is not supported. Supported models are"
                f" {self.supported_models}"
            )
            raise ValueError(message)

        base_kwargs = dict(
            openai_model="undefined",  # this will be rewritten later
            default_label="Random",  # just for compatibility
            openai_key=openai_key,
            openai_org=openai_org,
        )
        super().__init__(**base_kwargs)

    def _get_prompt(self, x: str) -> str:
        """Return the input unchanged; no prompt template is applied."""
        return x

    def _build_label(self, label: str):
        """Return the target text unchanged (no JSON wrapping)."""
        return label

    def fit(
        self,
        X: Union[np.ndarray, pd.Series, List[str]],
        y: Union[np.ndarray, pd.Series, List[str]],
    ):
        """Fine-tune the model on the given input-output pairs.

        Parameters
        ----------
        X : Union[np.ndarray, pd.Series, List[str]]
            training data
        y : Union[np.ndarray, pd.Series, List[str]]
            training labels

        Returns
        -------
        GPT
            self
        """
        X, y = self._to_np(X), self._to_np(y)
        self._tune(X, y)
        return self
67 changes: 67 additions & 0 deletions skllm/openai/tuning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from typing import Optional
import openai
from time import sleep
from datetime import datetime
import os


def create_tuning_job(
    model: str,
    training_file: str,
    n_epochs: Optional[int] = None,
    suffix: Optional[str] = None,
):
    """Upload a training file to OpenAI and start a fine-tuning job on it.

    The local ``training_file`` is removed once the uploaded copy has been
    processed.

    Parameters
    ----------
    model : str
        name of the base model to tune
    training_file : str
        path to the local training file
    n_epochs : Optional[int]
        number of epochs; when None, the hyperparameter is not sent
    suffix : Optional[str]
        suffix for the tuned model; when None, it is not sent

    Returns
    -------
    the object returned by ``openai.FineTuningJob.create``

    Raises
    ------
    RuntimeError
        propagated from ``wait_file_ready`` when the uploaded file is missing
        or ends up in a failed state
    """
    # Close the handle as soon as the upload call returns: an open handle
    # leaks a descriptor and makes the os.remove below fail on Windows.
    with open(training_file, "rb") as handle:
        out = openai.File.create(file=handle, purpose="fine-tune")
    print(f"Created new file. FILE_ID = {out['id']}")
    print("Waiting for file to be processed ...")
    while not wait_file_ready(out["id"]):
        sleep(5)
    # delete the training_file after it is uploaded
    os.remove(training_file)
    params = {
        "model": model,
        "training_file": out["id"],
    }
    # Optional settings are only sent when explicitly provided.
    if n_epochs is not None:
        params["hyperparameters"] = {"n_epochs": n_epochs}
    if suffix is not None:
        params["suffix"] = suffix
    return openai.FineTuningJob.create(**params)


def await_results(job_id: str, check_interval: int = 120):
    """Poll a fine-tuning job until it reaches a final status.

    Parameters
    ----------
    job_id : str
        identifier of the fine-tuning job
    check_interval : int
        seconds to sleep between two status checks

    Returns
    -------
    the job object, once its status is "succeeded"

    Raises
    ------
    RuntimeError
        if the job status becomes "failed" or "cancelled"
    """
    unsuccessful = ("failed", "cancelled")
    while True:
        job = openai.FineTuningJob.retrieve(job_id)
        status = job["status"]
        if status == "succeeded":
            return job
        if status in unsuccessful:
            # dump the whole job so the failure details are visible
            print(job)
            raise RuntimeError(f"Tuning job failed with status {status}")
        print(
            f"[{datetime.now()}] Waiting for tuning job to complete. Current status: {status}"
        )
        sleep(check_interval)

def delete_file(file_id: str):
    """Delete the file with the given id via ``openai.File.delete``.

    Parameters
    ----------
    file_id : str
        identifier of the file to delete
    """
    file_api = openai.File
    file_api.delete(file_id)

def wait_file_ready(file_id):
    """Check whether an uploaded file has finished processing.

    Parameters
    ----------
    file_id : str
        identifier of the uploaded file

    Returns
    -------
    bool
        True when the file status is "processed"; False for any other
        non-failure status

    Raises
    ------
    RuntimeError
        if the file is not in the listing, or its status is "error",
        "deleting" or "deleted"
    """
    listing = openai.File.list()["data"]
    # Only the first entry with a matching id is considered.
    entry = next((item for item in listing if item["id"] == file_id), None)
    if entry is None:
        raise RuntimeError(f"File {file_id} not found")

    state = entry["status"]
    if state == "processed":
        return True
    if state in ["error", "deleting", "deleted"]:
        print(entry)
        raise RuntimeError(f"File upload {file_id} failed with status {state}")
    return False