Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ SKLLMConfig.set_openai_key("<YOUR_KEY>")
SKLLMConfig.set_openai_org("<YOUR_ORGANISATION>")
```

**Important notice:**
**Important notice:**

- If you have a free trial OpenAI account, the [rate limits](https://platform.openai.com/docs/guides/rate-limits/overview) are not sufficient (specifically 3 requests per minute). Please switch to the "pay as you go" plan first.
- When calling `SKLLMConfig.set_openai_org`, you have to provide your organization ID and **NOT** the name. You can find your ID [here](https://platform.openai.com/account/org-settings).

Expand Down Expand Up @@ -156,17 +157,33 @@ yh = clf.predict(X_test)
GPT excels at performing summarization tasks. Therefore, we provide `GPTSummarizer`, which can be used either as a stand-alone estimator or as a preprocessor (in the latter case, it is analogous to a dimensionality reduction preprocessor).

Example:

```python
from skllm.preprocessing import GPTSummarizer
from skllm.datasets import get_summarization_dataset

X = get_summarization_dataset()
s = GPTSummarizer(openai_model = 'gpt-3.5-turbo', max_words = 15)
s = GPTSummarizer(openai_model="gpt-3.5-turbo", max_words=15)
summaries = s.fit_transform(X)
```

Please be aware that the `max_words` hyperparameter sets a soft limit, which is not strictly enforced outside of the prompt. Therefore, in some cases, the actual number of words might be slightly higher.

### Text Translation

GPT models have demonstrated their effectiveness in translation tasks by generating accurate translations across various languages. Thus, we added `GPTTranslator`, which translates arbitrary text into a language of interest.

Example:

```python
from skllm.preprocessing import GPTTranslator
from skllm.datasets import get_translation_dataset

X = get_translation_dataset()
t = GPTTranslator(openai_model="gpt-3.5-turbo", output_language="English")
translated_text = t.fit_transform(X)
```

## Roadmap 🧭

- [x] Zero-Shot Classification with OpenAI GPT 3/4
Expand Down
3 changes: 2 additions & 1 deletion skllm/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from skllm.datasets.multi_class import get_classification_dataset
from skllm.datasets.multi_label import get_multilabel_classification_dataset
from skllm.datasets.summarization import get_summarization_dataset
from skllm.datasets.summarization import get_summarization_dataset
from skllm.datasets.translation import get_translation_dataset
23 changes: 23 additions & 0 deletions skllm/datasets/translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
def get_translation_dataset():
    """Return a small multilingual sample corpus for translation examples.

    Returns
    -------
    list of str
        six short texts, in order: Spanish, French, German, Portuguese,
        Czech and Dutch. A new list is built on every call.
    """
    spanish = r"Me encanta bailar salsa y bachata. Es una forma divertida de expresarme."
    french = r"J'ai passé mes dernières vacances en Grèce. Les plages étaient magnifiques."
    german = (
        r"Ich habe gestern ein tolles Buch gelesen."
        r" Die Geschichte war fesselnd bis zum Ende."
    )
    portuguese = (
        r"Gosto de cozinhar pratos tradicionais italianos."
        r" O espaguete à carbonara é um dos meus favoritos."
    )
    czech = (
        r"Mám v plánu letos v létě vyrazit na výlet do Itálie."
        r" Doufám, že navštívím Řím a Benátky."
    )
    dutch = (
        r"Mijn favoriete hobby is fotograferen."
        r" Ik hou ervan om mooie momenten vast te leggen."
    )

    return [spanish, french, german, portuguese, czech, dutch]
3 changes: 2 additions & 1 deletion skllm/preprocessing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from skllm.preprocessing.gpt_summarizer import GPTSummarizer
from skllm.preprocessing.gpt_translator import GPTTranslator
from skllm.preprocessing.gpt_vectorizer import GPTVectorizer
from skllm.preprocessing.gpt_summarizer import GPTSummarizer
37 changes: 37 additions & 0 deletions skllm/preprocessing/gpt_translator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from typing import Optional

from skllm.openai.base_gpt import BaseZeroShotGPTTransformer as _BaseGPT
from skllm.prompts.builders import build_translation_prompt


class GPTTranslator(_BaseGPT):
    """Transformer that translates text samples into a target language.

    Parameters
    ----------
    openai_key : Optional[str]
        OpenAI API key, passed to ``_set_keys``, by default None
    openai_org : Optional[str]
        OpenAI organization ID, passed to ``_set_keys``, by default None
    openai_model : str
        name of the OpenAI model to use, by default "gpt-3.5-turbo"
    output_language : str
        language to translate the samples into, by default "English"
    """

    # NOTE(review): both attributes are presumably read by the base class
    # (system message for the chat request, fallback value when no completion
    # is obtained) -- confirm against BaseZeroShotGPTTransformer.
    system_msg = "You are a text translator."
    default_output = "Translation is unavailable."

    def __init__(
        self,
        openai_key: Optional[str] = None,
        openai_org: Optional[str] = None,
        openai_model: str = "gpt-3.5-turbo",
        output_language: str = "English",
    ):
        self._set_keys(openai_key, openai_org)
        self.openai_model = openai_model
        self.output_language = output_language

    def _get_prompt(self, X: str) -> str:
        """Build the translation prompt for a single sample.

        Parameters
        ----------
        X : str
            sample to translate

        Returns
        -------
        str
            prompt requesting a translation of ``X`` into ``self.output_language``
        """
        target_language = self.output_language
        return build_translation_prompt(X, target_language)
23 changes: 23 additions & 0 deletions skllm/prompts/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from skllm.prompts.templates import (
SUMMARY_PROMPT_TEMPLATE,
TRANSLATION_PROMPT_TEMPLATE,
ZERO_SHOT_CLF_PROMPT_TEMPLATE,
ZERO_SHOT_MLCLF_PROMPT_TEMPLATE,
)
Expand Down Expand Up @@ -78,3 +79,25 @@ def build_summary_prompt(
prepared prompt
"""
return template.format(x=x, max_words=max_words)


def build_translation_prompt(
    x: str, output_language: str, template: str = TRANSLATION_PROMPT_TEMPLATE
) -> str:
    """Fill a translation prompt template with a sample and a target language.

    Parameters
    ----------
    x : str
        sample to translate
    output_language : str
        language to translate to
    template : str
        prompt template to fill; it is formatted with the keyword fields
        ``x`` and ``output_language``, by default TRANSLATION_PROMPT_TEMPLATE

    Returns
    -------
    str
        prepared prompt
    """
    placeholders = {"x": x, "output_language": output_language}
    return template.format(**placeholders)
15 changes: 15 additions & 0 deletions skllm/prompts/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,18 @@
Text sample: ```{x}```
Summarized text:
"""

# Prompt template for text translation. It contains two named placeholders:
# {x} for the text sample (wrapped in triple backticks) and {output_language}
# for the target language, which appears three times in the instructions.
TRANSLATION_PROMPT_TEMPLATE = """
You will be provided with an arbitrary text sample, delimited by triple backticks.
Your task is to translate this text to {output_language} language and output the translated text.

Perform the following actions:
1. Determine the language of the text sample.
2. If the language is not {output_language}, translate the text sample to {output_language} language.
3. Output the translated text.
If the text sample provided is not in a recognizable language, output "No translation available".
Do not output any additional information except the translated text.

Text sample: ```{x}```
Translated text:
"""