From 3d008d741dc56d949e51d89e99c602e2497f75ce Mon Sep 17 00:00:00 2001 From: Iryna K Date: Mon, 29 May 2023 22:32:11 +0200 Subject: [PATCH] added GPTTranslator --- README.md | 21 +++++++++++++-- skllm/datasets/__init__.py | 3 ++- skllm/datasets/translation.py | 23 +++++++++++++++++ skllm/preprocessing/__init__.py | 3 ++- skllm/preprocessing/gpt_translator.py | 37 +++++++++++++++++++++++++++ skllm/prompts/builders.py | 23 +++++++++++++++++ skllm/prompts/templates.py | 15 +++++++++++ 7 files changed, 121 insertions(+), 4 deletions(-) create mode 100644 skllm/datasets/translation.py create mode 100644 skllm/preprocessing/gpt_translator.py diff --git a/README.md b/README.md index b72172d..c775e67 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ SKLLMConfig.set_openai_key("") SKLLMConfig.set_openai_org("") ``` -**Important notice:** +**Important notice:** + - If you have a free trial OpenAI account, the [rate limits](https://platform.openai.com/docs/guides/rate-limits/overview) are not sufficient (specifically 3 requests per minute). Please switch to the "pay as you go" plan first. - When calling `SKLLMConfig.set_openai_org`, you have to provide your organization ID and **NOT** the name. You can find your ID [here](https://platform.openai.com/account/org-settings). @@ -156,17 +157,33 @@ yh = clf.predict(X_test) GPT excels at performing summarization tasks. Therefore, we provide `GPTSummarizer` that can be used both as stand-alone estimator, or as a preprocessor (in this case we can make an analogy with a dimensionality reduction preprocessor). 
Example: + ```python from skllm.preprocessing import GPTSummarizer from skllm.datasets import get_summarization_dataset X = get_summarization_dataset() -s = GPTSummarizer(openai_model = 'gpt-3.5-turbo', max_words = 15) +s = GPTSummarizer(openai_model="gpt-3.5-turbo", max_words=15) summaries = s.fit_transform(X) ``` Please be aware that the `max_words` hyperparameter sets a soft limit, which is not strictly enforced outside of the prompt. Therefore, in some cases, the actual number of words might be slightly higher. +### Text Translation + +GPT models have demonstrated their effectiveness in translation tasks by generating accurate translations across various languages. Thus, we added `GPTTranslator` that allows translating an arbitrary text into a language of interest. + +Example: + +```python +from skllm.preprocessing import GPTTranslator +from skllm.datasets import get_translation_dataset + +X = get_translation_dataset() +t = GPTTranslator(openai_model="gpt-3.5-turbo", output_language="English") +translated_text = t.fit_transform(X) +``` + ## Roadmap 🧭 - [x] Zero-Shot Classification with OpenAI GPT 3/4 diff --git a/skllm/datasets/__init__.py b/skllm/datasets/__init__.py index 512949c..f7b6b0f 100644 --- a/skllm/datasets/__init__.py +++ b/skllm/datasets/__init__.py @@ -1,3 +1,4 @@ from skllm.datasets.multi_class import get_classification_dataset from skllm.datasets.multi_label import get_multilabel_classification_dataset -from skllm.datasets.summarization import get_summarization_dataset \ No newline at end of file +from skllm.datasets.summarization import get_summarization_dataset +from skllm.datasets.translation import get_translation_dataset diff --git a/skllm/datasets/translation.py b/skllm/datasets/translation.py new file mode 100644 index 0000000..61ceffc --- /dev/null +++ b/skllm/datasets/translation.py @@ -0,0 +1,23 @@ +def get_translation_dataset(): + X = [ + r"Me encanta bailar salsa y bachata. 
Es una forma divertida de expresarme.", + r"J'ai passé mes dernières vacances en Grèce. Les plages étaient magnifiques.", + ( + r"Ich habe gestern ein tolles Buch gelesen. Die Geschichte war fesselnd bis" + r" zum Ende." + ), + ( + r"Gosto de cozinhar pratos tradicionais italianos. O espaguete à carbonara" + r" é um dos meus favoritos." + ), + ( + r"Mám v plánu letos v létě vyrazit na výlet do Itálie. Doufám, že navštívím" + r" Řím a Benátky." + ), + ( + r"Mijn favoriete hobby is fotograferen. Ik hou ervan om mooie momenten vast" + r" te leggen." + ), + ] + + return X diff --git a/skllm/preprocessing/__init__.py b/skllm/preprocessing/__init__.py index 3a7c3a1..29643e2 100644 --- a/skllm/preprocessing/__init__.py +++ b/skllm/preprocessing/__init__.py @@ -1,2 +1,3 @@ +from skllm.preprocessing.gpt_summarizer import GPTSummarizer +from skllm.preprocessing.gpt_translator import GPTTranslator from skllm.preprocessing.gpt_vectorizer import GPTVectorizer -from skllm.preprocessing.gpt_summarizer import GPTSummarizer \ No newline at end of file diff --git a/skllm/preprocessing/gpt_translator.py b/skllm/preprocessing/gpt_translator.py new file mode 100644 index 0000000..35e4d68 --- /dev/null +++ b/skllm/preprocessing/gpt_translator.py @@ -0,0 +1,37 @@ +from typing import Optional + +from skllm.openai.base_gpt import BaseZeroShotGPTTransformer as _BaseGPT +from skllm.prompts.builders import build_translation_prompt + + +class GPTTranslator(_BaseGPT): + """A text translator.""" + + system_msg = "You are a text translator." + default_output = "Translation is unavailable." + + def __init__( + self, + openai_key: Optional[str] = None, + openai_org: Optional[str] = None, + openai_model: str = "gpt-3.5-turbo", + output_language: str = "English", + ): + self._set_keys(openai_key, openai_org) + self.openai_model = openai_model + self.output_language = output_language + + def _get_prompt(self, X: str) -> str: + """Generates the prompt for the given input. 
+ + Parameters + ---------- + X : str + sample to translate + + Returns + ------- + str + prepared translation prompt + """ + return build_translation_prompt(X, self.output_language) diff --git a/skllm/prompts/builders.py b/skllm/prompts/builders.py index c9d7418..bad634d 100644 --- a/skllm/prompts/builders.py +++ b/skllm/prompts/builders.py @@ -2,6 +2,7 @@ from skllm.prompts.templates import ( SUMMARY_PROMPT_TEMPLATE, + TRANSLATION_PROMPT_TEMPLATE, ZERO_SHOT_CLF_PROMPT_TEMPLATE, ZERO_SHOT_MLCLF_PROMPT_TEMPLATE, ) @@ -78,3 +79,25 @@ def build_summary_prompt( prepared prompt """ return template.format(x=x, max_words=max_words) + + +def build_translation_prompt( + x: str, output_language: str, template: str = TRANSLATION_PROMPT_TEMPLATE +) -> str: + """Builds a prompt for text translation. + + Parameters + ---------- + x : str + sample to translate + output_language : str + language to translate to + template : str + prompt template to use, must contain placeholders for all variables, by default TRANSLATION_PROMPT_TEMPLATE + + Returns + ------- + str + prepared prompt + """ + return template.format(x=x, output_language=output_language) diff --git a/skllm/prompts/templates.py b/skllm/prompts/templates.py index 18785c0..8190531 100644 --- a/skllm/prompts/templates.py +++ b/skllm/prompts/templates.py @@ -39,3 +39,18 @@ Text sample: ```{x}``` Summarized text: """ + +TRANSLATION_PROMPT_TEMPLATE = """ +You will be provided with an arbitrary text sample, delimited by triple backticks. +Your task is to translate this text to {output_language} language and output the translated text. + +Perform the following actions: +1. Determine the language of the text sample. +2. If the language is not {output_language}, translate the text sample to {output_language} language. +3. Output the translated text. +If the text sample provided is not in a recognizable language, output "No translation available". +Do not output any additional information except the translated text. 
+ +Text sample: ```{x}``` +Translated text: +"""