diff --git a/.github/assets/badges/.gitkeep b/.github/assets/badges/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..b8b8480 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,10 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "monthly" + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" diff --git a/.github/workflows/pre_commit_auto_update.yml b/.github/workflows/pre_commit_auto_update.yml new file mode 100644 index 0000000..3d5f436 --- /dev/null +++ b/.github/workflows/pre_commit_auto_update.yml @@ -0,0 +1,37 @@ +# Run a pre-commit autoupdate every month and open a pull request if needed +name: Pre-commit auto-update + +on: + # At 00:00 on the 1st of every month. + schedule: + - cron: "0 0 1 * *" + workflow_dispatch: + +permissions: + contents: write + pull-requests: write + +jobs: + pre-commit-auto-update: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + - name: Install pre-commit + run: pip install pre-commit + - name: Run pre-commit + run: pre-commit autoupdate + - name: Set git config + run: | + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action" + - uses: peter-evans/create-pull-request@v5 + with: + token: ${{ github.token }} + branch: update/pre-commit-hooks + title: Update pre-commit hooks + commit-message: "Update pre-commit hooks" + body: Update versions of pre-commit hooks to latest version. 
+ labels: "dependencies,github_actions" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..28dc6e7 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,74 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-docstring-first + - id: check-xml + - id: check-json + - id: check-yaml + - id: check-toml + - id: debug-statements + - id: check-executables-have-shebangs + - id: check-case-conflict + - id: check-added-large-files + - id: detect-aws-credentials + - id: detect-private-key + # Formatter for Json and Yaml files + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.0-alpha.9-for-vscode + hooks: + - id: prettier + types: [json, yaml, toml] + # Formatter for markdown files + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.16 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-gfm + - mdformat-tables + - mdformat-frontmatter + - mdformat-black + - mdformat-shfmt + # An extremely fast Python linter, written in Rust + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: "v0.0.263" + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + # Python code formatter + - repo: https://github.com/psf/black + rev: 23.3.0 + hooks: + - id: black + args: ["--config", "pyproject.toml"] + # Python's import formatter + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + # Formats docstrings to follow PEP 257 + - repo: https://github.com/PyCQA/docformatter + rev: v1.6.4 + hooks: + - id: docformatter + additional_dependencies: [tomli] + args: ["--in-place", "--config", "pyproject.toml"] + # Python tool for docstring coverage + - repo: https://github.com/econchick/interrogate + rev: 1.5.0 + hooks: + - id: interrogate + args: + [ + "--config", + "pyproject.toml", + "--generate-badge", + ".github/assets/badges", + 
"--badge-style", + "flat", + ] + pass_filenames: false diff --git a/README.md b/README.md index 6df3e92..05da555 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ Seamlessly integrate powerful language models like ChatGPT into scikit-learn for enhanced text analysis tasks. -## Installation 💾 +## Installation 💾 -```bash +```bash pip install scikit-llm ``` @@ -24,10 +24,12 @@ You can support the project in the following ways: ## Documentation 📚 ### Configuring OpenAI API Key + At the moment Scikit-LLM is only compatible with some of the OpenAI models. Hence, a user-provided OpenAI API key is required. ```python from skllm.config import SKLLMConfig + SKLLMConfig.set_openai_key("") SKLLMConfig.set_openai_org("") ``` @@ -39,18 +41,20 @@ One of the powerful ChatGPT features is the ability to perform text classificati We provide a class `ZeroShotGPTClassifier` that allows to create such a model as a regular scikit-learn classifier. Example 1: Training as a regular classifier + ```python from skllm import ZeroShotGPTClassifier from skllm.datasets import get_classification_dataset # demo sentiment analysis dataset # labels: positive, negative, neutral -X, y = get_classification_dataset() +X, y = get_classification_dataset() -clf = ZeroShotGPTClassifier(openai_model = "gpt-3.5-turbo") +clf = ZeroShotGPTClassifier(openai_model="gpt-3.5-turbo") clf.fit(X, y) labels = clf.predict(X) ``` + Scikit-LLM will automatically query the OpenAI API and transform the response into a regular list of labels. Additionally, Scikit-LLM will ensure that the obtained response contains a valid label. If this is not the case, a label will be selected randomly (label probabilities are proportional to label occurrences in the training set). 
@@ -66,19 +70,17 @@ from skllm.datasets import get_classification_dataset X, _ = get_classification_dataset() clf = ZeroShotGPTClassifier() -clf.fit(None, ['positive', 'negative', 'neutral']) +clf.fit(None, ["positive", "negative", "neutral"]) labels = clf.predict(X) - ``` -**Note:** unlike in a typical supervised setting, the performance of a zero-shot classifier greatly depends on how the label itself is structured. It has to be expressed in natural language, be descriptive and self-explanatory. For example, in the previous semantic classification task, it could be beneficial to transform a label from `""` to `"the semantics of the provided text is "`. - +**Note:** unlike in a typical supervised setting, the performance of a zero-shot classifier greatly depends on how the label itself is structured. It has to be expressed in natural language, be descriptive and self-explanatory. For example, in the previous semantic classification task, it could be beneficial to transform a label from `""` to `"the semantics of the provided text is "`. ### Multi-Label Zero-Shot Text Classification With a class `MultiLabelZeroShotGPTClassifier` it is possible to perform the classification in multi-label setting, which means that each sample might be assigned to one or several distinct classes. 
-Example: +Example: ```python from skllm import MultiLabelZeroShotGPTClassifier @@ -99,16 +101,16 @@ from skllm.datasets import get_multilabel_classification_dataset X, _ = get_multilabel_classification_dataset() candidate_labels = [ - "Quality", - "Price", - "Delivery", - "Service", - "Product Variety", - "Customer Support", - "Packaging", - "User Experience", - "Return Policy", - "Product Information" + "Quality", + "Price", + "Delivery", + "Service", + "Product Variety", + "Customer Support", + "Packaging", + "User Experience", + "Return Policy", + "Product Information", ] clf = MultiLabelZeroShotGPTClassifier(max_labels=3) clf.fit(None, [candidate_labels]) @@ -120,6 +122,7 @@ labels = clf.predict(X) As an alternative to using GPT as a classifier, it can be used solely for data preprocessing. `GPTVectorizer` allows to embed a chunk of text of arbitrary length to a fixed-dimensional vector, that can be used with virtually any classification or regression model. Example 1: Embedding the text + ```python from skllm.preprocessing import GPTVectorizer @@ -128,6 +131,7 @@ vectors = model.fit_transform(X) ``` Example 2: Combining the Vectorizer with the XGBoost Classifier in a Sklearn Pipeline + ```python from sklearn.pipeline import Pipeline from sklearn.preprocessing import LabelEncoder @@ -137,7 +141,7 @@ le = LabelEncoder() y_train_encoded = le.fit_transform(y_train) y_test_encoded = le.transform(y_test) -steps = [('GPT', GPTVectorizer()), ('Clf', XGBClassifier())] +steps = [("GPT", GPTVectorizer()), ("Clf", XGBClassifier())] clf = Pipeline(steps) clf.fit(X_train, y_train_encoded) yh = clf.predict(X_test) @@ -146,11 +150,25 @@ yh = clf.predict(X_test) ## Roadmap 🧭 - [x] Zero-Shot Classification with OpenAI GPT 3/4 - - [x] Multiclass classification - - [x] Multi-label classification - - [x] ChatGPT models - - [ ] InstructGPT models + - [x] Multiclass classification + - [x] Multi-label classification + - [x] ChatGPT models + - [ ] InstructGPT models - [ ] Few shot 
classifier - [x] GPT Vectorizer - [ ] GPT Fine-tuning (optional) -- [ ] Integration of other LLMs \ No newline at end of file +- [ ] Integration of other LLMs + +## Contributing + +In order to install all development dependencies, run the following command: + +```shell +pip install -e ".[dev]" +``` + +To ensure that you follow the development workflow, please setup the pre-commit hooks: + +```shell +pre-commit install +``` diff --git a/pyproject.toml b/pyproject.toml index 3c437a6..b893cfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,14 +1,13 @@ [build-system] - requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] dependencies = [ -"scikit-learn>=1.1.0", -"pandas>=1.5.0", -"openai>=0.27.0", -"tqdm>=4.60.0", + "scikit-learn>=1.1.0", + "pandas>=1.5.0", + "openai>=0.27.0", + "tqdm>=4.60.0", ] name = "scikit-llm" version = "0.1.0b2" @@ -25,7 +24,71 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] +dynamic = ["optional-dependencies"] + +[tool.setuptools.dynamic.optional-dependencies] +dev = { file = ["requirements-dev.txt"] } + +[tool.ruff] +select = [ + # pycodestyle + "E", + # pyflakes + "F", + # pydocstyle + "D", + # flake8-bandit + "S", + # pyupgrade + "UP", + # pep8-naming + "N", +] +# Error E501 (Line too long) is ignored because of docstrings. 
+ignore = [ + "S101", + "S301", + "S311", + "D100", + "D200", + "D203", + "D205", + "D401", + "E501", +] +extend-exclude = ["tests/*.py", "setup.py"] +target-version = "py38" +force-exclude = true + +[tool.ruff.per-file-ignores] +"__init__.py" = ["E402", "F401", "F403", "F811"] + +[tool.ruff.pydocstyle] +convention = "numpy" + +[tool.mypy] +ignore_missing_imports = true + +[tool.black] +preview = true +target-version = ['py38', 'py39', 'py310', 'py311'] + +[tool.isort] +profile = "black" +filter_files = true +known_first_party = ["skllm", "skllm.*"] + +[tool.docformatter] +close-quotes-on-newline = true # D209 + +[tool.interrogate] +fail-under = 80 +ignore-module = true +ignore-nested-functions = true +ignore-private = true +exclude = ["tests", "setup.py"] + [tool.pytest.ini_options] pythonpath = [ "." -] \ No newline at end of file +] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..3164f42 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,6 @@ +pre-commit +black +isort +ruff +docformatter +interrogate