Merged
Empty file added .github/assets/badges/.gitkeep
10 changes: 10 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,10 @@
version: 2
updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "monthly"
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "monthly"
37 changes: 37 additions & 0 deletions .github/workflows/pre_commit_auto_update.yml
@@ -0,0 +1,37 @@
# Run a pre-commit autoupdate every month and open a pull request if needed
name: Pre-commit auto-update

on:
  # At 00:00 on the 1st of every month.
  schedule:
    - cron: "0 0 1 * *"
  workflow_dispatch:

permissions:
  contents: write
  pull-requests: write

jobs:
  pre-commit-auto-update:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
      - name: Install pre-commit
        run: pip install pre-commit
      - name: Run pre-commit
        run: pre-commit autoupdate
      - name: Set git config
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
      - uses: peter-evans/create-pull-request@v5
        with:
          token: ${{ github.token }}
          branch: update/pre-commit-hooks
          title: Update pre-commit hooks
          commit-message: "Update pre-commit hooks"
          body: Update pre-commit hooks to their latest versions.
          labels: "dependencies,github_actions"
74 changes: 74 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,74 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-docstring-first
      - id: check-xml
      - id: check-json
      - id: check-yaml
      - id: check-toml
      - id: debug-statements
      - id: check-executables-have-shebangs
      - id: check-case-conflict
      - id: check-added-large-files
      - id: detect-aws-credentials
      - id: detect-private-key
  # Formatter for JSON and YAML files
  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: v3.0.0-alpha.9-for-vscode
    hooks:
      - id: prettier
        types: [json, yaml, toml]
  # Formatter for Markdown files
  - repo: https://github.com/executablebooks/mdformat
    rev: 0.7.16
    hooks:
      - id: mdformat
        args: ["--number"]
        additional_dependencies:
          - mdformat-gfm
          - mdformat-tables
          - mdformat-frontmatter
          - mdformat-black
          - mdformat-shfmt
  # An extremely fast Python linter, written in Rust
  - repo: https://github.com/charliermarsh/ruff-pre-commit
    rev: "v0.0.263"
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
  # Python code formatter
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
        args: ["--config", "pyproject.toml"]
  # Python import formatter
  - repo: https://github.com/PyCQA/isort
    rev: 5.12.0
    hooks:
      - id: isort
  # Formats docstrings to follow PEP 257
  - repo: https://github.com/PyCQA/docformatter
    rev: v1.6.4
    hooks:
      - id: docformatter
        additional_dependencies: [tomli]
        args: ["--in-place", "--config", "pyproject.toml"]
  # Python tool for docstring coverage
  - repo: https://github.com/econchick/interrogate
    rev: 1.5.0
    hooks:
      - id: interrogate
        args:
          [
            "--config",
            "pyproject.toml",
            "--generate-badge",
            ".github/assets/badges",
            "--badge-style",
            "flat",
          ]
        pass_filenames: false
68 changes: 43 additions & 25 deletions README.md
@@ -6,9 +6,9 @@

Seamlessly integrate powerful language models like ChatGPT into scikit-learn for enhanced text analysis tasks.

## Installation 💾

```bash
pip install scikit-llm
```

@@ -24,10 +24,12 @@ You can support the project in the following ways:
## Documentation 📚

### Configuring OpenAI API Key

At the moment, Scikit-LLM is only compatible with some of the OpenAI models. Hence, a user-provided OpenAI API key is required.

```python
from skllm.config import SKLLMConfig

SKLLMConfig.set_openai_key("<YOUR_KEY>")
SKLLMConfig.set_openai_org("<YOUR_ORGANISATION>")
```
@@ -39,18 +41,20 @@ One of the powerful ChatGPT features is the ability to perform text classificati
We provide a `ZeroShotGPTClassifier` class that makes it possible to create such a model as a regular scikit-learn classifier.

Example 1: Training as a regular classifier

```python
from skllm import ZeroShotGPTClassifier
from skllm.datasets import get_classification_dataset

# demo sentiment analysis dataset
# labels: positive, negative, neutral
X, y = get_classification_dataset()

clf = ZeroShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(X, y)
labels = clf.predict(X)
```

Scikit-LLM will automatically query the OpenAI API and transform the response into a regular list of labels.

Additionally, Scikit-LLM will ensure that the obtained response contains a valid label. If this is not the case, a label will be selected randomly (label probabilities are proportional to label occurrences in the training set).
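The fallback described above can be sketched as follows. This is a hypothetical illustration, not Scikit-LLM's actual implementation: the `fallback_label` helper and the exact weighting scheme are assumptions made for the example.

```python
import random


def fallback_label(response: str, training_labels: list[str]) -> str:
    """Return the response if it is a valid label; otherwise fall back to a
    random label weighted by its frequency in the training data."""
    if response in training_labels:
        return response
    # Weight each candidate by how often it occurred in the training set.
    candidates = sorted(set(training_labels))
    weights = [training_labels.count(label) for label in candidates]
    return random.choices(candidates, weights=weights, k=1)[0]
```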
@@ -66,19 +70,17 @@ from skllm.datasets import get_classification_dataset
X, _ = get_classification_dataset()

clf = ZeroShotGPTClassifier()
clf.fit(None, ["positive", "negative", "neutral"])
labels = clf.predict(X)

```

**Note:** Unlike in a typical supervised setting, the performance of a zero-shot classifier greatly depends on how the label itself is structured. It has to be expressed in natural language, be descriptive and self-explanatory. For example, in the previous semantic classification task, it could be beneficial to transform a label from `"<semantics>"` to `"the semantics of the provided text is <semantics>"`.
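Such a label transformation might be sketched as follows (the template wording is purely illustrative, not something the library requires):

```python
# Wrap terse labels in a descriptive natural-language template before fitting.
raw_labels = ["positive", "negative", "neutral"]
descriptive_labels = [
    f"the semantics of the provided text is {label}" for label in raw_labels
]
# A classifier would then be fitted on the descriptive form, e.g.
# clf.fit(None, descriptive_labels)
```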

### Multi-Label Zero-Shot Text Classification

With the `MultiLabelZeroShotGPTClassifier` class it is possible to perform classification in a multi-label setting, which means that each sample might be assigned to one or several distinct classes.

Example:

```python
from skllm import MultiLabelZeroShotGPTClassifier
@@ -99,16 +101,16 @@ from skllm.datasets import get_multilabel_classification_dataset

X, _ = get_multilabel_classification_dataset()
candidate_labels = [
    "Quality",
    "Price",
    "Delivery",
    "Service",
    "Product Variety",
    "Customer Support",
    "Packaging",
    "User Experience",
    "Return Policy",
    "Product Information",
]
clf = MultiLabelZeroShotGPTClassifier(max_labels=3)
clf.fit(None, [candidate_labels])
@@ -120,6 +122,7 @@ labels = clf.predict(X)
As an alternative to using GPT as a classifier, it can be used solely for data preprocessing. `GPTVectorizer` embeds a chunk of text of arbitrary length into a fixed-dimensional vector that can be used with virtually any classification or regression model.

Example 1: Embedding the text

```python
from skllm.preprocessing import GPTVectorizer

@@ -128,6 +131,7 @@ vectors = model.fit_transform(X)
```

Example 2: Combining the Vectorizer with the XGBoost Classifier in a Sklearn Pipeline

```python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
@@ -137,7 +141,7 @@ le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

steps = [("GPT", GPTVectorizer()), ("Clf", XGBClassifier())]
clf = Pipeline(steps)
clf.fit(X_train, y_train_encoded)
yh = clf.predict(X_test)
@@ -146,11 +150,25 @@ yh = clf.predict(X_test)
## Roadmap 🧭

- [x] Zero-Shot Classification with OpenAI GPT 3/4
  - [x] Multiclass classification
  - [x] Multi-label classification
  - [x] ChatGPT models
  - [ ] InstructGPT models
- [ ] Few-shot classifier
- [x] GPT Vectorizer
- [ ] GPT Fine-tuning (optional)
- [ ] Integration of other LLMs

## Contributing

In order to install all development dependencies, run the following command:

```shell
pip install -e ".[dev]"
```

To ensure that you follow the development workflow, please set up the pre-commit hooks:

```shell
pre-commit install
```
75 changes: 69 additions & 6 deletions pyproject.toml
@@ -1,14 +1,13 @@
[build-system]

requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
dependencies = [
    "scikit-learn>=1.1.0",
    "pandas>=1.5.0",
    "openai>=0.27.0",
    "tqdm>=4.60.0",
]
name = "scikit-llm"
version = "0.1.0b2"
@@ -25,7 +24,71 @@ classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dynamic = ["optional-dependencies"]

[tool.setuptools.dynamic.optional-dependencies]
dev = { file = ["requirements-dev.txt"] }

[tool.ruff]
select = [
# pycodestyle
"E",
# pyflakes
"F",
# pydocstyle
"D",
# flake8-bandit
"S",
# pyupgrade
"UP",
# pep8-naming
"N",
]
# Error E501 (Line too long) is ignored because of docstrings.
ignore = [
"S101",
"S301",
"S311",
"D100",
"D200",
"D203",
"D205",
"D401",
"E501",
]
extend-exclude = ["tests/*.py", "setup.py"]
target-version = "py38"
force-exclude = true

[tool.ruff.per-file-ignores]
"__init__.py" = ["E402", "F401", "F403", "F811"]

[tool.ruff.pydocstyle]
convention = "numpy"

[tool.mypy]
ignore_missing_imports = true

[tool.black]
preview = true
target-version = ['py38', 'py39', 'py310', 'py311']

[tool.isort]
profile = "black"
filter_files = true
known_first_party = ["skllm", "skllm.*"]

[tool.docformatter]
close-quotes-on-newline = true # D209

[tool.interrogate]
fail-under = 80
ignore-module = true
ignore-nested-functions = true
ignore-private = true
exclude = ["tests", "setup.py"]

[tool.pytest.ini_options]
pythonpath = [
  "."
]
6 changes: 6 additions & 0 deletions requirements-dev.txt
@@ -0,0 +1,6 @@
pre-commit
black
isort
ruff
docformatter
interrogate