Skip to content

Commit

Permalink
Merge pull request #40 from Oxen-AI/feat/diff
Browse files Browse the repository at this point in the history
Feat/diff
  • Loading branch information
gschoeni committed Mar 1, 2024
2 parents 7fc357f + b2018c8 commit 024616b
Show file tree
Hide file tree
Showing 27 changed files with 772 additions and 160 deletions.
283 changes: 154 additions & 129 deletions oxen/Cargo.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions oxen/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "oxen"
version = "0.10.1"
version = "0.11.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand All @@ -14,9 +14,9 @@ pyo3-asyncio = { version = "0.20.0", features = ["attributes", "tokio-runtime"]
log = "0.4.17"
pyo3-log = "0.9.0"
tokio = { version = "1", features = ["full"] }
pyo3-polars = "0.9.0"
pyo3-polars = "0.10.0"
serde_json = "1.0.106"
liboxen = "0.10.16"
liboxen = "0.11.1"
# liboxen = { path = "../../rust/Oxen/src/lib" }

[build-dependencies]
Expand Down
2 changes: 2 additions & 0 deletions oxen/python/oxen/__init__.py
Expand Up @@ -13,6 +13,7 @@
from oxen import auth
from oxen import loaders
from oxen.clone import clone
from oxen.diff.diff import diff
from oxen.init import init
from oxen.config import is_configured

Expand All @@ -34,4 +35,5 @@
"auth",
"loaders",
"util",
"diff"
]
1 change: 1 addition & 0 deletions oxen/python/oxen/auth.py
Expand Up @@ -4,6 +4,7 @@
import os
import requests


def config_auth(token: str, host: str = "hub.oxen.ai", path: Optional[str] = None):
"""
Configures authentication for a host.
Expand Down
6 changes: 2 additions & 4 deletions oxen/python/oxen/config.py
@@ -1,8 +1,6 @@
from .oxen import auth, util
from oxen.user import config_user
from typing import Optional
from .oxen import util
import os
import requests


def is_configured():
"""
Expand Down
20 changes: 20 additions & 0 deletions oxen/python/oxen/df.py
@@ -0,0 +1,20 @@
from .oxen import df

import os
from polars import DataFrame


def save(data_frame: DataFrame, path: os.PathLike):
    """
    Write a polars data frame to disk.

    The file format is inferred from the extension of `path`.

    Args:
        data_frame: `DataFrame`
            The polars data frame to write out.
        path: `os.PathLike`
            Destination file for the data frame.
    """
    # Delegate to the native `df` module; hand back whatever it returns.
    outcome = df.save(data_frame, path)
    return outcome
Empty file.
12 changes: 12 additions & 0 deletions oxen/python/oxen/diff/change_type.py
@@ -0,0 +1,12 @@
from enum import Enum


class ChangeType(Enum):
    """
    Kinds of change a diff entry can carry.

    Each member's value is the capitalized, human-readable name of the change.
    """

    # Present only on the right side of the diff.
    ADDED = "Added"
    # Present only on the left side of the diff.
    REMOVED = "Removed"
    # Present on both sides, with different contents.
    MODIFIED = "Modified"
    # Present on both sides, with identical contents.
    UNCHANGED = "Unchanged"
140 changes: 140 additions & 0 deletions oxen/python/oxen/diff/diff.py
@@ -0,0 +1,140 @@
"""
Oxen can be used to compare data frames and return a tabular diff.
For example, comparing two data frames gives you an output data frame
where the `.oxen.diff.status` column shows whether each row was `added`,
`removed`, or `modified`.
```
shape: (6, 7)
+-------------+-----+-----+-------+--------+-------------+-------------------+
| file | x | y | width | height | label.right | .oxen.diff.status |
| --- | --- | --- | --- | --- | --- | --- |
| str | i64 | i64 | i64 | i64 | str | str |
+-------------+-----+-----+-------+--------+-------------+-------------------+
| image_0.jpg | 0 | 0 | 10 | 10 | cat | modified |
| image_1.jpg | 1 | 2 | 10 | 20 | null | removed |
| image_1.jpg | 200 | 100 | 10 | 20 | dog | added |
| image_2.jpg | 4 | 10 | 20 | 20 | null | removed |
| image_3.jpg | 4 | 10 | 20 | 20 | dog | added |
| image_4.jpg | 10 | 10 | 10 | 10 | dog | added |
+-------------+-----+-----+-------+--------+-------------+-------------------+
```
## Usage
```python
import os
import oxen
result = oxen.diff("dataset_1.csv", "dataset_2.csv")
print(result.get())
```
"""

from ..oxen import PyDiff
from ..oxen import diff as py_diff

from oxen import df
from oxen.diff.tabular_diff import TabularDiff
from oxen.diff.text_diff import TextDiff

import os
from typing import Optional

def diff(
    path: os.PathLike,
    to: Optional[os.PathLike] = None,
    repo_dir: Optional[os.PathLike] = None,
    revision_left: Optional[str] = None,
    revision_right: Optional[str] = None,
    output: Optional[os.PathLike] = None,
    keys: Optional[list[str]] = None,
    targets: Optional[list[str]] = None,
):
    """
    Compares data from two paths and returns a diff respecting the type of data.

    Args:
        path: `os.PathLike`
            The path to diff. If `to` is not provided,
            this will compare the data frame to the previous commit.
        to: `os.PathLike`
            An optional second path to compare to.
            If provided this will be the right side of the diff.
        repo_dir: `os.PathLike`
            The path to the oxen repository. Used when `to` is not provided,
            or when `revision_left` or `revision_right` is provided.
            If not provided, the repository will be searched for in the
            current working directory.
        revision_left: `str`
            The left revision to compare. Can be a commit hash or branch name.
        revision_right: `str`
            The right revision to compare. Can be a commit hash or branch name.
        output: `os.PathLike`
            The path to save the diff to. If not provided, the diff will not
            be saved. Only tabular diffs can be saved.
        keys: `list[str]`
            Only for tabular diffs. The keys to compare on.
            This is used to join the two data frames.
            Keys will be combined and hashed to create an identifier for each row.
            Defaults to no keys.
        targets: `list[str]`
            Only for tabular diffs. The targets to compare on.
            This is used to compare the values of the two data frames.
            Defaults to no targets.

    Returns:
        A `Diff` wrapping the native diff result.

    Raises:
        ValueError: If `output` is given but the resulting diff is not tabular.
    """
    # Use None sentinels instead of mutable `[]` defaults so that no list
    # object is shared between calls.
    keys = [] if keys is None else keys
    targets = [] if targets is None else targets

    result = py_diff.diff_paths(
        path, keys, targets, to, repo_dir, revision_left, revision_right
    )
    if output:
        # `df.save` writes a polars data frame, so hand it the tabular diff's
        # data frame rather than the raw native diff wrapper.
        if result.format != "tabular":
            raise ValueError(
                f"Only tabular diffs can be saved, got format: {result.format}"
            )
        df.save(result.tabular.data, output)
    return Diff(result)

class Diff:
"""
Diff class wraps many types of diffs and provides a consistent interface.
For example the diff can be tabular or text. Eventually we will extend this
to support other types of diffs such as images, audio, etc.
"""

def __init__(self, py_diff: PyDiff):
self._py_diff = py_diff

def __repr__(self) -> str:
return f"Diff(format={self.format})"

@property
def format(self) -> str:
"""
Returns the format of the diff. Ie. tabular, text, etc.
"""
return self._py_diff.format

@property
def tabular(self) -> Optional[TabularDiff]:
"""
Returns the tabular diff if the diff is tabular.
"""
if self.format == "tabular":
return TabularDiff(self._py_diff.tabular)
return None

@property
def text(self) -> Optional[TextDiff]:
"""
Returns the text diff if the diff is text.
"""
if self.format == "text":
return TextDiff(self._py_diff.text)
return None

def get(self):
"""
Resolves the diff type and returns the appropriate diff object.
"""
match self._py_diff.format:
case "tabular":
return TabularDiff(self._py_diff.tabular)
case "text":
return TextDiff(self._py_diff.text)
case "unknown":
raise ValueError("The diff type is unknown.")

21 changes: 21 additions & 0 deletions oxen/python/oxen/diff/tabular_diff.py
@@ -0,0 +1,21 @@
from ..oxen import PyTabularDiff

from polars import DataFrame


class TabularDiff:
    """
    Wrapper around a native tabular diff that exposes it as a polars data frame.
    """

    def __init__(self, diff: PyTabularDiff):
        self._diff = diff

    def __repr__(self) -> str:
        # Show the frame's shape first, then the frame itself after a blank line.
        frame = self.data
        return f"TabularDiff(shape={frame.shape})\n\n{frame}"

    @property
    def data(self) -> DataFrame:
        """
        The diff contents as a polars data frame.
        """
        return self._diff.data
88 changes: 88 additions & 0 deletions oxen/python/oxen/diff/text_diff.py
@@ -0,0 +1,88 @@
from ..oxen import PyTextDiff, PyLineDiff, PyChangeType

from oxen.diff.change_type import ChangeType


class LineDiff:
    """
    One line of a text diff together with how that line changed.

    + Added
    - Removed
    """

    def __init__(self, diff: PyLineDiff):
        self._diff = diff

    def __repr__(self) -> str:
        raw = self._diff
        return f"LineDiff(modification={raw.modification}, text={raw.text})"

    @property
    def modification(self) -> ChangeType:
        """
        The kind of change for this line, translated to a `ChangeType`.

        Raises a `ValueError` when the native value matches none of the
        known change types.
        """
        kind = self._diff.modification
        # Equality checks against the native enum, in the same order as the
        # native variants are listed.
        if kind == PyChangeType.Added:
            return ChangeType.ADDED
        if kind == PyChangeType.Removed:
            return ChangeType.REMOVED
        if kind == PyChangeType.Modified:
            return ChangeType.MODIFIED
        if kind == PyChangeType.Unchanged:
            return ChangeType.UNCHANGED
        raise ValueError(f"Invalid modification: {self._diff.modification}")

    @property
    def text(self) -> str:
        """
        The text content of this line.
        """
        return self._diff.text


class TextDiff:
    """
    A class representing a text diff.

    Wraps the native text diff and exposes its lines along with counts of
    added and removed lines.
    """

    def __init__(self, diff: PyTextDiff):
        self._diff = diff

    def __repr__(self) -> str:
        return f"TextDiff(num_added={self.num_added}, num_removed={self.num_removed})"

    def __str__(self) -> str:
        """
        Renders the diff one line per row, each with a `+`, `-` or blank prefix.
        """
        return "\n".join(self._format_line(line) for line in self._diff.lines)

    @property
    def num_added(self) -> int:
        """
        Returns the number of added lines in the diff.
        """
        return self._count_lines(PyChangeType.Added)

    @property
    def num_removed(self) -> int:
        """
        Returns the number of removed lines in the diff.
        """
        return self._count_lines(PyChangeType.Removed)

    @property
    def lines(self) -> list[LineDiff]:
        """
        Returns the lines of the diff as a list of `LineDiff` objects.
        """
        # Wrap each native line in the Python-facing LineDiff.
        return [LineDiff(line) for line in self._diff.lines]

    @staticmethod
    def _format_line(line: PyLineDiff) -> str:
        """
        Formats one native line as `<prefix> <text>`.

        The prefix is `+` for added lines, `-` for removed lines and a space
        for every other change type.
        """
        # Native lines expose `modification` and `text`; compare with `==`
        # because the native enum is only known to support equality here.
        if line.modification == PyChangeType.Added:
            prefix = "+"
        elif line.modification == PyChangeType.Removed:
            prefix = "-"
        else:
            prefix = " "
        return f"{prefix} {line.text}"

    def _count_lines(self, modification: PyChangeType) -> int:
        """
        Counts the lines whose modification equals `modification`.
        """
        # Count lazily instead of materializing a list just to measure it.
        return sum(1 for line in self._diff.lines if line.modification == modification)
7 changes: 4 additions & 3 deletions oxen/python/oxen/local_repo.py
Expand Up @@ -81,8 +81,7 @@ def branches(self):
return self._repo.list_branches()

def branch(self, name: str, delete=False):
"""
"""
""" """
return self._repo.branch(name, delete)

def checkout(self, revision: str, create=False):
Expand Down Expand Up @@ -169,7 +168,9 @@ def set_remote(self, name: str, url: str):
def create_remote(self, name: str):
self._repo.create_remote(name)

def push(self, remote_name: str = "origin", branch: str = "main", delete: bool = False):
def push(
self, remote_name: str = "origin", branch: str = "main", delete: bool = False
):
"""
Push data to a remote repo from a local repo.
Expand Down
2 changes: 1 addition & 1 deletion oxen/python/oxen/remote_repo.py
Expand Up @@ -193,7 +193,7 @@ def download(
if directory and not os.path.exists(directory):
os.makedirs(directory, exist_ok=True)

if revision == None:
if revision is None:
self._repo.download(src, dst, self.revision)
else:
self._repo.download(src, dst, revision)
Expand Down
20 changes: 20 additions & 0 deletions oxen/src/df.rs
@@ -0,0 +1,20 @@



use pyo3::prelude::*;
use std::path::PathBuf;

use liboxen::core::df::tabular;
use crate::error::PyOxenError;

use pyo3_polars::PyDataFrame;

/// Writes the given data frame to `path` using `tabular::write_df`.
///
/// Any error from the write is propagated as a `PyOxenError`.
#[pyfunction]
pub fn save(df: PyDataFrame, path: PathBuf) -> Result<(), PyOxenError> {
    // `write_df` takes the frame by mutable reference, so operate on an
    // owned copy of the frame borrowed from the Python wrapper.
    let mut frame = df.as_ref().clone();
    tabular::write_df(&mut frame, path)?;
    Ok(())
}

0 comments on commit 024616b

Please sign in to comment.