Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #40 from Oxen-AI/feat/diff
Feat/diff
- Loading branch information
Showing
27 changed files
with
772 additions
and
160 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from .oxen import df | ||
|
||
import os | ||
from polars import DataFrame | ||
|
||
|
||
def save(
    data_frame: DataFrame,
    path: os.PathLike,
):
    """
    Writes a polars data frame to disk.

    The output file format is inferred from the extension of `path`.

    Args:
        data_frame: `DataFrame`
            The polars data frame to write.
        path: `os.PathLike`
            The destination file path.

    Returns:
        Whatever the native `df.save` binding returns.
    """
    outcome = df.save(data_frame, path)
    return outcome
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
from enum import Enum | ||
|
||
|
||
class ChangeType(Enum):
    """
    Enumerates the kinds of change a diff entry can carry.

    Each member's value is the capitalized, human-readable name of the change.
    """

    ADDED = "Added"
    REMOVED = "Removed"
    MODIFIED = "Modified"
    UNCHANGED = "Unchanged"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
""" | ||
Oxen can be used to compare data frames and return a tabular diff. | ||
For example, comparing two data frames will give you an output data frame, | ||
where the `.oxen.diff.status` column shows if the row was `added`, `removed`, | ||
or `modified`. | ||
``` | ||
shape: (6, 7) | ||
+-------------+-----+-----+-------+--------+-------------+-------------------+ | ||
| file | x | y | width | height | label.right | .oxen.diff.status | | ||
| --- | --- | --- | --- | --- | --- | --- | | ||
| str | i64 | i64 | i64 | i64 | str | str | | ||
+-------------+-----+-----+-------+--------+-------------+-------------------+ | ||
| image_0.jpg | 0 | 0 | 10 | 10 | cat | modified | | ||
| image_1.jpg | 1 | 2 | 10 | 20 | null | removed | | ||
| image_1.jpg | 200 | 100 | 10 | 20 | dog | added | | ||
| image_2.jpg | 4 | 10 | 20 | 20 | null | removed | | ||
| image_3.jpg | 4 | 10 | 20 | 20 | dog | added | | ||
| image_4.jpg | 10 | 10 | 10 | 10 | dog | added | | ||
+-------------+-----+-----+-------+--------+-------------+-------------------+ | ||
``` | ||
## Usage | ||
```python | ||
import os | ||
import oxen | ||
result = oxen.diff("dataset_1.csv", "dataset_2.csv") | ||
print(result.get()) | ||
``` | ||
""" | ||
|
||
from ..oxen import PyDiff | ||
from ..oxen import diff as py_diff | ||
|
||
from oxen import df | ||
from oxen.diff.tabular_diff import TabularDiff | ||
from oxen.diff.text_diff import TextDiff | ||
|
||
import os | ||
from typing import Optional | ||
|
||
def diff(
    path: os.PathLike,
    to: Optional[os.PathLike] = None,
    repo_dir: Optional[os.PathLike] = None,
    revision_left: Optional[str] = None,
    revision_right: Optional[str] = None,
    output: Optional[os.PathLike] = None,
    keys: Optional[list[str]] = None,
    targets: Optional[list[str]] = None,
):
    """
    Compares data from two paths and returns a diff respecting the type of data.

    Args:
        path: `os.PathLike`
            The path to diff. If `to` is not provided,
            this will compare the data frame to the previous commit.
        to: `os.PathLike`
            An optional second path to compare to.
            If provided this will be the right side of the diff.
        repo_dir: `os.PathLike`
            The path to the oxen repository. Relevant when `to` is not
            provided, or when `revision_left` or `revision_right` is provided.
            If not provided, the repository will be searched for in the
            current working directory.
        revision_left: `str`
            The left revision to compare. Can be a commit hash or branch name.
        revision_right: `str`
            The right revision to compare. Can be a commit hash or branch name.
        output: `os.PathLike`
            The path to save the diff to. If not provided, the diff will not be saved.
            Only tabular diffs can be saved.
        keys: `list[str]`
            Only for tabular diffs. The keys to compare on.
            This is used to join the two data frames.
            Keys will be combined and hashed to create an identifier for each row.
            Defaults to no keys.
        targets: `list[str]`
            Only for tabular diffs. The targets to compare on.
            This is used to compare the values of the two data frames.
            Defaults to no targets.

    Returns:
        A `Diff` wrapping the native diff result.

    Raises:
        ValueError: If `output` is given but the resulting diff is not tabular.
    """
    # Build fresh lists per call instead of sharing mutable default arguments.
    key_columns = list(keys) if keys is not None else []
    target_columns = list(targets) if targets is not None else []

    result = py_diff.diff_paths(
        path, key_columns, target_columns, to, repo_dir, revision_left, revision_right
    )
    wrapped = Diff(result)

    if output:
        # `df.save` expects a polars data frame, not the raw diff object, so
        # hand it the tabular payload of the diff.
        tabular = wrapped.tabular
        if tabular is None:
            raise ValueError(
                f"Cannot save a diff with format '{wrapped.format}' to '{output}'; "
                "only tabular diffs can be saved."
            )
        df.save(tabular.data, output)

    return wrapped
|
||
class Diff: | ||
""" | ||
Diff class wraps many types of diffs and provides a consistent interface. | ||
For example the diff can be tabular or text. Eventually we will extend this | ||
to support other types of diffs such as images, audio, etc. | ||
""" | ||
|
||
def __init__(self, py_diff: PyDiff): | ||
self._py_diff = py_diff | ||
|
||
def __repr__(self) -> str: | ||
return f"Diff(format={self.format})" | ||
|
||
@property | ||
def format(self) -> str: | ||
""" | ||
Returns the format of the diff. Ie. tabular, text, etc. | ||
""" | ||
return self._py_diff.format | ||
|
||
@property | ||
def tabular(self) -> Optional[TabularDiff]: | ||
""" | ||
Returns the tabular diff if the diff is tabular. | ||
""" | ||
if self.format == "tabular": | ||
return TabularDiff(self._py_diff.tabular) | ||
return None | ||
|
||
@property | ||
def text(self) -> Optional[TextDiff]: | ||
""" | ||
Returns the text diff if the diff is text. | ||
""" | ||
if self.format == "text": | ||
return TextDiff(self._py_diff.text) | ||
return None | ||
|
||
def get(self): | ||
""" | ||
Resolves the diff type and returns the appropriate diff object. | ||
""" | ||
match self._py_diff.format: | ||
case "tabular": | ||
return TabularDiff(self._py_diff.tabular) | ||
case "text": | ||
return TextDiff(self._py_diff.text) | ||
case "unknown": | ||
raise ValueError("The diff type is unknown.") | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from ..oxen import PyTabularDiff | ||
|
||
from polars import DataFrame | ||
|
||
|
||
class TabularDiff:
    """
    Wraps a native tabular diff and exposes its contents as a polars data frame.
    """

    def __init__(self, diff: PyTabularDiff):
        self._diff = diff

    def __repr__(self) -> str:
        frame = self._diff.data
        return f"TabularDiff(shape={frame.shape})\n\n{frame}"

    @property
    def data(self) -> DataFrame:
        """
        The polars data frame holding the diff.
        """
        frame = self._diff.data
        return frame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
from ..oxen import PyTextDiff, PyLineDiff, PyChangeType | ||
|
||
from oxen.diff.change_type import ChangeType | ||
|
||
|
||
class LineDiff:
    """
    Describes how a single line of text changed.
    + Added
    - Removed
    """

    def __init__(self, diff: PyLineDiff):
        self._diff = diff

    def __repr__(self) -> str:
        inner = self._diff
        return f"LineDiff(modification={inner.modification}, text={inner.text})"

    @property
    def modification(self) -> ChangeType:
        """
        The kind of change for this line, translated to the Python `ChangeType`.

        Raises:
            ValueError: If the native value matches none of the known change types.
        """
        kind = self._diff.modification
        if kind == PyChangeType.Added:
            return ChangeType.ADDED
        if kind == PyChangeType.Removed:
            return ChangeType.REMOVED
        if kind == PyChangeType.Modified:
            return ChangeType.MODIFIED
        if kind == PyChangeType.Unchanged:
            return ChangeType.UNCHANGED
        raise ValueError(f"Invalid modification: {self._diff.modification}")

    @property
    def text(self) -> str:
        """
        The text content of this line.
        """
        content = self._diff.text
        return content
|
||
|
||
class TextDiff:
    """
    A class representing a text diff.

    Wraps a native text diff and exposes its lines and added/removed counts.
    """

    def __init__(self, diff: PyTextDiff):
        self._diff = diff

    def __repr__(self) -> str:
        return f"TextDiff(num_added={self.num_added}, num_removed={self.num_removed})"

    def __str__(self) -> str:
        # Join the string form of every line with newlines.
        # NOTE(review): this reads `line.value`, while the rest of this module
        # reads `.text` and `.modification` from the native line objects.
        # Confirm the native line type exposes `value`.
        return "\n".join(f"{line.value}" for line in self._diff.lines)

    @property
    def num_added(self) -> int:
        """
        Returns the number of added lines in the diff.
        """
        return self._count_lines(PyChangeType.Added)

    @property
    def num_removed(self) -> int:
        """
        Returns the number of removed lines in the diff.
        """
        return self._count_lines(PyChangeType.Removed)

    @property
    def lines(self) -> list[LineDiff]:
        """
        Returns the lines of the diff as a list of `LineDiff` objects.
        """
        # Wrap each native line in its Python counterpart.
        return [LineDiff(line) for line in self._diff.lines]

    def _count_lines(self, modification: PyChangeType) -> int:
        """
        Counts the lines whose modification equals `modification`.
        """
        # Count lazily instead of materializing a list just to measure it.
        return sum(1 for line in self._diff.lines if line.modification == modification)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
|
||
|
||
|
||
use pyo3::prelude::*; | ||
use std::path::PathBuf; | ||
|
||
use liboxen::core::df::tabular; | ||
use crate::error::PyOxenError; | ||
|
||
use pyo3_polars::PyDataFrame; | ||
|
||
#[pyfunction] | ||
pub fn save( | ||
df: PyDataFrame, | ||
path: PathBuf | ||
) -> Result<(), PyOxenError> { | ||
let mut df = df.as_ref().clone(); | ||
tabular::write_df(&mut df, path)?; | ||
Ok(()) | ||
} |
Oops, something went wrong.