Skip to content

Commit

Permalink
v0.3.17
Browse files Browse the repository at this point in the history
  • Loading branch information
AmenRa committed Sep 27, 2023
1 parent 643b3d7 commit ae2550b
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 17 deletions.
7 changes: 7 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.3.17] - 2023-09-27
### Changed
- All `Run` import methods allow for specifying the `name` of the run.

### Fixed
- Fixed misleading error messages when importing `Qrels` and `Run` from `pandas.DataFrame` with wrong `dtypes`.

## [0.3.16] - 2022-08-03
### Added
- Added support for importing qrels from `parquet` files in `qrels.py`.
Expand Down
11 changes: 9 additions & 2 deletions docs/run.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ run = Run(run_dict, name="bm25")
Parse a run file into `ranx.Run`.
Supported formats are JSON, TREC run, gzipped TREC run, and LZ4.
Correct import behavior is inferred from the file extension: `.json` -> `json`, `.trec` -> `trec`, `.txt` -> `trec`, `.gz` -> `trec`, `.lz4` -> `lz4`.
Use the `kind` argument to override the default behavior.
Use the argument `kind` to override the default behavior.
Use the argument `name` to set the name of the run. Default is `None`.

```python
run = Run.from_file("path/to/run.json") # JSON file
Expand All @@ -40,6 +41,9 @@ run = Run.from_file("path/to/run.custom", kind="json") # Loaded as JSON file
```

## Load from Pandas DataFrames
`ranx` can load `runs` from Pandas DataFrames.
The argument `name` is used to set the name of the run. Default is `None`.

```python
from pandas import DataFrame

Expand All @@ -54,12 +58,14 @@ run = Run.from_df(
q_id_col="q_id",
doc_id_col="doc_id",
score_col="score",
name="my_run",
)
```

## Load from Parquet files
`ranx` can load `runs` from Parquet files, even from remote sources.
You can control the behavior of the underlying `pandas.read_parquet` function by passing additional arguments through the `pd_kwargs` argument (see https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html).
You can control the behavior of the underlying `pandas.read_parquet` function by passing additional arguments through the `pd_kwargs` argument (see https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html).
The argument `name` is used to set the name of the run. Default is `None`.

```python
run = Run.from_parquet(
Expand All @@ -68,6 +74,7 @@ run = Run.from_parquet(
doc_id_col="doc_id",
score_col="score",
pd_kwargs=None,
name="my_run",
)
```

Expand Down
4 changes: 2 additions & 2 deletions ranx/data_structures/qrels.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,10 +293,10 @@ def from_df(
"""
assert (
df[q_id_col].dtype == "O"
), "DataFrame scores column dtype must be `object` (string)"
), "DataFrame Query IDs column dtype must be `object` (string)"
assert (
df[doc_id_col].dtype == "O"
), "DataFrame scores column dtype must be `object` (string)"
), "DataFrame Document IDs column dtype must be `object` (string)"
assert df[score_col].dtype == int, "DataFrame scores column dtype must be `int`"

qrels_dict = (
Expand Down
19 changes: 13 additions & 6 deletions ranx/data_structures/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,12 @@ def save(self, path: str = "run.json", kind: str = None):
f.write("\n")

@staticmethod
def from_dict(d: Dict[str, Dict[str, float]]):
def from_dict(d: Dict[str, Dict[str, float]], name: str = None):
"""Convert a Python dictionary in form of {q_id: {doc_id: score}} to ranx.Run.
Args:
d (Dict[str, Dict[str, float]]): Run as Python dictionary
name (str, optional): Run name. Defaults to None.
Returns:
Run: ranx.Run
Expand All @@ -248,6 +249,7 @@ def from_dict(d: Dict[str, Dict[str, float]]):
run = Run()
run.run = create_and_sort(q_ids, doc_ids, scores)
run.sorted = True
run.name = name

return run

Expand All @@ -258,6 +260,7 @@ def from_file(path: str, kind: str = None, name: str = None):
Args:
path (str): File path.
kind (str, optional): Kind of file to load, must be one of "json", "trec", or "lz4". If None, it is inferred from the file extension. Defaults to None.
name (str, optional): Run name. Defaults to None.
Returns:
Run: ranx.Run
Expand All @@ -279,8 +282,7 @@ def from_file(path: str, kind: str = None, name: str = None):
if name is None:
name = run_name

run = Run.from_dict(run)
run.name = name
run = Run.from_dict(run, name)

return run

Expand All @@ -290,6 +292,7 @@ def from_df(
q_id_col: str = "q_id",
doc_id_col: str = "doc_id",
score_col: str = "score",
name: str = None,
):
"""Convert a Pandas DataFrame to ranx.Run.
Expand All @@ -298,16 +301,17 @@ def from_df(
q_id_col (str, optional): Query IDs column. Defaults to "q_id".
doc_id_col (str, optional): Document IDs column. Defaults to "doc_id".
score_col (str, optional): Relevance scores column. Defaults to "score".
name (str, optional): Run name. Defaults to None.
Returns:
Run: ranx.Run
"""
assert (
df[q_id_col].dtype == "O"
), "DataFrame scores column dtype must be `object` (string)"
), "DataFrame Query IDs column dtype must be `object` (string)"
assert (
df[doc_id_col].dtype == "O"
), "DataFrame scores column dtype must be `object` (string)"
), "DataFrame Document IDs column dtype must be `object` (string)"
assert (
df[score_col].dtype == float
), "DataFrame scores column dtype must be `float`"
Expand All @@ -318,7 +322,7 @@ def from_df(
.to_dict()
)

return Run.from_dict(run_py)
return Run.from_dict(run_py, name)

@staticmethod
def from_parquet(
Expand All @@ -327,6 +331,7 @@ def from_parquet(
doc_id_col: str = "doc_id",
score_col: str = "score",
pd_kwargs: Dict[str, Any] = None,
name: str = None,
):
"""Convert a Parquet file to ranx.Run.
Expand All @@ -336,6 +341,7 @@ def from_parquet(
doc_id_col (str, optional): Document IDs column. Defaults to "doc_id".
score_col (str, optional): Relevance scores column. Defaults to "score".
pd_kwargs (Dict[str, Any], optional): Additional arguments to pass to `pandas.read_parquet` (see https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html). Defaults to None.
name (str, optional): Run name. Defaults to None.
Returns:
Run: ranx.Run
Expand All @@ -347,6 +353,7 @@ def from_parquet(
q_id_col=q_id_col,
doc_id_col=doc_id_col,
score_col=score_col,
name=name,
)

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="ranx",
version="0.3.16",
version="0.3.17",
author="Elias Bassani",
author_email="elias.bssn@gmail.com",
description="ranx: A Blazing-Fast Python Library for Ranking Evaluation, Comparison, and Fusion",
Expand Down
18 changes: 12 additions & 6 deletions tests/unit/ranx/data_structures/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,9 @@ def test_to_dataframe():

def test_save_load_json(run):
Run(run).save("tests/unit/ranx/test_data/run.json")
run = Run.from_file("tests/unit/ranx/test_data/run.json")
run = Run.from_file("tests/unit/ranx/test_data/run.json", name="test_run")

assert run.name == "test_run"
assert len(run.run) == 2
assert len(run.run["q1"]) == 3
assert len(run.run["q2"]) == 2
Expand All @@ -217,8 +218,9 @@ def test_save_load_json(run):

def test_save_load_trec(run):
Run(run).save("tests/unit/ranx/test_data/run.trec")
run = Run.from_file("tests/unit/ranx/test_data/run.trec")
run = Run.from_file("tests/unit/ranx/test_data/run.trec", name="test_run")

assert run.name == "test_run"
assert len(run.run) == 2
assert len(run.run["q1"]) == 3
assert len(run.run["q2"]) == 2
Expand All @@ -230,8 +232,9 @@ def test_save_load_trec(run):


def test_load_gzipped_trec(run):
run = Run.from_file("tests/unit/ranx/test_data/run.trec.gz")
run = Run.from_file("tests/unit/ranx/test_data/run.trec.gz", name="test_run")

assert run.name == "test_run"
assert len(run.run) == 2
assert len(run.run["q1"]) == 3
assert len(run.run["q2"]) == 2
Expand Down Expand Up @@ -269,8 +272,9 @@ def test_from_dict():
},
}

run = Run.from_dict(run_py)
run = Run.from_dict(run_py, name="test_run")

assert run.name == "test_run"
assert len(run.run) == 2
assert len(run.run["q1"]) == 3
assert len(run.run["q2"]) == 2
Expand All @@ -290,8 +294,9 @@ def test_from_dataframe():
}
)

run = Run.from_df(df)
run = Run.from_df(df, name="test_run")

assert run.name == "test_run"
assert len(run.run) == 2
assert len(run.run["q1"]) == 3
assert len(run.run["q2"]) == 2
Expand All @@ -303,8 +308,9 @@ def test_from_dataframe():


def test_from_parquet():
run = Run.from_parquet("tests/unit/ranx/test_data/run.parquet")
run = Run.from_parquet("tests/unit/ranx/test_data/run.parquet", name="test_run")

assert run.name == "test_run"
assert len(run.run) == 2
assert len(run.run["q1"]) == 3
assert len(run.run["q2"]) == 2
Expand Down

0 comments on commit ae2550b

Please sign in to comment.