diff --git a/CHANGELOG.md b/CHANGELOG.md index 231a96d..2d819e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ ## Development Changes: +* Update table suppression when totals are true for pivot table ([#165](https://github.com/AI-SDC/ACRO/pull/165)) +* Fix the problem of shape mismatch when there are two columns and the aggfunc is count or sum ([#167](https://github.com/AI-SDC/ACRO/pull/167)) +* Remove all files and folders created during testing ([#168](https://github.com/AI-SDC/ACRO/pull/168)) +* Create an example notebook with simple examples of acro ([#170](https://github.com/AI-SDC/ACRO/pull/170)) +* Add support for histogram ([#176](https://github.com/AI-SDC/ACRO/pull/176)) +* Add inherited members from acro_tables and acro_regression to the sphinx docs ([#177](https://github.com/AI-SDC/ACRO/pull/177)) +* Update the R help function ([#178](https://github.com/AI-SDC/ACRO/pull/178)) +* Update the finalise function to check the provided folder name and ask for a new one if it already exists ([#179](https://github.com/AI-SDC/ACRO/pull/179)) +* Add histogram and survival analysis to R ([#182](https://github.com/AI-SDC/ACRO/pull/182)) ## Version 0.4.3 (Sep 22, 2023) diff --git a/acro/acro_tables.py b/acro/acro_tables.py index 295d746..ddc72ea 100644 --- a/acro/acro_tables.py +++ b/acro/acro_tables.py @@ -137,27 +137,7 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals normalize, ) # delete empty rows and columns from table - deleted_rows = [] - deleted_cols = [] - # define empty columns and rows using boolean masks - empty_cols_mask = table.sum(axis=0) == 0 - empty_rows_mask = table.sum(axis=1) == 0 - - deleted_cols = list(table.columns[empty_cols_mask]) - table = table.loc[:, ~empty_cols_mask] - deleted_rows = list(table.index[empty_rows_mask]) - table = table.loc[~empty_rows_mask, :] - - # create a message with the deleted column's names - comments = [] - if deleted_cols: - msg_cols = ", ".join(str(col) for col in deleted_cols) - 
def delete_empty_rows_columns(table: DataFrame) -> tuple[DataFrame, list[str]]:
    """Delete empty rows and columns from a table.

    A row or column counts as empty when every one of its cells is either
    NaN or exactly 0. Emptiness is decided cell by cell rather than by
    comparing the row/column sum with 0, so that a row or column whose
    non-zero values happen to cancel out (e.g. 5 and -5) is kept.

    Parameters
    ----------
    table : DataFrame
        The table where the empty rows and columns will be deleted from.

    Returns
    -------
    DataFrame
        The resulting table where the empty columns and rows were deleted.
    list[str]
        Comments naming the deleted columns and rows; an empty list when
        nothing was deleted.
    """
    # a cell is blank when it is missing or exactly zero
    blank_cells = table.isna() | (table == 0)
    # both masks are computed on the full table, before anything is removed
    empty_cols_mask = blank_cells.all(axis=0)
    empty_rows_mask = blank_cells.all(axis=1)

    deleted_cols = list(table.columns[empty_cols_mask])
    deleted_rows = list(table.index[empty_rows_mask])
    table = table.loc[:, ~empty_cols_mask]
    table = table.loc[~empty_rows_mask, :]

    # create one message per axis naming what was deleted
    comments: list[str] = []
    if deleted_cols:
        msg_cols = ", ".join(str(col) for col in deleted_cols)
        comments.append(f"Empty columns: {msg_cols} were deleted.")
    if deleted_rows:
        msg_rows = ", ".join(str(row) for row in deleted_rows)
        comments.append(f"Empty rows: {msg_rows} were deleted.")
    if comments:
        logger.info(" ".join(comments))
    return table, comments
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
year201020112012201320142015All
grant_type
G138906688.0127533696.0171878704.0203357200.0206222208.0133601200.09.814997e+08
N0.07192804.07779685.08728330.07858697.08501187.04.006070e+07
R504137056.0532464704.0480105472.0511361408.0554594176.0551457280.03.134120e+09
R/G46544000.0128380000.0134480000.0134125000.0142766000.0146228992.07.325240e+08
All689587776.0795571264.0794243904.0857571968.0911441088.0839788672.04.888204e+09
\n", - "
def test_pivot_table_with_aggfunc_sum(data, acro):
    """Test the pivot table with two columns and aggfunc sum.

    Two pivot tables are built with the same grouping variables, once as
    columns and once as the index, and the comments recorded for each
    output are checked to name the deleted empty columns / rows.
    """
    # the fixture instance is replaced by one created with suppress=False
    acro = ACRO(suppress=False)
    try:
        _ = acro.pivot_table(
            data,
            index="year",
            columns=["grant_type", "survivor"],
            values="inc_grants",
            aggfunc="sum",
        )
        _ = acro.pivot_table(
            data,
            index=["grant_type", "survivor"],
            columns="year",
            values="inc_grants",
            aggfunc="sum",
        )
        acro.add_exception("output_0", "Let me have it")
        acro.add_exception("output_1", "I need this output")
        results: Records = acro.finalise(PATH)
        output_0 = results.get_index(0)
        output_1 = results.get_index(1)
        comment_0 = (
            "Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted."
        )
        comment_1 = (
            "Empty rows: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted."
        )
        assert output_0.comments == [comment_0]
        assert output_1.comments == [comment_1]
    finally:
        # always remove the output folder, even when an assertion fails, so a
        # leftover PATH cannot interfere with other tests; ignore_errors covers
        # the case where the folder was never created
        shutil.rmtree(PATH, ignore_errors=True)