From 4602179a54556aebe85e83bd4506f31560e2528a Mon Sep 17 00:00:00 2001 From: mahaalbashir Date: Tue, 31 Oct 2023 11:39:12 +0000 Subject: [PATCH 1/3] updating pivot_table --- CHANGELOG.md | 9 +++++++++ acro/acro_tables.py | 27 +++++++++++++++++++++++++++ test/test_initial.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 231a96d..2d819e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ ## Development Changes: +* Update table suppression when totals are true for pivot table ([#165](https://github.com/AI-SDC/ACRO/pull/165)) +* Fix the problem of shape mismatch when there are two columns and the aggfunc is count or sum ([#167](https://github.com/AI-SDC/ACRO/pull/167)) +* Remove all files and folders created during testing ([#168](https://github.com/AI-SDC/ACRO/pull/168)) +* Create an example notebook with simple examples of acro ([#170](https://github.com/AI-SDC/ACRO/pull/170)) +* Add support for histogram ([#176](https://github.com/AI-SDC/ACRO/pull/176)) +* Add inherited members from acro_tables and acro_regression to the sphinx docs ([#177](https://github.com/AI-SDC/ACRO/pull/177)) +* Update the R help function ([#178](https://github.com/AI-SDC/ACRO/pull/178)) +* Update the finalise function by checking the provided folder name and ask for new one if it exists ([#179](https://github.com/AI-SDC/ACRO/pull/179)) +* Add histogram and survival analysis to R ([#182](https://github.com/AI-SDC/ACRO/pull/182)) ## Version 0.4.3 (Sep 22, 2023) diff --git a/acro/acro_tables.py b/acro/acro_tables.py index 0318f36..7988e0f 100644 --- a/acro/acro_tables.py +++ b/acro/acro_tables.py @@ -244,6 +244,9 @@ def pivot_table( # pylint: disable=too-many-arguments,too-many-locals (hierarchical indexes) on the index and columns of the result DataFrame. + To provide consistent behaviour with different aggregation functions, + 'empty' rows or columns -i.e. that are all NaN or 0 (count,sum) are removed. 
+ Parameters ---------- data : DataFrame @@ -307,6 +310,29 @@ def pivot_table( # pylint: disable=too-many-arguments,too-many-locals sort, ) + # delete empty rows and columns from table + deleted_rows = [] + deleted_cols = [] + # define empty columns and rows using boolean masks + empty_cols_mask = table.sum(axis=0) == 0 + empty_rows_mask = table.sum(axis=1) == 0 + + deleted_cols = list(table.columns[empty_cols_mask]) + table = table.loc[:, ~empty_cols_mask] + deleted_rows = list(table.index[empty_rows_mask]) + table = table.loc[~empty_rows_mask, :] + + # create a message with the deleted column's names + comments = [] + if deleted_cols: + msg_cols = ", ".join(str(col) for col in deleted_cols) + comments.append(f"Empty columns: {msg_cols} were deleted.") + if deleted_rows: + msg_rows = ", ".join(str(row) for row in deleted_rows) + comments.append(f"Empty rows: {msg_rows} were deleted.") + if comments: + logger.info(" ".join(comments)) + # suppression masks to apply based on the following checks masks: dict[str, DataFrame] = {} @@ -387,6 +413,7 @@ def pivot_table( # pylint: disable=too-many-arguments,too-many-locals summary=summary, outcome=outcome, output=[table], + comments=comments, ) return table diff --git a/test/test_initial.py b/test/test_initial.py index aecfe63..66a5051 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -169,6 +169,39 @@ def test_pivot_table_cols(data, acro): shutil.rmtree(PATH) +def test_pivot_table_with_aggfunc_sum(data, acro): + """Test the pivot table with two columns and aggfunc sum.""" + acro = ACRO(suppress=False) + _ = acro.pivot_table( + data, + index="year", + columns=["grant_type", "survivor"], + values="inc_grants", + aggfunc="sum", + ) + _ = acro.pivot_table( + data, + index=["grant_type", "survivor"], + columns="year", + values="inc_grants", + aggfunc="sum", + ) + acro.add_exception("output_0", "Let me have it") + acro.add_exception("output_1", "I need this output") + results: Records = acro.finalise(PATH) + 
output_0 = results.get_index(0) + output_1 = results.get_index(1) + comment_0 = ( + "Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted." + ) + comment_1 = ( + "Empty rows: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted." + ) + assert output_0.comments == [comment_0] + assert output_1.comments == [comment_1] + shutil.rmtree(PATH) + + def test_ols(data, acro): """Ordinary Least Squares test.""" new_df = data[["inc_activity", "inc_grants", "inc_donations", "total_costs"]] From ac000044c66af0a2e443f9b48528798f43bfe585 Mon Sep 17 00:00:00 2001 From: mahaalbashir Date: Tue, 31 Oct 2023 11:51:53 +0000 Subject: [PATCH 2/3] fixing pylint issue --- acro/acro_tables.py | 70 +++++++++------------- notebooks/test.ipynb | 135 ------------------------------------------- 2 files changed, 28 insertions(+), 177 deletions(-) diff --git a/acro/acro_tables.py b/acro/acro_tables.py index 7988e0f..07e6115 100644 --- a/acro/acro_tables.py +++ b/acro/acro_tables.py @@ -137,27 +137,7 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals normalize, ) # delete empty rows and columns from table - deleted_rows = [] - deleted_cols = [] - # define empty columns and rows using boolean masks - empty_cols_mask = table.sum(axis=0) == 0 - empty_rows_mask = table.sum(axis=1) == 0 - - deleted_cols = list(table.columns[empty_cols_mask]) - table = table.loc[:, ~empty_cols_mask] - deleted_rows = list(table.index[empty_rows_mask]) - table = table.loc[~empty_rows_mask, :] - - # create a message with the deleted column's names - comments = [] - if deleted_cols: - msg_cols = ", ".join(str(col) for col in deleted_cols) - comments.append(f"Empty columns: {msg_cols} were deleted.") - if deleted_rows: - msg_rows = ", ".join(str(row) for row in deleted_rows) - comments.append(f"Empty rows: {msg_rows} were deleted.") - if comments: - logger.info(" ".join(comments)) + table, comments = delete_empty_rows_columns(table) masks = create_crosstab_masks( index, @@ 
-311,27 +291,7 @@ def pivot_table( # pylint: disable=too-many-arguments,too-many-locals ) # delete empty rows and columns from table - deleted_rows = [] - deleted_cols = [] - # define empty columns and rows using boolean masks - empty_cols_mask = table.sum(axis=0) == 0 - empty_rows_mask = table.sum(axis=1) == 0 - - deleted_cols = list(table.columns[empty_cols_mask]) - table = table.loc[:, ~empty_cols_mask] - deleted_rows = list(table.index[empty_rows_mask]) - table = table.loc[~empty_rows_mask, :] - - # create a message with the deleted column's names - comments = [] - if deleted_cols: - msg_cols = ", ".join(str(col) for col in deleted_cols) - comments.append(f"Empty columns: {msg_cols} were deleted.") - if deleted_rows: - msg_rows = ", ".join(str(row) for row in deleted_rows) - comments.append(f"Empty rows: {msg_rows} were deleted.") - if comments: - logger.info(" ".join(comments)) + table, comments = delete_empty_rows_columns(table) # suppression masks to apply based on the following checks masks: dict[str, DataFrame] = {} @@ -845,6 +805,32 @@ def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals return masks +def delete_empty_rows_columns(table): + """Deletes empty rows and columns from table.""" + deleted_rows = [] + deleted_cols = [] + # define empty columns and rows using boolean masks + empty_cols_mask = table.sum(axis=0) == 0 + empty_rows_mask = table.sum(axis=1) == 0 + + deleted_cols = list(table.columns[empty_cols_mask]) + table = table.loc[:, ~empty_cols_mask] + deleted_rows = list(table.index[empty_rows_mask]) + table = table.loc[~empty_rows_mask, :] + + # create a message with the deleted column's names + comments = [] + if deleted_cols: + msg_cols = ", ".join(str(col) for col in deleted_cols) + comments.append(f"Empty columns: {msg_cols} were deleted.") + if deleted_rows: + msg_rows = ", ".join(str(row) for row in deleted_rows) + comments.append(f"Empty rows: {msg_rows} were deleted.") + if comments: + logger.info(" 
".join(comments)) + return (table, comments) + + def rounded_survival_table(survival_table): """Calculates the rounded surival function.""" death_censored = ( diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb index 3a30abe..9283b13 100644 --- a/notebooks/test.ipynb +++ b/notebooks/test.ipynb @@ -1657,141 +1657,6 @@ "table" ] }, - { - "cell_type": "code", - "execution_count": 17, - "id": "3f016823", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
year201020112012201320142015All
grant_type
G138906688.0127533696.0171878704.0203357200.0206222208.0133601200.09.814997e+08
N0.07192804.07779685.08728330.07858697.08501187.04.006070e+07
R504137056.0532464704.0480105472.0511361408.0554594176.0551457280.03.134120e+09
R/G46544000.0128380000.0134480000.0134125000.0142766000.0146228992.07.325240e+08
All689587776.0795571264.0794243904.0857571968.0911441088.0839788672.04.888204e+09
\n", - "
" - ], - "text/plain": [ - "year 2010 2011 2012 2013 2014 \\\n", - "grant_type \n", - "G 138906688.0 127533696.0 171878704.0 203357200.0 206222208.0 \n", - "N 0.0 7192804.0 7779685.0 8728330.0 7858697.0 \n", - "R 504137056.0 532464704.0 480105472.0 511361408.0 554594176.0 \n", - "R/G 46544000.0 128380000.0 134480000.0 134125000.0 142766000.0 \n", - "All 689587776.0 795571264.0 794243904.0 857571968.0 911441088.0 \n", - "\n", - "year 2015 All \n", - "grant_type \n", - "G 133601200.0 9.814997e+08 \n", - "N 8501187.0 4.006070e+07 \n", - "R 551457280.0 3.134120e+09 \n", - "R/G 146228992.0 7.325240e+08 \n", - "All 839788672.0 4.888204e+09 " - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table = pd.crosstab(\n", - " index=df[\"grant_type\"],\n", - " columns=df[\"year\"],\n", - " values=df[\"inc_grants\"],\n", - " aggfunc=\"sum\",\n", - " margins=True,\n", - ")\n", - "table" - ] - }, { "cell_type": "code", "execution_count": 18, From eddc20ee7d9beaf1dedaad05d5afbe990ec64930 Mon Sep 17 00:00:00 2001 From: mahaalbashir Date: Tue, 31 Oct 2023 13:10:37 +0000 Subject: [PATCH 3/3] adding docstring --- acro/acro_tables.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/acro/acro_tables.py b/acro/acro_tables.py index 07e6115..5f5e738 100644 --- a/acro/acro_tables.py +++ b/acro/acro_tables.py @@ -805,8 +805,21 @@ def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals return masks -def delete_empty_rows_columns(table): - """Deletes empty rows and columns from table.""" +def delete_empty_rows_columns(table: DataFrame) -> tuple[DataFrame, list[str]]: + """Deletes empty rows and columns from table. + + Parameters + ---------- + table : DataFrame + The table where the empty rows and columns will be deleted from. + + Returns + ------- + DataFrame + The resulting table where the empty columns and rows were deleted. 
+ list[str] + Comments naming the deleted columns and rows; empty if nothing was deleted. + """ + deleted_rows = [] deleted_cols = [] # define empty columns and rows using boolean masks