Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Missing Some Pandas Abstract Methods on GeoArray #1445

Closed
stdavis opened this issue Jan 19, 2023 · 3 comments
Closed

Missing Some Pandas Abstract Methods on GeoArray #1445

stdavis opened this issue Jan 19, 2023 · 3 comments
Assignees

Comments

@stdavis
Copy link

stdavis commented Jan 19, 2023

Describe the bug
I'm unable to compare spatial dataframes using pandas.DataFrame.eq.

To Reproduce
Steps to reproduce the behavior:

sdf.eq(another_sdf)

error:

pandas.errors.AbstractMethodError: This method must be defined in the concrete class GeoArray

Screenshots
n/a

Expected behavior
I expected to be able to use this method to compare the data in two spatial dataframes.

Platform (please complete the following information):

  • OS: macOS
  • Browser: n/a
  • Python API Version: 2.0.1

Additional context
n/a

@nanaeaubry
Copy link
Contributor

Hi @stdavis we are still working on this.

In the meantime, someone provided a helper function they wrote for the same purpose:

def compare_df(new_df, old_df, match_field, columns: bool = True):
    """Summarise the differences between two dataframes keyed on ``match_field``.

    Args:
        new_df: The newer dataframe.
        old_df: The older dataframe to compare against.
        match_field: Name of the column used to pair rows between the frames.
        columns: When true, also report columns that exist in only one frame.

    Returns:
        A dict with the keys ``added_rows``, ``deleted_rows``,
        ``added_columns``, ``deleted_columns`` and ``modified_rows``.

        * ``added_columns`` / ``deleted_columns`` map a column name to a copy
          of ``[match_field, column]`` taken from the frame that has it.
        * ``added_rows`` / ``deleted_rows`` are dataframes holding the rows
          whose ``match_field`` value occurs only in ``new_df`` / ``old_df``.
        * ``modified_rows`` is a dataframe of the ``new_df`` rows whose key
          exists in both frames but that have no identical counterpart in
          ``old_df`` across the columns the two frames share.

        When both inputs are empty an error is logged and every entry is
        left as an empty dict.
    """
    diff = {
        "added_rows": {},
        "deleted_rows": {},
        "added_columns": {},
        "deleted_columns": {},
        "modified_rows": {},
    }

    if old_df.empty and new_df.empty:
        logger.error("Both dataframes are empty, cannot compate two empty dataframes")
        return diff

    # If exactly one side is empty, replace it with a placeholder frame that
    # borrows the other side's columns and index so the merges below have
    # matching labels to work with.
    if old_df.empty:
        old_df = pd.DataFrame(data=None, columns=new_df.columns, index=new_df.index)
    elif new_df.empty:
        new_df = pd.DataFrame(data=None, columns=old_df.columns, index=old_df.index)

    if columns:
        for column in new_df.columns:
            if column not in old_df.columns:
                diff["added_columns"][column] = new_df[[match_field, column]].copy()

        for column in old_df.columns:
            if column not in new_df.columns:
                diff["deleted_columns"][column] = old_df[[match_field, column]].copy()

    def _resolve_suffixes(frame, keep, discard):
        """Drop columns ending in ``discard`` and cut ``keep`` off the rest."""
        # Only string labels can carry a merge suffix; anything else is
        # left untouched instead of crashing on ``.endswith``.
        labels = [label for label in frame.columns if isinstance(label, str)]
        frame = frame.drop(
            columns=[label for label in labels if label.endswith(discard)]
        )
        # Slice the suffix off. ``str.rstrip`` must not be used here: it
        # strips a *set of characters*, so "name_new".rstrip("_new") gives
        # "nam" and "id".rstrip("_old") gives "i".
        return frame.rename(
            columns={
                label: label[: -len(keep)]
                for label in labels
                if label.endswith(keep)
            }
        )

    # Outer-join on the key; the "_merge" indicator tells us which side(s)
    # each key came from.
    merged_rows = new_df.merge(
        old_df,
        on=match_field,
        how="outer",
        indicator=True,
        suffixes=("_new", "_old"),
    )

    # Keys found only in new_df -> added rows (keep the "_new" values).
    added_rows = merged_rows[merged_rows["_merge"] == "left_only"].drop(
        columns=["_merge"]
    )
    diff["added_rows"] = _resolve_suffixes(added_rows, keep="_new", discard="_old")

    # Keys found only in old_df -> deleted rows (keep the "_old" values).
    deleted_rows = merged_rows[merged_rows["_merge"] == "right_only"].drop(
        columns=["_merge"]
    )
    diff["deleted_rows"] = _resolve_suffixes(
        deleted_rows, keep="_old", discard="_new"
    )

    # Keys present on both sides are candidates for "modified".
    common_rows_match_field_list = merged_rows[merged_rows["_merge"] == "both"][
        match_field
    ].to_list()

    common_rows_new = new_df[new_df[match_field].isin(common_rows_match_field_list)]
    common_rows_old = old_df[old_df[match_field].isin(common_rows_match_field_list)]

    # With on=None the merge joins on every column the frames share, so a
    # new row that differs from its old version in any shared column has no
    # partner and shows up as "left_only".
    merged_common_rows = common_rows_new.merge(
        common_rows_old,
        on=None,
        how="outer",
        indicator=True,
    )

    modified_rows = merged_common_rows[
        merged_common_rows["_merge"] == "left_only"
    ].drop(columns=["_merge"])
    diff["modified_rows"] = modified_rows

    return diff

@nanaeaubry
Copy link
Contributor

We have added an eq to our GeoAccessor for the next release.

You will have to make sure to use the spatial namespace to do it :) We also implemented a version of the compare method above in the GeoAccessor class.

Hopefully this will be helpful for your workflows.

A beta release will be made in July for 2.2.0

@rperendy
Copy link

rperendy commented May 1, 2024

@nanaeaubry I just wanted to add that I had to make some slight modifications for compare_df.

  1. The column.rstrip() calls for "_new" or "_old" strip a set of characters rather than a suffix, so in added_rows they would also remove an "e" if the column name had an "e" right before "_new"; likewise for an "o" before "_old" in deleted_rows.
  2. The merge works when the old and new dfs have the same column names and either one is empty. However, if that is not the case and the geometry column in either has nulls for some rows, it would throw an error.
  3. If either df is empty, common_rows_match_field_list will be an empty list, but building merged_common_rows was throwing errors.

This is what I edited to make it work for me:

def replace_none_geometries(df, geom_columns, placeholder):
    """Swap ``None`` cells in the given columns for ``placeholder``.

    Args:
        df: Dataframe to patch; its columns are reassigned in place.
        geom_columns: Iterable of column names to process.
        placeholder: Value written wherever a cell is exactly ``None``.

    Returns:
        The same ``df`` object that was passed in.
    """

    def _fill(value):
        # Identity check on purpose: only a literal None is replaced.
        return placeholder if value is None else value

    for name in geom_columns:
        df[name] = df[name].apply(_fill)
    return df

def check_geometries(df):
    """Replace ``None`` values in every 'geometry'-dtype column with ``{}``.

    Columns are selected by comparing each column's dtype to the string
    'geometry'. If none match, ``df`` is returned without any processing;
    otherwise the result of ``replace_none_geometries`` is returned.
    """
    geometry_columns = [
        name for name in df.columns if df[name].dtype == 'geometry'
    ]
    if geometry_columns:
        df = replace_none_geometries(df, geometry_columns, {})
    return df

def compare_df(new_df, old_df, match_field, columns: bool = True):
    """Report row and column differences between two dataframes.

    Rows are paired through the ``match_field`` column.

    Args:
        new_df: The newer dataframe.
        old_df: The older dataframe to compare against.
        match_field: Name of the column used to pair rows between the frames.
        columns: When true, also report columns that exist in only one frame.

    Returns:
        A dict with the keys ``added_rows``, ``deleted_rows``,
        ``added_columns``, ``deleted_columns`` and ``modified_rows``.

        * ``added_columns`` / ``deleted_columns`` map a column name to a copy
          of ``[match_field, column]`` taken from the frame that has it.
        * ``added_rows`` / ``deleted_rows`` are dataframes holding the rows
          whose ``match_field`` value occurs only in ``new_df`` / ``old_df``.
        * ``modified_rows`` is a dataframe of the ``new_df`` rows whose key
          exists in both frames but that have no identical counterpart in
          ``old_df`` across the shared columns. It stays an empty dict when
          the frames share no keys.

        When both inputs are empty an error is logged and every entry is
        left as an empty dict.
    """
    diff = {
        "added_rows": {},
        "deleted_rows": {},
        "added_columns": {},
        "deleted_columns": {},
        "modified_rows": {},
    }

    # Geometry clean-up is only requested when the frames differ in their
    # number of columns.
    check_geometry = len(old_df.columns) != len(new_df.columns)

    if old_df.empty and new_df.empty:
        logger.error("Both dataframes are empty, cannot compate two empty dataframes")
        return diff

    # If exactly one side is empty, replace it with a placeholder frame that
    # borrows the other side's columns and index; the populated side is run
    # through check_geometries first when check_geometry is set.
    if old_df.empty:
        if check_geometry:
            new_df = check_geometries(new_df)
        old_df = pd.DataFrame(data=None, columns=new_df.columns, index=new_df.index)
    elif new_df.empty:
        if check_geometry:
            old_df = check_geometries(old_df)
        new_df = pd.DataFrame(data=None, columns=old_df.columns, index=old_df.index)

    if columns:
        for column in new_df.columns:
            if column not in old_df.columns:
                diff["added_columns"][column] = new_df[[match_field, column]].copy()

        for column in old_df.columns:
            if column not in new_df.columns:
                diff["deleted_columns"][column] = old_df[[match_field, column]].copy()

    # Outer-join on the key; the "_merge" indicator tells us which side(s)
    # each key came from.
    merged_rows = new_df.merge(
        old_df,
        on=match_field,
        how="outer",
        indicator=True,
        suffixes=("_new", "_old"),
    )

    # Keys found only in new_df -> added rows.
    added_rows = merged_rows[merged_rows["_merge"] == "left_only"].drop(
        columns=["_merge"]
    )
    for column in added_rows.columns:
        # Only string labels can carry a merge suffix.
        if not isinstance(column, str):
            continue
        if column.endswith("_old"):
            added_rows = added_rows.drop(columns=[column])
        elif column.endswith("_new"):
            # Slice rather than rstrip: rstrip removes a character set and
            # would also eat trailing letters of the real column name.
            added_rows = added_rows.rename(
                columns={column: column[: -len("_new")]}
            )
    diff["added_rows"] = added_rows

    # Keys found only in old_df -> deleted rows.
    deleted_rows = merged_rows[merged_rows["_merge"] == "right_only"].drop(
        columns=["_merge"]
    )
    for column in deleted_rows.columns:
        if not isinstance(column, str):
            continue
        if column.endswith("_new"):
            deleted_rows = deleted_rows.drop(columns=[column])
        elif column.endswith("_old"):
            deleted_rows = deleted_rows.rename(
                columns={column: column[: -len("_old")]}
            )
    diff["deleted_rows"] = deleted_rows

    # Keys present on both sides are candidates for "modified".
    common_rows_match_field_list = merged_rows[merged_rows["_merge"] == "both"][
        match_field
    ].to_list()

    # Skip the second merge entirely when no key is shared.
    if common_rows_match_field_list:
        common_rows_new = new_df[new_df[match_field].isin(common_rows_match_field_list)]
        common_rows_old = old_df[old_df[match_field].isin(common_rows_match_field_list)]

        # With on=None the merge joins on every column the frames share, so
        # a new row that differs from its old version in any shared column
        # has no partner and shows up as "left_only".
        merged_common_rows = common_rows_new.merge(
            common_rows_old,
            on=None,
            how="outer",
            indicator=True,
        )

        modified_rows = merged_common_rows[
            merged_common_rows["_merge"] == "left_only"
        ].drop(columns=["_merge"])
        diff["modified_rows"] = modified_rows

    return diff

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

4 participants