In [1]:
import pandas as pd
import numpy as np
# import bioframe
from akita_utils.tsv_gen_utils import filter_by_overlap

In [2]:
def test_filter_by_overlap():
    
    # sub-test 1 : testing if counting works

    df1 = pd.DataFrame(
        [
            ["chr1", 50, 60],
            ["chrX", 400, 410],
            ["chr1", 500, 510],
            ["chr1", 520, 530],
            ["chr1", 560, 570],
        ],
        columns=["chrom", "start", "end"],
    )
    df2 = pd.DataFrame(
        [
            ["chrX", 400, 410],
            ["chr1", 500, 510],
            ["chr1", 500, 510],
            ["chr1", 520, 530],
            ["chr1", 560, 570],
            ["chr1", 1500, 1510],
        ],
        columns=["chrom", "start", "end"],
    )

    test1_out = filter_by_overlap(df1,
                                    df2,
                                    exclude_window=0,
                                    overlap_threshold=1)

    assert len(test1_out) == 4, "sub-test 1 failed"
    # the only row that is filtered out is: ["chr1", 500, 510]
    # since it repeats twice in the df2
    
    
    # sub-test 2 : checking if expanding the window by 30 bp works

    df1 = pd.DataFrame(
        [
            ["chr1", 50, 60],
            ["chr1", 500, 510],
            ["chr1", 560, 570],
        ],
        columns=["chrom", "start", "end"],
    )
    df2 = pd.DataFrame(
        [
            ["chr1", 520, 530],
        ],
        columns=["chrom", "start", "end"],
    )


    test2_out = filter_by_overlap(df1,
                                df2,
                                exclude_window=30)

    assert len(test2_out) == 2, "sub-test 2 failed"
    # only the middle, ["chr1", 500, 510], row should be filtered out
    
    
    # sub-test 3 : checking if expanding the window by 40 bp works

    df1 = pd.DataFrame(
        [
            ["chrX", 400, 410],
            ["chr1", 459, 469],
            ["chr1", 520, 530],
            ["chr1", 560, 570],
            ["chr1", 1500, 1510],
        ],
        columns=["chrom", "start", "end"],
    )
    df2 = pd.DataFrame(
        [
            ["chr1", 500, 510],
        ],
        columns=["chrom", "start", "end"],
    )

    test3_out = filter_by_overlap(df1,
                df2,
                exclude_window=60)

    assert len(test3_out) == 2, "sub-test 3 failed"
    # the only left df1 rows are the first and the last 
    # since they do not overlap expanded window od df2
    
    
    # sub-test 4 : as sub-test 3, but we're checking the col names

    df1 = pd.DataFrame(
        [
            ["chrX", 400, 410],
            ["chr1", 459, 469],
            ["chr1", 520, 530],
            ["chr1", 560, 570],
            ["chr1", 1500, 1510],
        ],
        columns=["alt_chrom", "alt_start", "alt_end"],
    )
    df2 = pd.DataFrame(
        [
            ["chr1", 500, 510],
        ],
        columns=["chrom", "start", "end"],
    )

    test4_out = filter_by_overlap(df1,
                df2,
                working_df_cols=["alt_chrom", "alt_start", "alt_end"])

    assert len(test4_out) == 2, "sub-test 4 failed"
    # same as in sub-test 3;
    # checking if column names' renaming works as expected
    

In [3]:
# Run all four sub-tests; a failed check raises AssertionError ("sub-test N failed").
test_filter_by_overlap()