# DATASET CURATION - MASKED ROI PROJECT


**Objectives**: 

To create the following groups:
1. **Positive group**: BIRADS 0 that became BIRADS 3, 4, 5, 6 in the subsequent diagnostic study
2. **Negative group**: BIRADS 1, 2 and BIRADS 0 that became BIRADS 1, 2 in the subsequent diagnostic study


## 1. Prep

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from IPython.display import display

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)

In [2]:
def get_stats(df, suffix=None):
    """Provides a quick summary of a dataframe."""
    try:
        print(f"DF shape: {df.shape}")
        print(f"# Patients: {df.empi_anon.nunique()}")
        print(f"# Cases: {df.acc_anon.nunique()}\n")
        print(f"# Images: {df.png_path.nunique()}\n")
    except Exception as e:
        print(e)

In [3]:
# EMBED
metadata_full = pd.read_csv("/data/mammo/tables/metadata_all_cohort_with_ROI_HITI.csv", dtype=str)
magview_full = pd.read_csv("/data/mammo/tables/magview_all_cohorts_anon_HITI.csv", dtype=str)

In [4]:
# Selecting the following columns

meta_cols = [
    "empi_anon",
    "acc_anon",
    "ImageLateralityFinal",
    "ViewPosition",
    "study_date_anon",
    "FinalImageType",
    "png_path",
    "StudyDescription",
    "match_level",
    "num_roi",
    "ROI_coords"
]

mag_cols = [
    "empi_anon",
    "acc_anon",
    "study_date_anon",
    "desc",
    "side",
    "asses",
    "path_severity",
    "bside",
    'procdate_anon',
    'pdate_anon',
]

In [5]:
metadata = metadata_full[meta_cols].copy()
magview = magview_full[mag_cols].copy()

In [6]:
metadata.study_date_anon = pd.to_datetime(metadata.study_date_anon)
magview.study_date_anon = pd.to_datetime(magview.study_date_anon)

In [7]:
metadata.num_roi = metadata.num_roi.astype(int)

## 2. METADATA: 2D MLO & CC

In [8]:
# EMBED 2D (MLO and CC)
meta_2d = metadata.loc[(metadata.FinalImageType=="2D") & (metadata.ViewPosition.isin(["MLO", "CC"]))]
get_stats(meta_2d)

DF shape: (1645747, 11)
# Patients: 111988
# Cases: 355306

# Images: 1645747



In [9]:
def get_image_stats(df):
    """Provides a quick summary of the number of unique images and the ROIs."""
    temp_df = pd.merge(df, meta_2d, on=["empi_anon", "acc_anon"], how="left")
    temp_df = temp_df.loc[
        (temp_df.side==temp_df.ImageLateralityFinal)
    ]
    temp_df.drop_duplicates(subset="png_path", inplace=True)
    print(f"# PNG PATH: {int(temp_df.png_path.nunique())}")
    print(f"# ROI: {int(temp_df.num_roi.sum())}")
    print(f"{temp_df.num_roi.value_counts()}")
    del temp_df

## 3. Screening

In [10]:
# SCREENING
screening_magview = magview.loc[magview.desc.str.contains("screen", case=False)].copy()
get_stats(screening_magview)

DF shape: (311129, 10)
# Patients: 103704
# Cases: 295885

'DataFrame' object has no attribute 'png_path'


### 3.1. Creating entries for the negative contralateral breast in bilateral examinations

```
MAGVIEW only has entries if a finding exists.

This means that if an exam is a bilateral exam and only one of the breast has a finding, the contralateral breast (negative) won't have an entry.

This would be problematic at the time when we need to merge with METADATA, because the contralateral breast would be excluded.

Therefore, we would need to create rows for the negative contralateral breast.
```

In [11]:
def get_exam_laterality(row):
    """A convenient function to get the exam laterality to be used with DF.apply() instead of iterating over each row."""
    if ("bilat" in row.desc.lower()):
        return "B"
    elif ("left" in row.desc.lower()):
        return "L"
    elif ("right" in row.desc.lower()):
        return "R"
    else:
        return None

In [12]:
# Applying the get_exam_laterality function
screening_magview["exam_laterality"] = screening_magview.apply(get_exam_laterality, axis=1)

In [13]:
screening_magview.exam_laterality.value_counts(dropna=False)

B    298848
R      6360
L      5921
Name: exam_laterality, dtype: int64

In [14]:
screening_magview.side.value_counts(dropna=False)

NaN    208681
L       43373
R       43366
B       15709
Name: side, dtype: int64

In [15]:
# side == nan --> B
screening_magview.side = screening_magview.side.fillna("B")

In [16]:
# create copy for assigning B to R
screening_magview_r = screening_magview.loc[screening_magview.side=="B"].copy()
screening_magview_r.side = screening_magview.side.str.replace("B", "R")

# assigning B to L
screening_magview.side = screening_magview.side.str.replace("B", "L")

# appending R and L
screening_magview = pd.concat([screening_magview, screening_magview_r])

In [17]:
print(screening_magview.side.value_counts(dropna=False))
print(screening_magview.shape)

L    267763
R    267756
Name: side, dtype: int64
(535519, 11)


In [18]:
screening_magview = screening_magview.sort_values(["empi_anon", "acc_anon", "study_date_anon"]).drop_duplicates()
screening_magview

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
173182,10000865,3629273638679348,2013-01-07,MG Screening Bilateral,L,B,,,,,B
173182,10000865,3629273638679348,2013-01-07,MG Screening Bilateral,R,B,,,,,B
31918,10000879,6992096043050201,2018-02-16,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
31918,10000879,6992096043050201,2018-02-16,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
42024,10000879,8162137067574239,2020-10-14,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...
26104,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
26104,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
31922,99999564,8832872399780580,2019-02-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
31922,99999564,8832872399780580,2019-02-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B


In [19]:
exam_lat_b = screening_magview.loc[screening_magview.exam_laterality=="B"]
exam_lat_b.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
344811,47494328,2099425055413879,2013-11-21,MG Screening Bilateral w/CAD,L,N,,,,,B
255272,40212230,5767382270904809,2021-05-07,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B


In [20]:
# We want to aggregate all the sides for each bilateral exam so that we can filter those having only a single side.
exam_lat_b_agg = exam_lat_b.groupby('acc_anon')['side'].apply(''.join).reset_index()
exam_lat_b_agg.sample(2)

Unnamed: 0,acc_anon,side
235918,8473976286914433,LR
48434,2539713197479326,LR


In [21]:
exam_lat_b_agg.side.value_counts()

LR         225476
L           26501
R           26219
RL           3200
LL            634
RR            603
LLR           400
LRR           199
RLR           185
LLRR           99
RLL            56
RRL            53
LLL            45
RRR            42
LRLR           28
RLLR           24
LRL            23
RRLL           15
LLLR            8
RRRR            6
RRRL            5
RRLR            4
LLLRRR          4
RRLLL           3
RLLL            3
LRRR            3
LLLL            2
LRLL            2
LRRLR           2
LLRL            2
RRRLL           1
LLLRLR          1
RRLLR           1
LLRRR           1
LLLLLR          1
LRLRR           1
LLLLRR          1
LRRRR           1
LLLRR           1
RLRR            1
LRRLL           1
RRLLLRR         1
RRRLLL          1
RRRRR           1
LRRL            1
LRLRLR          1
Name: side, dtype: int64

In [22]:
exam_lat_b_side_r = exam_lat_b_agg.loc[~(exam_lat_b_agg.side.str.contains("L"))].copy()
exam_lat_b_side_l = exam_lat_b_agg.loc[~(exam_lat_b_agg.side.str.contains("R"))].copy()

In [23]:
screening_magview_right_to_left = screening_magview.loc[screening_magview.acc_anon.isin(exam_lat_b_side_r.acc_anon)].copy().drop_duplicates()
screening_magview_left_to_right = screening_magview.loc[screening_magview.acc_anon.isin(exam_lat_b_side_l.acc_anon)].copy().drop_duplicates()

In [24]:
# Creating the negative Left side
screening_magview_right_to_left.loc[screening_magview_right_to_left.side=="R", "side"] = "L"
screening_magview_right_to_left.loc[screening_magview_right_to_left.side=="L", "asses"] = "N"
screening_magview_right_to_left.loc[screening_magview_right_to_left.side=="L", "path_severity"] = np.nan

screening_magview_right_to_left

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
161468,10001945,8351351741085824,2017-08-02,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
37799,10033806,1069386741434572,2019-10-05,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
42125,10033806,9337281572732866,2020-10-08,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
386461,10043340,8892551633769298,2020-02-06,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
66818,10043985,1960584382049532,2018-04-18,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...
283165,99975030,2279566061761287,2017-08-07,MG Screening Bilateral,L,N,,,,,B
345671,99980000,4817391795521519,2014-02-03,MG Screening Bilateral w/CAD,L,N,,,,,B
155650,99982588,8416878517191964,2018-02-26,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
404779,99991060,5463528102254256,2016-03-08,MG Screening Bilateral,L,N,,R,2016-03-28,2016-03-29 00:00:00,B


In [25]:
# Creating the negative Right side
screening_magview_left_to_right.loc[screening_magview_left_to_right.side=="L", "side"] = "R"
screening_magview_left_to_right.loc[screening_magview_left_to_right.side=="R", "asses"] = "N"
screening_magview_left_to_right.loc[screening_magview_left_to_right.side=="R", "path_severity"] = np.nan

screening_magview_left_to_right

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
266782,10002247,4667123741504672,2015-02-04,MG Screening Bilateral,R,N,,,,,B
391750,10010842,2152357467079057,2014-07-05,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,L,2014-07-18,2014-07-18 00:00:00,B
20834,10015693,1334581155737139,2015-10-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
403309,10019705,1199638332134935,2016-06-08,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
425750,10019705,3384492963323757,2020-07-19,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...
263656,99985035,9268129462152986,2014-09-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
29702,99986224,8107409307566891,2018-05-22,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
240347,99986388,1665558280896604,2017-10-01,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
202020,99997273,2996543584580499,2018-09-13,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B


In [26]:
# Merging the original and the two negative contralaterals
screening_magview_with_contralat = pd.concat([screening_magview, screening_magview_left_to_right, screening_magview_right_to_left]).sort_values(["empi_anon", "acc_anon", "study_date_anon"]).drop_duplicates()
screening_magview_with_contralat.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
296187,44132453,1473567328233293,2019-08-11,MG Screening Bilateral,L,N,,,,,B
322144,20812558,3290510072755021,2016-03-01,MG Screening Bilateral,R,N,,,,,B


In [27]:
get_image_stats(screening_magview_with_contralat)

# PNG PATH: 1298290
# ROI: 25879
0.0    1274855
1.0      21185
2.0       2060
3.0        186
4.0          4
Name: num_roi, dtype: int64


### 3.2. BIRADS 0

In [28]:
b0 = screening_magview_with_contralat.loc[screening_magview_with_contralat.asses.isin(["A"])]

get_stats(b0)
get_image_stats(b0)

DF shape: (56790, 11)
# Patients: 40331
# Cases: 46292

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 123773
# ROI: 25308
0.0    100868
1.0     20687
2.0      2037
3.0       177
4.0         4
Name: num_roi, dtype: int64


### 3.3. BIRADS 1, 2

In [29]:
b12 = screening_magview_with_contralat.loc[screening_magview_with_contralat.asses.isin(["B", "N"])]

get_stats(b12)
get_image_stats(b12)

DF shape: (526770, 11)
# Patients: 99792
# Cases: 286134

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 1177163
# ROI: 1006
0.0    1176248
1.0        840
2.0         59
3.0         16
Name: num_roi, dtype: int64


## 4. Diagnostic

In [30]:
diag_magview = magview.loc[magview.desc.str.contains('diag', case=False)]

get_stats(diag_magview)
print()
print(f"Asses Counts:\n{diag_magview.asses.value_counts()}")

DF shape: (117842, 10)
# Patients: 49420
# Cases: 87494

'DataFrame' object has no attribute 'png_path'

Asses Counts:
B    45655
P    28500
N    21452
S    15455
A     2908
K     2094
M     1667
X      111
Name: asses, dtype: int64


## 5. Screening BIRADS 0 and Diagnostic

In [31]:
b0_dx = pd.merge(b0, diag_magview, on='empi_anon', suffixes=[None, "_dx"])
b0_dx = b0_dx.loc[
    (b0_dx.side==b0_dx.side_dx)
    | (b0_dx.side_dx=="B")
    | (b0_dx.side_dx.isna())
]

In [32]:
# Getting only subsequent diagnostic studies within 3 months
b0_dx["delta_date_dx"] = (b0_dx.study_date_anon_dx - b0_dx.study_date_anon).dt.days
b0_dx_3mo = b0_dx.loc[b0_dx.delta_date_dx.isin(range(0, 91))]
b0_dx_3mo.sample(1)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx
111674,99071319,4291883356856064,2018-02-08,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,A,,,,,B,9034500297208086,2018-02-26,MG Diagnostic Right w/CAD,R,B,,,,,18


### 5.1. BIRADS 0 (Screening) --> BIRADS 1, 2 (Diagnostic)

In [33]:
b0_12dx = b0_dx_3mo.loc[b0_dx_3mo.asses_dx.isin(["N", "B"])].copy()
get_stats(b0_12dx)
get_image_stats(b0_12dx)

DF shape: (19967, 21)
# Patients: 15602
# Cases: 16961

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 40123
# ROI: 8519
0.0    32591
1.0     6607
2.0      864
3.0       60
4.0        1
Name: num_roi, dtype: int64


### 5.2. BIRADS 0 (Screening) --> BIRADS 3, 4, 5, 6 (Diagnostic)

In [34]:
b0_3456dx = b0_dx_3mo.loc[b0_dx_3mo.asses_dx.isin(["P", "S", "M", "K"])].copy()
get_stats(b0_3456dx)
get_image_stats(b0_3456dx)

DF shape: (18343, 21)
# Patients: 11177
# Cases: 11710

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 28625
# ROI: 7214
0.0    22146
1.0     5817
2.0      591
3.0       69
4.0        2
Name: num_roi, dtype: int64


## 6. Negative group

In [35]:
# Negative group = BIRADS_12 + BIRADS_0_12dx
neg_group = pd.concat([b12, b0_12dx])
neg_group.drop_duplicates(inplace=True)

get_stats(neg_group)
get_image_stats(neg_group)

DF shape: (545663, 21)
# Patients: 100783
# Cases: 289024

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 1216280
# ROI: 9382
0.0    1207956
1.0       7343
2.0        905
3.0         75
4.0          1
Name: num_roi, dtype: int64


In [36]:
# Include only ones with negative follow-up after 1 year
neg_group_b12 = pd.merge(neg_group, b12, on=["empi_anon"], suffixes=(None, "_1yrfu"))

neg_group_b12 = neg_group_b12.loc[
    (neg_group_b12.side==neg_group_b12.side_1yrfu)
]

neg_group_b12["delta_date_1yrfu"] = (neg_group_b12.study_date_anon_1yrfu - neg_group_b12.study_date_anon).dt.days

get_stats(neg_group_b12)
get_image_stats(neg_group_b12)

neg_group_b12.sample(2)

DF shape: (2329305, 32)
# Patients: 99792
# Cases: 288000

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 1206833
# ROI: 6851
0.0    1200713
1.0       5437
2.0        636
3.0         46
4.0          1
Name: num_roi, dtype: int64


Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu
1251954,34595437,3721150955690973,2013-08-08,MG Screening Bilateral w/CAD,R,N,,,,,B,,NaT,,,,,,,,,4047169165502113,2016-09-02,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,1121
3488116,78037395,7295645242502555,2019-08-22,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,7295645242502555,2019-08-22,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,0


In [37]:
neg_group_1yrfu = neg_group_b12.loc[
    (neg_group_b12.delta_date_1yrfu > 360)
]
get_stats(neg_group_1yrfu)
get_image_stats(neg_group_1yrfu)

DF shape: (892822, 32)
# Patients: 60473
# Cases: 185352

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 813568
# ROI: 4213
0.0    809806
1.0      3348
2.0       378
3.0        35
4.0         1
Name: num_roi, dtype: int64


In [38]:
neg_group_1yrfu_first_study = neg_group_1yrfu.sort_values(["empi_anon", "acc_anon", "study_date_anon_1yrfu"]).drop_duplicates(subset=["acc_anon", "side"]) # to only get the first followup study
get_stats(neg_group_1yrfu_first_study)
get_image_stats(neg_group_1yrfu_first_study)

DF shape: (346327, 32)
# Patients: 60473
# Cases: 185352

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 813568
# ROI: 4213
0.0    809806
1.0      3348
2.0       378
3.0        35
4.0         1
Name: num_roi, dtype: int64


In [39]:
neg_group_1yrfu_first_study.path_severity.value_counts()

4.0    172
2.0     52
0.0     37
1.0     13
3.0      2
5.0      2
Name: path_severity, dtype: int64

In [40]:
# Exclude any patient with any biopsy result
neg_group_1yrfu_first_study_no_biopsy = neg_group_1yrfu_first_study.loc[neg_group_1yrfu_first_study.path_severity.isna()].copy()

In [41]:
# Merging with METADATA to get the images
neg_group_1yrfu_first_study_no_biopsy_images = pd.merge(neg_group_1yrfu_first_study_no_biopsy, meta_2d, on=["empi_anon", "acc_anon", "study_date_anon"])
neg_group_1yrfu_first_study_no_biopsy_images = neg_group_1yrfu_first_study_no_biopsy_images.loc[
    (neg_group_1yrfu_first_study_no_biopsy_images.side == neg_group_1yrfu_first_study_no_biopsy_images.ImageLateralityFinal)
]
neg_group_1yrfu_first_study_no_biopsy_images.drop_duplicates(subset="png_path", inplace=True)
get_stats(neg_group_1yrfu_first_study_no_biopsy_images)

DF shape: (807779, 40)
# Patients: 58261
# Cases: 178934

# Images: 807779



In [42]:
print(f"ROIs = {neg_group_1yrfu_first_study_no_biopsy_images.num_roi.sum()}")
print(neg_group_1yrfu_first_study_no_biopsy_images.num_roi.value_counts())

ROIs = 4089
0    804117
1      3269
2       360
3        32
4         1
Name: num_roi, dtype: int64


## 7. Positive Group

In [43]:
pos_group_images = pd.merge(b0_3456dx, meta_2d, on=["empi_anon", "acc_anon", "study_date_anon"])
pos_group_images = pos_group_images.loc[
    (pos_group_images.side == pos_group_images.ImageLateralityFinal)
]
pos_group_images.drop_duplicates(subset="png_path", inplace=True)
get_stats(pos_group_images)

DF shape: (28496, 29)
# Patients: 10557
# Cases: 11022

# Images: 28496



In [44]:
print(f"ROIs  = {pos_group_images.num_roi.sum()}")
print(pos_group_images.num_roi.value_counts())

ROIs  = 7204
0    22025
1     5811
2      589
3       69
4        2
Name: num_roi, dtype: int64


## 8. Excluding Images from the Negative Group that are found in the Positive Group using acc_anon and side

In [45]:
# Merge negatives and positive groups
neg_pos = pd.merge(neg_group_1yrfu_first_study_no_biopsy_images, pos_group_images, on=["empi_anon", "acc_anon", "side"], suffixes=["_neg", "_pos"])
neg_pos.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon_neg,desc_neg,side,asses_neg,path_severity_neg,bside_neg,procdate_anon_neg,pdate_anon_neg,exam_laterality_neg,acc_anon_dx_neg,study_date_anon_dx_neg,desc_dx_neg,side_dx_neg,asses_dx_neg,path_severity_dx_neg,bside_dx_neg,procdate_anon_dx_neg,pdate_anon_dx_neg,delta_date_dx_neg,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal_neg,ViewPosition_neg,FinalImageType_neg,png_path_neg,StudyDescription_neg,match_level_neg,num_roi_neg,ROI_coords_neg,study_date_anon_pos,desc_pos,asses_pos,path_severity_pos,bside_pos,procdate_anon_pos,pdate_anon_pos,exam_laterality_pos,acc_anon_dx_pos,study_date_anon_dx_pos,desc_dx_pos,side_dx_pos,asses_dx_pos,path_severity_dx_pos,bside_dx_pos,procdate_anon_dx_pos,pdate_anon_dx_pos,delta_date_dx_pos,ImageLateralityFinal_pos,ViewPosition_pos,FinalImageType_pos,png_path_pos,StudyDescription_pos,match_level_pos,num_roi_pos,ROI_coords_pos
760,34789855,6476608733007008,2014-04-01,MG Screening Left w/Tomo/CAD,L,A,,,,,L,8993781340339809.0,2014-04-21,MG Diagnostic Left w/CAD,L,B,,,,,20.0,9992904429283646,2016-07-13,MG Screening Left w/Tomo/CAD,L,N,,,,,L,834,L,CC,2D,/data/mammo/png/cohort_9/extracted-images/f500...,MG Screening Left w/Tomo/CAD,[],0,(),2014-04-01,MG Screening Left w/Tomo/CAD,A,,,,,L,8993781340339809,2014-04-21,MG Diagnostic Left w/CAD,L,P,,,,,20,L,MLO,2D,/data/mammo/png/cohort_9/extracted-images/f500...,MG Screening Left w/Tomo/CAD,[],0,()
360,19703500,8975983036689789,2014-12-15,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,B,,,,,B,,NaT,,,,,,,,,4273910518306474,2015-12-19,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,B,,,,,B,369,L,MLO,2D,/data/mammo/png/cohort_1/extracted-images/52f9...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[],0,(),2014-12-15,MG Screen Bilat w/Tomo/CAD Stnd Protocol,A,4.0,L,2014-12-25,2014-12-26 00:00:00,B,3343455609167761,2014-12-20,MG Diagnostic Left w/CAD,L,S,4.0,L,2014-12-25,2014-12-26 00:00:00,5,L,CC,2D,/data/mammo/png/cohort_1/extracted-images/52f9...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[],0,()


In [46]:
# Create new KeyID of acc_anon + side on negative group and negative+positive group
neg_pos["acc_anon_side"] = neg_pos.acc_anon + neg_pos.side
neg_pos.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon_neg,desc_neg,side,asses_neg,path_severity_neg,bside_neg,procdate_anon_neg,pdate_anon_neg,exam_laterality_neg,acc_anon_dx_neg,study_date_anon_dx_neg,desc_dx_neg,side_dx_neg,asses_dx_neg,path_severity_dx_neg,bside_dx_neg,procdate_anon_dx_neg,pdate_anon_dx_neg,delta_date_dx_neg,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal_neg,ViewPosition_neg,FinalImageType_neg,png_path_neg,StudyDescription_neg,match_level_neg,num_roi_neg,ROI_coords_neg,study_date_anon_pos,desc_pos,asses_pos,path_severity_pos,bside_pos,procdate_anon_pos,pdate_anon_pos,exam_laterality_pos,acc_anon_dx_pos,study_date_anon_dx_pos,desc_dx_pos,side_dx_pos,asses_dx_pos,path_severity_dx_pos,bside_dx_pos,procdate_anon_dx_pos,pdate_anon_dx_pos,delta_date_dx_pos,ImageLateralityFinal_pos,ViewPosition_pos,FinalImageType_pos,png_path_pos,StudyDescription_pos,match_level_pos,num_roi_pos,ROI_coords_pos,acc_anon_side
1043,43961961,5243068495140390,2013-02-05,MG Screening Bilateral w/CAD,L,A,,,,,B,1335360084337623,2013-03-04,MG Diagnostic Left,L,B,,,,,27.0,4149785248271308,2016-04-30,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,1180,L,MLO,2D,/data/mammo/png/cohort_5/extracted-images/527f...,MG Screening Bilateral w/CAD,[],0,(),2013-02-05,MG Screening Bilateral w/CAD,A,,,,,B,7485410784607630,2013-03-02,MG Diagnostic Mammo Bilateral,L,S,,,,,25,L,CC,2D,/data/mammo/png/cohort_5/extracted-images/527f...,MG Screening Bilateral w/CAD,[],0,(),5243068495140390L
399,22680085,9748736436676015,2016-07-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,A,,,,,B,9677750844499584,2016-07-31,MG Diagnostic Left,L,B,,,,,6.0,3326862674671184,2019-09-30,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,1162,L,MLO,2D,/data/mammo/png/cohort_10/extracted-images/533...,MG Screening Digital BL w/Tomo/CAD,[],0,(),2016-07-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,A,,,,,B,9677750844499584,2016-07-31,MG Diagnostic Left,L,P,,,,,6,L,CC,2D,/data/mammo/png/cohort_10/extracted-images/533...,MG Screening Digital BL w/Tomo/CAD,[],0,(),9748736436676015L


In [47]:
neg_group_1yrfu_first_study_no_biopsy_images["acc_anon_side"] = neg_group_1yrfu_first_study_no_biopsy_images.acc_anon + neg_group_1yrfu_first_study_no_biopsy_images.side
neg_group_1yrfu_first_study_no_biopsy_images.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal,ViewPosition,FinalImageType,png_path,StudyDescription,match_level,num_roi,ROI_coords,acc_anon_side
885604,59718570,2415163519747488,2014-06-17,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,,NaT,,,,,,,,,6969519729345303,2015-06-17,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,365,L,CC,2D,/data/mammo/png/cohort_7/extracted-images/5867...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[],0,(),2415163519747488L
1279860,81763266,2409705155300596,2018-12-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,,NaT,,,,,,,,,2090480540250219,2020-05-17,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,507,L,CC,2D,/data/mammo/png/cohort_8/extracted-images/8be0...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[],0,(),2409705155300596L


In [48]:
# Removing any images that are found in the positive group from the negative group using the created KeyID (acc_anon+side)
neg_group_final = neg_group_1yrfu_first_study_no_biopsy_images.loc[~neg_group_1yrfu_first_study_no_biopsy_images.acc_anon_side.isin(neg_pos.acc_anon_side)]
neg_group_final.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal,ViewPosition,FinalImageType,png_path,StudyDescription,match_level,num_roi,ROI_coords,acc_anon_side
1043917,68580524,8507037717396147,2016-12-11,MG Screening Bilateral,L,N,,,,,B,,NaT,,,,,,,,,9344738692043744,2018-03-04,MG Screening Bilateral w/CAD,L,N,,,,,B,448,L,MLO,2D,/data/mammo/png/cohort_3/extracted-images/5940...,MG Screening Bilateral,[],0,(),8507037717396147L
67272,13784788,9704448443602447,2019-10-23,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,A,,,,,B,4035300999545663.0,2019-11-04,MG Diagnostic Right w/CAD,R,B,,,,,12.0,5935121117587892,2021-03-01,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,495,R,MLO,2D,/data/mammo/png/cohort_3/extracted-images/dac3...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,[1],1,"((1447, 171, 1905, 501),)",9704448443602447R


In [49]:
get_stats(neg_group_final)

print(f"ROIs  = {neg_group_final.num_roi.sum()}")
print(neg_group_final.num_roi.value_counts())

DF shape: (806676, 41)
# Patients: 58233
# Cases: 178854

# Images: 806676

ROIs  = 3878
0    803184
1      3136
2       327
3        28
4         1
Name: num_roi, dtype: int64


## 9. Saving and Exporting

In [73]:
columns_to_save = [
    'empi_anon',
    'acc_anon',
    'desc',
    'asses',
    'asses_dx',
    'path_severity',
    'study_date_anon',
    'study_date_anon_dx',
    'side',
    'ImageLateralityFinal',
    'bside',
    'ViewPosition',
    'match_level',
    'num_roi',
    'ROI_coords',
    'png_path',
]

In [75]:
break
neg_group_final[columns_to_save].to_csv("NEGATIVE_GROUP.csv", index=False)

In [77]:
break
pos_group_images[columns_to_save].to_csv("POSITIVE_GROUP.csv", index=False)

SyntaxError: 'break' outside loop (2687054856.py, line 1)

# END