In [10]:
import os
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold

In [7]:
def undersampling(df, times: float, random_state: int = 9990):
    """Down-sample the negative (``cancer == 0``) rows of ``df``.

    Every positive (``cancer == 1``) row is kept. ``int(n_pos * times)``
    values are drawn without replacement from the ``patient_view`` column of
    the negative rows, and every negative row whose ``patient_view`` was drawn
    is kept.

    NOTE: the draw is over *rows*, not over unique ``patient_view`` keys, so
    the same key can be drawn more than once and each drawn key brings along
    all of its rows. The number of negatives kept is therefore generally not
    exactly ``n_pos * times``.

    Args:
        df: DataFrame with at least the columns ``cancer`` and
            ``patient_view``.
        times: Multiplier applied to the number of positive rows to get the
            number of negative draws.
        random_state: Seed used both for drawing the negatives and for the
            final shuffle, so the result is reproducible.

    Returns:
        A new DataFrame holding the positives and the selected negatives,
        shuffled, with a fresh ``0..n-1`` index.
    """
    dfcancer = df[df["cancer"] == 1]
    dfnocancer = df[df["cancer"] == 0]

    # sample() without replacement raises ValueError when asked for more rows
    # than exist, so cap the request at the number of available negatives.
    n_draws = min(int(len(dfcancer) * times), len(dfnocancer))
    patient_views_sampled = dfnocancer["patient_view"].sample(
        n=n_draws, random_state=random_state
    )
    dfnocancer = dfnocancer[
        dfnocancer["patient_view"].isin(patient_views_sampled.unique())
    ]

    # Seed the shuffle as well; otherwise row order depends on whatever state
    # the global NumPy RNG happens to be in when this is called.
    return (
        pd.concat([dfcancer, dfnocancer])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

In [12]:
# Load the training metadata and add a `patient_view` key made of the patient
# id and the laterality (e.g. "9989_L"); it is used later as the grouping key.
df = (
    pd.read_csv("/home/data4/share/rsna-breast-cancer-detection/train.csv")
    .assign(
        patient_view=lambda frame: frame['patient_id'].astype(str) + '_' + frame['laterality']
    )
)
df.tail()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,patient_view
54701,1,9973,1729524723,R,MLO,43.0,0,0,0,1.0,0,C,49,False,9973_R
54702,1,9989,63473691,L,MLO,60.0,0,0,0,,0,C,216,False,9989_L
54703,1,9989,1078943060,L,CC,60.0,0,0,0,,0,C,216,False,9989_L
54704,1,9989,398038886,R,MLO,60.0,0,0,0,0.0,0,C,216,True,9989_R
54705,1,9989,439796429,R,CC,60.0,0,0,0,0.0,0,C,216,True,9989_R


In [14]:
# Seed Python's and NumPy's global RNGs so that any sampling relying on the
# global random state is repeatable from a fresh kernel.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Number of negative draws per positive row passed to undersampling().
UNDERSAMPLE_TIMES = 4.4

# X is a dummy placeholder feature matrix; the labels and the group keys are
# what the splitter receives for stratification and grouping.
X = np.ones((len(df), 2))
y = df["cancer"].values
groups = df["patient_view"].values

# NOTE(review): groups are per patient_view (patient id + laterality), so the
# left and right side of the same patient can end up in different folds.
# Confirm this is intended; group on patient_id instead if patient-level
# separation between train and validation is required.
sgkf = StratifiedGroupKFold(n_splits=4)

# to_pickle() does not create missing directories.
os.makedirs('df', exist_ok=True)

train_indices, val_indices = [], []
for i, (train_index, val_index) in enumerate(sgkf.split(X, y, groups)):
    train_indices.append(train_index)
    val_indices.append(val_index)
    print(f"Fold {i}:")
    print(f"  Train:      index={train_index}")
    print(f"              group={groups[train_index]}")
    print(f"  Validation: index={val_index}")
    print(f"              group={groups[val_index]}")

    # Under-sample the negatives of each side of the split independently.
    train_set = df.iloc[train_index]
    val_set = df.iloc[val_index]
    train_set_us = undersampling(train_set, UNDERSAMPLE_TIMES)
    val_set_us = undersampling(val_set, UNDERSAMPLE_TIMES)

    # Persist the per-fold frames as pickle files.
    train_set_us.to_pickle(f'df/train_f{i}_test.pkl')
    val_set_us.to_pickle(f'df/val_f{i}_test.pkl')

    # Report positive / negative row counts after under-sampling.
    print(int((train_set_us["cancer"] == 1).sum()), int((train_set_us["cancer"] == 0).sum()))
    print(int((val_set_us["cancer"] == 1).sum()), int((val_set_us["cancer"] == 0).sum()))
    print()

Fold 0:
  Train:      index=[    0     1     4 ... 54703 54704 54705]
              group=['10006_L' '10006_L' '10011_L' ... '9989_L' '9989_R' '9989_R']
  Validation: index=[    2     3    10 ... 54691 54694 54695]
              group=['10006_R' '10006_R' '10025_R' ... '9968_L' '997_L' '997_L']
868 8708
290 2869

Fold 1:
  Train:      index=[    0     1     2 ... 54703 54704 54705]
              group=['10006_L' '10006_L' '10006_R' ... '9989_L' '9989_R' '9989_R']
  Validation: index=[    4     5    12 ... 54693 54696 54697]
              group=['10011_L' '10011_L' '10038_L' ... '9968_R' '997_R' '997_R']
868 8639
290 2893

Fold 2:
  Train:      index=[    2     3     4 ... 54701 54704 54705]
              group=['10006_R' '10006_R' '10011_L' ... '9973_R' '9989_R' '9989_R']
  Validation: index=[    0     1     6 ... 54699 54702 54703]
              group=['10006_L' '10006_L' '10011_R' ... '9973_L' '9989_L' '9989_L']
869 8705
289 2886

Fold 3:
  Train:      index=[    0     1     2 ... 54