In [1]:
!pip install iterative-stratification



In [2]:
import pandas as pd
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
train_labels = pd.read_csv("/home/ubuntu/datasets/human-protein-atlas/train.csv")
train_labels.shape, train_labels.head()

((31072, 2),
                                      Id   Target
 0  00070df0-bbc3-11e8-b2bc-ac1f6b6435d0     16 0
 1  000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0  7 1 2 0
 2  000a9596-bbc4-11e8-b2bc-ac1f6b6435d0        5
 3  000c99ba-bba4-11e8-b2b9-ac1f6b6435d0        1
 4  001838f8-bbca-11e8-b2bc-ac1f6b6435d0       18)

In [4]:
train_labels_ex = pd.read_csv("/home/ubuntu/datasets/human-protein-atlas/external_data_raw/train.csv")
train_labels_ex.shape,train_labels_ex.head()

((74606, 2),
                 Id        Target
 0  10580_1610_C1_1  13 25 0 2 21
 1  10580_1610_C1_2  13 25 0 2 21
 2  10580_1756_B1_1  13 25 0 2 21
 3  10580_1756_B1_2  13 25 0 2 21
 4  10580_1758_B1_1  13 25 0 2 21)

In [5]:
train_df = pd.concat([train_labels, train_labels_ex])

In [6]:
train_df.reset_index(drop=True, inplace=True)

In [7]:
train_df.shape

(105678, 2)

In [8]:
train_df.head(),train_df.tail()

(                                     Id   Target
 0  00070df0-bbc3-11e8-b2bc-ac1f6b6435d0     16 0
 1  000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0  7 1 2 0
 2  000a9596-bbc4-11e8-b2bc-ac1f6b6435d0        5
 3  000c99ba-bba4-11e8-b2b9-ac1f6b6435d0        1
 4  001838f8-bbca-11e8-b2bc-ac1f6b6435d0       18,
                       Id Target
 105673  49564_1125_A10_2      3
 105674   49564_1226_D8_1      3
 105675   49564_1226_D8_3      3
 105676   49564_1055_B2_1      3
 105677   49564_1055_B2_2      3)

In [11]:
train_df = train_df.drop_duplicates(subset='Id', keep=False)
train_df.shape

(105678, 2)

In [40]:
label_names = {
    0:  "Nucleoplasmn",  
    1:  "Nuclear membrane",   
    2:  "Nucleoli",   
    3:  "Nucleoli fibrillar center",   
    4:  "Nuclear speckles",
    5:  "Nuclear bodies",   
    6:  "Endoplasmic reticulum",   
    7:  "Golgi apparatus",   
    8:  "Peroxisomes",   
    9:  "Endosomes",   
    10:  "Lysosomes", 
    11:  "Intermediate filaments",   
    12:  "Actin filaments",   
    13:  "Focal adhesion sites",   
    14:  "Microtubules",   
    15:  "Microtubule ends",   
    16:  "Cytokinetic bridge",   
    17:  "Mitotic spindle",   
    18:  "Microtubule organizing center",   
    19:  "Centrosome",   
    20:  "Lipid droplets",   
    21:  "Plasma membrane",   
    22:  "Cell junctions",   
    23:  "Mitochondria",   
    24:  "Aggresome",   
    25:  "Cytosol",   
    26:  "Cytoplasmic bodies",   
    27:  "Rods & rings"
}

In [41]:
reverse_train_labels = dict((v,k) for k,v in label_names.items())

def fill_targets(row):
    row.Target = np.array(row.Target.split(" ")).astype(np.int)
    for num in row.Target:
        name = label_names[int(num)]
        row.loc[name] = 1
    return row

In [42]:
for key in label_names.keys():
    train_df[label_names[key]] = 0

train_df = train_df.apply(fill_targets, axis=1)

In [44]:
train_df.head()

Unnamed: 0,Id,Target,Nucleoplasmn,Nuclear membrane,Nucleoli,Nucleoli fibrillar center,Nuclear speckles,Nuclear bodies,Endoplasmic reticulum,Golgi apparatus,...,Microtubule organizing center,Centrosome,Lipid droplets,Plasma membrane,Cell junctions,Mitochondria,Aggresome,Cytosol,Cytoplasmic bodies,Rods & rings
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,"[16, 0]",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,"[7, 1, 2, 0]",1,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,[5],0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,[1],0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,[18],0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [45]:
train_df.shape

(105678, 30)

In [46]:
train_df.loc[:,"kfold"] = -1

In [47]:
train_df.columns

Index(['Id', 'Target', 'Nucleoplasmn', 'Nuclear membrane', 'Nucleoli',
       'Nucleoli fibrillar center', 'Nuclear speckles', 'Nuclear bodies',
       'Endoplasmic reticulum', 'Golgi apparatus', 'Peroxisomes', 'Endosomes',
       'Lysosomes', 'Intermediate filaments', 'Actin filaments',
       'Focal adhesion sites', 'Microtubules', 'Microtubule ends',
       'Cytokinetic bridge', 'Mitotic spindle',
       'Microtubule organizing center', 'Centrosome', 'Lipid droplets',
       'Plasma membrane', 'Cell junctions', 'Mitochondria', 'Aggresome',
       'Cytosol', 'Cytoplasmic bodies', 'Rods & rings', 'kfold'],
      dtype='object')

In [48]:
# random shuffle with fraction = 1 then reset the index
train_labels = train_df.sample(frac=1).reset_index(drop=True)

X = train_labels.Id.values
#y = df[["grapheme_root","vowel_diacritic","consonant_diacritic"]].values
y = train_labels.iloc[:,2:].values

# define how many folds
mskf = MultilabelStratifiedKFold(n_splits=5)

In [49]:
X[:5], y[:5]

(array(['28815_1478_D12_4', '28656_251_D12_2', '53738_986_C2_1',
        '59977_1105_A8_8', '26745_si23_G4_4'], dtype=object),
 array([[ 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0, -1],
        [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1],
        [ 0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1],
        [ 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0, -1],
        [ 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1]]))

In [50]:
# Let's go through the iterator and get train and val indices
for fold, (trn_, val_) in enumerate(mskf.split(X, y)):
    print("TRAIN: ", trn_, "VALID : ",val_)
    train_labels.loc[val_, "kfold"] = fold

TRAIN:  [     0      1      2 ... 105675 105676 105677] VALID :  [     5      6     19 ... 105650 105657 105666]
TRAIN:  [     0      2      3 ... 105675 105676 105677] VALID :  [     1     15     17 ... 105659 105663 105664]
TRAIN:  [     0      1      3 ... 105672 105673 105676] VALID :  [     2      9     10 ... 105674 105675 105677]
TRAIN:  [     0      1      2 ... 105675 105676 105677] VALID :  [     7      8     14 ... 105669 105671 105672]
TRAIN:  [     1      2      5 ... 105674 105675 105677] VALID :  [     0      3      4 ... 105660 105673 105676]


In [51]:
train_labels["kfold"]

0         4
1         1
2         2
3         4
4         4
         ..
105673    4
105674    2
105675    2
105676    4
105677    2
Name: kfold, Length: 105678, dtype: int64

In [52]:
train_labels["kfold"].value_counts()

4    21136
2    21136
1    21136
3    21135
0    21135
Name: kfold, dtype: int64

Let's check a few values to ensure the splitting went as expected

In [53]:
# Common category
train_labels.groupby('kfold')['Nucleoplasmn'].sum()

kfold
0    8311
1    8213
2    8171
3    8138
4    8125
Name: Nucleoplasmn, dtype: int64

In [54]:
train_labels.groupby('kfold')['Nucleoli'].sum()

kfold
0    2174
1    2174
2    2174
3    2174
4    2175
Name: Nucleoli, dtype: int64

In [55]:
# Rare category
train_labels.groupby('kfold')['Peroxisomes'].sum()

kfold
0    44
1    43
2    43
3    43
4    44
Name: Peroxisomes, dtype: int64

In [56]:
# Rare category
train_labels.groupby('kfold')['Aggresome'].sum()

kfold
0    86
1    86
2    85
3    86
4    85
Name: Aggresome, dtype: int64

__^ Looks like the classes got equally distributed in all folds as expected.__

In [57]:
train_labels.to_csv("../input/train_folds_combined_data.csv", index=False)