# Preprocessing - Data Labeling

Load the CSV files prepared in notebook 1.1 and create text labels for multilabel classification.

### Train validation split

To better evaluate the model performance I increased the validation set size to 20% from the original ~12.3% (8630 / 70000). Train and validation were combined in this notebook and randomly split in the subsequent notebook.

### Upsampling

To improve the performance of the minority classes, I'll try upsampling them.

In [61]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.vision import *
from fastai.metrics import error_rate
import os 

import config as cfg

In [62]:
# Handle on this notebook's own module object, so that every table read below
# can be published as a top-level variable named after its CSV file.
current_module = sys.modules[__name__]
csvs = ['tiles_train', 'tiles_validation', 'tiles_test']

for csv in csvs:
    print('loading {}'.format(csv))

    # Read <csv>.csv from the data/CSVs folder.
    csv_path = os.path.join('data', 'CSVs', '{}.csv'.format(csv))
    df = pd.read_csv(csv_path)

    # Expose the frame under the same name as the file (e.g. `tiles_train`).
    df_name = csv
    setattr(current_module, df_name, df)

    # Quick sanity check: dimensions and the first two rows.
    print(df.shape)
    display(df.head(2))

loading tiles_train
(155239, 8)


Unnamed: 0,id,imagename,cored,diffuse,CAA,negative,flag,notsure
0,10748,NA4757-02_AB/NA4757-02_AB_18_25_61.jpg,1.0,0.0,0.0,0.0,0.0,0.0
1,29503,NA4918-02_AB17-24/NA4918-02_AB17-24_9_18_12.jpg,0.0,2.832462,0.0,0.0,0.0,0.0


loading tiles_validation
(8630, 8)


Unnamed: 0,id,imagename,cored,diffuse,CAA,negative,flag,notsure
0,0,NA_4896_02_AB17-24/neg_NA_4896_02_AB17-24_0_10...,0.0,0.0,0.0,1.0,0.0,0.0
1,0,NA_4896_02_AB17-24/neg_NA_4896_02_AB17-24_0_18...,0.0,0.0,0.0,1.0,0.0,0.0


loading tiles_test
(10873, 8)


Unnamed: 0,id,imagename,cored,diffuse,CAA,negative,flag,notsure
0,14,NA4053-02_AB/NA4053-02_AB_18_18.jpg_0.jpg,0.0,3.003472,0.0,0.0,0.28125,1.0
1,15,NA4053-02_AB/NA4053-02_AB_18_18.jpg_1.jpg,0.0,5.214052,0.0,0.0,2.0,1.0


In [63]:
# Keep only the image name plus one score column per class listed in
# cfg.image_classes; .copy() detaches each selection from the loaded table.
selected_columns = ['imagename'] + cfg.image_classes
train, validation, test = (
    tiles[selected_columns].copy()
    for tiles in (tiles_train, tiles_validation, tiles_test)
)

The threshold used for labeling is the same as the original dataset. Lowering the threshold could create more observations.

In [64]:
threshold = .99

# Replace every class score with the class name when the score is strictly
# above the threshold, and with an empty string otherwise.
for split_df in (train, validation, test):
    for class_i in cfg.image_classes:
        split_df[class_i] = split_df[class_i].map(
            lambda score: class_i if score > threshold else ''
        )
    

In [65]:
def create_text_label(df,
                      classes=(),
                      label_col_name='label',
                      negative_class_label='negative'):
    """Add a space-separated text label column built from per-class columns.

    For every row, the non-empty values found in the ``classes`` columns are
    joined with a single space. Rows where nothing remains after stripping
    surrounding whitespace receive ``negative_class_label`` instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per entry of ``classes``. Each cell is
        expected to contain either a class-name string or an empty string.
        The frame is modified in place.
    classes : iterable of str
        Column names to combine, in the order they should appear in the
        label. When empty, every row is labelled ``negative_class_label``.
    label_col_name : str
        Name of the column that receives the label.
    negative_class_label : str
        Label used for rows with no class present.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``label_col_name``.
        Zero-row frames are supported and simply gain an empty label column.
    """
    class_columns = list(classes)

    if class_columns:
        # Walk the class columns in lockstep instead of calling
        # df.apply(..., axis=1): one tuple per row, no per-row Series, and it
        # yields an empty list (rather than a DataFrame) for a zero-row frame.
        row_values = zip(*(df[column].tolist() for column in class_columns))
        # filter(None, ...) drops the empty strings left for absent classes.
        joined = [' '.join(filter(None, values)).strip() for values in row_values]
    else:
        # No class columns to combine: every row has an empty label.
        joined = [''] * len(df)

    # An empty label means no class was present, i.e. the negative class.
    df[label_col_name] = [text if text else negative_class_label for text in joined]
    return df



In [66]:
# Build the text label for every split with identical settings.
label_kwargs = dict(classes=cfg.image_classes, negative_class_label='negative')
train = create_text_label(train, **label_kwargs)
validation = create_text_label(validation, **label_kwargs)
test = create_text_label(test, **label_kwargs)

# The per-class columns have been folded into the label column; drop them.
for labelled in (train, validation, test):
    labelled.drop(columns=cfg.image_classes, inplace=True)

# Preview the first two rows of each split.
display(train.head(2), validation.head(2), test.head(2))

Unnamed: 0,imagename,label
0,NA4757-02_AB/NA4757-02_AB_18_25_61.jpg,cored
1,NA4918-02_AB17-24/NA4918-02_AB17-24_9_18_12.jpg,diffuse


Unnamed: 0,imagename,label
0,NA_4896_02_AB17-24/neg_NA_4896_02_AB17-24_0_10...,negative
1,NA_4896_02_AB17-24/neg_NA_4896_02_AB17-24_0_18...,negative


Unnamed: 0,imagename,label
0,NA4053-02_AB/NA4053-02_AB_18_18.jpg_0.jpg,diffuse
1,NA4053-02_AB/NA4053-02_AB_18_18.jpg_1.jpg,diffuse


## Upsample cored and negative classes

In [74]:
# Fixed seed so the same rows are duplicated on every run; without it the
# up-sampled training set changes each time the notebook is executed.
UPSAMPLE_SEED = 42

# Extra copies of 40% of the rows whose label contains 'cored' ...
core_upsample = (train.loc[train.label.str.contains('cored')]
                 .sample(frac=0.4, random_state=UPSAMPLE_SEED))
# ... and one extra (shuffled) copy of every row whose label contains 'negative'.
neg_upsample = (train.loc[train.label.str.contains('negative')]
                .sample(frac=1, random_state=UPSAMPLE_SEED))

# Original rows first, duplicated rows appended after them.
train_upsampled = pd.concat([train, core_upsample, neg_upsample])

# No rows may be lost or invented by the concatenation.
assert (train.shape[0] + core_upsample.shape[0] + neg_upsample.shape[0]) == train_upsampled.shape[0]

In [75]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "iframe"

# Training rows with repeated image names collapsed to their first occurrence.
train_unique = train.drop_duplicates(subset=['imagename'], keep='first')

# (subplot row, legend name, frame) for every label distribution to compare.
# Validation and test are plotted as-is: no de-duplicated variant of them is
# built in this notebook, so the test trace reads from `test` directly
# (it previously referenced an undefined `test_unique`).
label_traces = [
    (1, 'train_unique', train_unique),
    (1, 'train_plaquebox-paper', train),
    (1, 'train_upsampled', train_upsampled),
    (2, 'validation', validation),
    (3, 'test', test),
]

fig = make_subplots(rows=3, cols=1, shared_xaxes=True)
for subplot_row, trace_name, frame in label_traces:
    # add_trace with row/col replaces the deprecated append_trace.
    fig.add_trace(go.Histogram(x=frame['label'],
                               histnorm='percent',
                               name=trace_name),
                  row=subplot_row, col=1)

fig.show()

In [76]:
# Write every labelled split into cfg.csv_dir, without the index column.
labelled_outputs = [
    ('train_multilabel.csv', train),
    ('train_upsampled_multilabel.csv', train_upsampled),
    ('validation_multilabel.csv', validation),
    ('test_multilabel.csv', test),
]
for file_name, labelled_df in labelled_outputs:
    labelled_df.to_csv(os.path.join(cfg.csv_dir, file_name), index=False)