# Preprocessing - Data Labeling

Load in the csv files prepared in notebook 1.1 and create text labels for multilabel classification.

### Negative images

Negatively labeled images (the absence of a label) were included in the `CSVs` files of the plaquebox paper; however, they were not used in the classifier. As the decision boundary between a positive label and a negative label can be close in some cases, I experimented with including the negative label as a predictive class in the classifier.

### Train validation split

To better evaluate the model performance I increased the validation set size to 20% from the original ~12.3% (8630 / 70000). Train and validation were combined in this notebook and randomly split in the subsequent notebook.

In [4]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.vision import *
from fastai.metrics import error_rate
import os 

In [5]:
# --- Notebook configuration -------------------------------------------------
batch_size = 64

# Absolute location of the data disk; image tiles and models live below it.
data_dir = os.path.join('/mnt', 'disks', 'disk-1', 'data')
# The label CSVs are read from a path relative to the working directory.
csv_dir = os.path.join('data', 'CSVs')
models_dir = os.path.join(data_dir, 'models')

# One CSV of tile-level annotations per dataset split.
csv_path = dict(
    train=os.path.join(csv_dir, 'tiles_train.csv'),
    validation=os.path.join(csv_dir, 'tiles_validation.csv'),
    test=os.path.join(csv_dir, 'tiles_test.csv'),
)

# Tile image folders; the test path has one extra nested 'tiles' level.
img_path = os.path.join(data_dir, 'tiles')
img_path_test = os.path.join(img_path, 'tiles')

# Columns that are turned into text labels further down.
image_classes = ['cored', 'diffuse', 'CAA']

In [6]:
def _load_tiles_csv(split):
    """Read one split's tile CSV, print its shape and show the first two rows.

    Parameters
    ----------
    split : str
        Key into the `csv_path` mapping from the configuration cell
        ('train', 'validation' or 'test').

    Returns
    -------
    pandas.DataFrame
        The contents of the CSV file for that split.
    """
    path = csv_path[split]
    # Report the file stem (e.g. 'tiles_train') so the log matches the file on disk.
    print('loading {}'.format(os.path.splitext(os.path.basename(path))[0]))
    frame = pd.read_csv(path)
    print(frame.shape)
    display(frame.head(2))
    return frame


# Bind each split to an explicit, greppable name instead of injecting
# variables into the module namespace dynamically.
tiles_train = _load_tiles_csv('train')
tiles_validation = _load_tiles_csv('validation')
tiles_test = _load_tiles_csv('test')

loading tiles_train
(155239, 8)


Unnamed: 0,id,imagename,cored,diffuse,CAA,negative,flag,notsure
0,10748,NA4757-02_AB/NA4757-02_AB_18_25_61.jpg,1.0,0.0,0.0,0.0,0.0,0.0
1,29503,NA4918-02_AB17-24/NA4918-02_AB17-24_9_18_12.jpg,0.0,2.832462,0.0,0.0,0.0,0.0


loading tiles_validation
(8630, 8)


Unnamed: 0,id,imagename,cored,diffuse,CAA,negative,flag,notsure
0,0,NA_4896_02_AB17-24/neg_NA_4896_02_AB17-24_0_10...,0.0,0.0,0.0,1.0,0.0,0.0
1,0,NA_4896_02_AB17-24/neg_NA_4896_02_AB17-24_0_18...,0.0,0.0,0.0,1.0,0.0,0.0


loading tiles_test
(10873, 8)


Unnamed: 0,id,imagename,cored,diffuse,CAA,negative,flag,notsure
0,14,NA4053-02_AB/NA4053-02_AB_18_18.jpg_0.jpg,0.0,3.003472,0.0,0.0,0.28125,1.0
1,15,NA4053-02_AB/NA4053-02_AB_18_18.jpg_1.jpg,0.0,5.214052,0.0,0.0,2.0,1.0


In [7]:
# Keep only the image name and the per-class score columns.
label_columns = ['imagename'] + image_classes

# Pool the original train and validation splits. `ignore_index=True` gives the
# pooled frame a fresh 0..n-1 index; without it both splits keep their own
# labels starting at 0, so index labels would be duplicated. `.copy()` makes the
# column slice an independent frame, since its columns are overwritten later.
train_n_validation = pd.concat([tiles_train, tiles_validation],
                               ignore_index=True)[label_columns].copy()
assert len(train_n_validation) == len(tiles_train) + len(tiles_validation), \
    'row count changed while pooling train and validation'

test = tiles_test[label_columns].copy()

The threshold used for labeling is the same as the original dataset. Lowering the threshold could create more observations.

In [8]:
threshold = .99

# Replace every class score by the class name when it exceeds the threshold,
# and by an empty string otherwise. Both frames are handled the same way.
for frame in (train_n_validation, test):
    for class_name in image_classes:
        frame[class_name] = frame[class_name].map(
            lambda score, name=class_name: name if score > threshold else '')
    

In [9]:
def create_text_label(df,
                      classes=(),
                      label_col_name='label',
                      negative_class_label='negative'):
    """Add a space-separated text label column built from per-class columns.

    For every row, the truthy values found in the `classes` columns are joined
    with a single space. Rows where nothing remains after stripping whitespace
    receive `negative_class_label` instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per entry of `classes`; each cell is expected
        to contain either the class name or an empty string.
    classes : sequence of str, optional
        Names of the columns to combine, in the order they should appear in the
        label. With no classes, every row gets `negative_class_label`.
    label_col_name : str, optional
        Name of the column that receives the text label.
    negative_class_label : str, optional
        Label used for rows in which no class is present.

    Returns
    -------
    pandas.DataFrame
        The same `df` object; the label column is added in place.

    Raises
    ------
    TypeError
        If a truthy cell in one of the class columns is not a string.
    """
    # Work on plain Python lists rather than a row-wise `DataFrame.apply`, so
    # an empty frame or an empty `classes` sequence is handled correctly.
    column_values = [df[class_name].tolist() for class_name in classes]

    labels = []
    for row_pos in range(len(df)):
        present = [values[row_pos] for values in column_values if values[row_pos]]
        text = ' '.join(present).strip()
        # An empty result means no class was present in this row.
        labels.append(text if text else negative_class_label)

    # Assigning a list is positional, so duplicated index labels are harmless.
    df[label_col_name] = labels
    return df

# Build the text label for each frame, then discard the per-class columns that
# were only needed to construct it.
train_n_validation = create_text_label(
    train_n_validation, classes=image_classes).drop(columns=image_classes)
test = create_text_label(test, classes=image_classes).drop(columns=image_classes)

# Preview the first two rows of each labelled frame.
for labelled_frame in (train_n_validation, test):
    display(labelled_frame.head(2))

Unnamed: 0,imagename,label
0,NA4757-02_AB/NA4757-02_AB_18_25_61.jpg,cored
1,NA4918-02_AB17-24/NA4918-02_AB17-24_9_18_12.jpg,diffuse


Unnamed: 0,imagename,label
0,NA4053-02_AB/NA4053-02_AB_18_18.jpg_0.jpg,diffuse
1,NA4053-02_AB/NA4053-02_AB_18_18.jpg_1.jpg,diffuse


In [15]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

# Use plotly's "iframe" renderer as the default for figures shown below.
pio.renderers.default = "iframe"

# Compare the label distribution of the pooled train/validation data with the
# test data: one histogram per row, sharing the x axis so categories line up.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                    subplot_titles=('train + validation', 'test'))

panels = (('train', train_n_validation), ('test', test))
for row, (trace_name, frame) in enumerate(panels, start=1):
    # `add_trace(..., row=, col=)` replaces the deprecated `append_trace`.
    fig.add_trace(go.Histogram(x=frame['label'],
                               histnorm='percent',
                               name=trace_name),
                  row=row, col=1)
    fig.update_yaxes(title_text='% of tiles', row=row, col=1)

fig.update_xaxes(title_text='label', row=2, col=1)
fig.update_layout(title_text='Label distribution by split')

fig.show()

In [11]:
# Write the labelled frames next to the input CSVs, without the index column.
outputs = (
    (test, 'test_multilabel.csv'),
    (train_n_validation, 'train_n_validation_multilabel.csv'),
)
for labelled_frame, file_name in outputs:
    labelled_frame.to_csv(os.path.join(csv_dir, file_name), index=False)