In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
%matplotlib inline

## Load Data

**CBIS-DDSM Data (Abnormal Images)**

In [2]:
#train data
CBIS_train_patches = np.load(os.path.join("./Processed_abnorm_256", "abnormal_train_patch.npy" ))
CBIS_train_labels = np.load(os.path.join("./Processed_abnorm_256", "abnormal_train_Lbl.npy" ))
CBIS_train_FNs = np.load(os.path.join("./Processed_abnorm_256", "abnormal_train_FN.npy" ))

#test data
CBIS_test_patches = np.load(os.path.join("./Processed_abnorm_256", "abnormal_test_patch.npy" ))
CBIS_test_labels = np.load(os.path.join("./Processed_abnorm_256", "abnormal_test_Lbl.npy" ))
CBIS_test_FNs = np.load(os.path.join("./Processed_abnorm_256", "abnormal_test_FN.npy" ))



print("Abnaormal train Patches:", CBIS_train_patches.shape)
print("Abnaormal train Labels:", CBIS_train_labels.shape)
print("Abnaormal train File Names:", CBIS_train_FNs.shape)
print("\n")
print("Abnaormal test Patches:", CBIS_test_patches.shape)
print("Abnaormal test Labels:", CBIS_test_labels.shape)
print("Abnaormal test File Names:", CBIS_test_FNs.shape)

Abnaormal train Patches: (8286, 256, 256, 1)
Abnaormal train Labels: (8286,)
Abnaormal train File Names: (8286,)


Abnaormal test Patches: (1863, 256, 256, 1)
Abnaormal test Labels: (1863,)
Abnaormal test File Names: (1863,)


In [3]:
#combine train and test data 

CBIS_all_patches = np.concatenate([CBIS_train_patches, CBIS_test_patches], axis=0)
CBIS_all_labels = np.concatenate([CBIS_train_labels, CBIS_test_labels], axis=0)
CBIS_all_FNs = np.concatenate([CBIS_train_FNs, CBIS_test_FNs], axis=0)

CBIS_all_patches, CBIS_all_labels, CBIS_all_FNs = \
shuffle(CBIS_all_patches, CBIS_all_labels, CBIS_all_FNs, random_state=19510705)

In [4]:
#split the combined data into train and test
train_patches, test_patches, train_labels, test_labels, train_FNs, test_FNs = \
train_test_split(CBIS_all_patches, CBIS_all_labels, CBIS_all_FNs, test_size = 0.183565, random_state=19430727)

**DDSM Data (Normal Images)**

In [5]:
#Howtek data
howtek_patches = np.load(os.path.join("./Processed_norm_256", "howtek_patches_all.npy" ))
howtek_labels = np.load(os.path.join("./Processed_norm_256", "howtek_labels_all.npy" ))
howtek_FNs = np.load(os.path.join("./Processed_norm_256", "howtek_FileNames_all.npy" ))

#Lumisys data
lumisys_patches = np.load(os.path.join("./Processed_norm_256", "howtek_patches_all.npy" ))
lumisys_labels = np.load(os.path.join("./Processed_norm_256", "howtek_labels_all.npy" ))
lumisys_FNs = np.load(os.path.join("./Processed_norm_256", "howtek_FileNames_all.npy" ))

#combined normal data 
normal_patches = np.concatenate([howtek_patches, lumisys_patches], axis = 0)
normal_labels = np.concatenate([howtek_labels, lumisys_labels], axis = 0)
normal_FNs = np.concatenate([howtek_FNs, lumisys_FNs], axis = 0)

# print("Normal Patches:", normal_patches.shape)
# print("Normal Labels:", normal_labels.shape)
# print("Normal File Names:", normal_FNs.shape)

#Shuffle and split DDSM into train and test dataset
normal_patches, normal_labels, normal_FNs = \
shuffle(normal_patches, normal_labels, normal_FNs, random_state=20170301)

print("No. of DDSM Images:", normal_patches.shape)

No. of DDSM Images: (79082, 256, 256, 1)


In [6]:
#select 51.3% of DDSM data considering the number of CBIS data size

DDSM_norm_patches, X_norm_patches, DDSM_norm_Lbls, y_norm_Lbls, DDSM_norm_FNs, Z2_norm_FNs = \
train_test_split(normal_patches, normal_labels, normal_FNs, test_size = 0.487, random_state=20200121)


X_norm_train, X_norm_test, y_norm_train, y_norm_test, norm_FNs_train, norm_FNs_test = \
train_test_split(DDSM_norm_patches, DDSM_norm_Lbls, DDSM_norm_FNs, test_size = 0.183565, random_state=6325)


print("DDSM Train Images:", X_norm_train.shape)
print("DDSM Train Labels:", y_norm_train.shape)
print("DDSM Train File Names:", norm_FNs_train.shape)
print("\n")
print("DDSM Test Images:", X_norm_test.shape)
print("DDSM Test Labels:", y_norm_test.shape)
print("DDSM Test File Names:", norm_FNs_test.shape)

DDSM Train Images: (33121, 256, 256, 1)
DDSM Train Labels: (33121,)
DDSM Train File Names: (33121,)


DDSM Test Images: (7448, 256, 256, 1)
DDSM Test Labels: (7448,)
DDSM Test File Names: (7448,)


In [7]:
#check % of train data in the CBIS data and apply to DDSM train and test data split
pct_train = train_patches.shape[0]/(train_patches.shape[0]+test_patches.shape[0])
num_train_ddsm = normal_patches.shape[0]*pct_train
num_test_ddsm = normal_patches.shape[0]*(1-pct_train)

print("% of preferred DDSM train data:", np.round(pct_train, 2))
print("Preferred No. of DDSM train data:", np.round(num_train_ddsm))
print("Preferred No. of DDSM test data:", np.round(num_test_ddsm))

% of preferred DDSM train data: 0.82
Preferred No. of DDSM train data: 64558.0
Preferred No. of DDSM test data: 14524.0


**Merged train and test dataset**

In [8]:
#train data
train_images = np.concatenate([X_norm_train, train_patches], axis=0)
train_labels = np.concatenate([y_norm_train, train_labels], axis=0)
train_FNs = np.concatenate([norm_FNs_train, train_FNs], axis=0)

#test data
test_images = np.concatenate([X_norm_test, test_patches], axis=0)
test_labels = np.concatenate([y_norm_test, test_labels], axis=0)
test_FNs = np.concatenate([norm_FNs_test, test_FNs], axis=0)

**Label encoding**

In [9]:
le = preprocessing.LabelEncoder()
le.fit(train_labels)

LabelEncoder()

In [10]:
list(le.classes_)

['BENIGN_calcification',
 'BENIGN_mass',
 'MALIGNANT_calcification',
 'MALIGNANT_mass',
 'NORMAL']

In [11]:
#Convert Normal to 0 
train_labels_en = le.transform(train_labels) + 1
train_labels_en[train_labels_en==5]=0

test_labels_en = le.transform(test_labels) + 1
test_labels_en[test_labels_en==5]=0

In [12]:
np.unique(train_labels_en)

array([0, 1, 2, 3, 4], dtype=int64)

In [13]:
np.unique(test_labels_en)

array([0, 1, 2, 3, 4], dtype=int64)

In [14]:
classes = le.classes_
classes = np.insert(classes, 0, 'NORMAL', axis=0)
classes = classes[0:5]

In [15]:
classes

array(['NORMAL', 'BENIGN_calcification', 'BENIGN_mass',
       'MALIGNANT_calcification', 'MALIGNANT_mass'], dtype='<U23')

In [16]:
train_bin_labels = np.zeros(len(train_labels_en)).astype(np.int32)
train_bin_labels[train_labels_en != 0] = 1

test_bin_labels = np.zeros(len(test_labels_en)).astype(np.int32)
test_bin_labels[test_labels_en != 0] = 1

In [17]:
np.unique(train_labels_en)

array([0, 1, 2, 3, 4], dtype=int64)

In [18]:
np.unique(train_bin_labels)

array([0, 1])

In [19]:
np.unique(test_labels_en)

array([0, 1, 2, 3, 4], dtype=int64)

In [20]:
np.unique(test_bin_labels)

array([0, 1])

**Save Labels**

In [21]:
np.save(os.path.join("Label", "train_labels_en.npy"), train_labels_en)
np.save(os.path.join("Label", "test_labels_en.npy"), test_labels_en)
np.save(os.path.join("Label", "train_bin_labels.npy"), train_bin_labels)
np.save(os.path.join("Label", "test_bin_labels.npy"), test_bin_labels)

**Distribution of data**

In [22]:
pd.value_counts(train_labels_en, normalize = True)

0    0.799908
1    0.069507
2    0.048592
4    0.042240
3    0.039753
dtype: float64

In [23]:
pd.value_counts(test_labels_en, normalize = True)

0    0.799828
1    0.069480
2    0.048432
4    0.043492
3    0.038767
dtype: float64

In [24]:
pd.value_counts(train_bin_labels, normalize = True)

0    0.799908
1    0.200092
dtype: float64

In [25]:
pd.value_counts(test_bin_labels, normalize = True)

0    0.799828
1    0.200172
dtype: float64

**Test and Validation Data Preparation**

In [26]:
X_val, X_test, y_val, y_test, y_val_multi, y_test_multi = \
    train_test_split(test_images, test_bin_labels, test_labels_en, test_size=0.5, random_state=19730104)
X_train, y_train, y_train_multi = \
     shuffle(train_images, train_bin_labels, train_labels_en, random_state=100)



In [27]:
X_train.shape

(41406, 256, 256, 1)

In [28]:
y_train.shape

(41406,)

**Save Final Data**

In [31]:
np.save(os.path.join("Data/256", 'X_train.npy'), X_train)
np.save(os.path.join("Data/256", 'y_train.npy'), y_train)
np.save(os.path.join("Data/256", 'train_labels_multi.npy'), y_train_multi)

In [34]:
np.save(os.path.join("Data/256", 'X_val.npy'), X_val)
np.save(os.path.join("Data/256", 'y_val.npy'), y_val)
np.save(os.path.join("Data/256", 'y_val_labels_multi.npy'), y_val_multi)

In [35]:
np.save(os.path.join("Data/256", 'X_test.npy'), X_test)
np.save(os.path.join("Data/256", 'y_test.npy'), y_test)
np.save(os.path.join("Data/256", 'y_test_labels_multi.npy'), y_test_multi)