# Standardization, or mean removal and variance scaling

In [1]:
from sklearn import preprocessing
import numpy as np

In [2]:
# Toy 3x3 input matrix; the cells below compute statistics per column (axis=0).
X = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])

In [3]:
# Standardize X; the following cells check that the result has zero mean and
# unit standard deviation per column (axis=0).
X_scaled = preprocessing.scale(X)

In [4]:
# Display the standardized matrix.
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

Scaled data has zero mean and unit variance:

In [5]:
# Mean of each column (axis=0 reduces over the rows).
X_scaled.mean(axis=0)

array([ 0.,  0.,  0.])

In [6]:
# Standard deviation of each column (axis=0 reduces over the rows).
X_scaled.std(axis=0)

array([ 1.,  1.,  1.])

Alternative method: scale each feature to the [0, 1] range with `MinMaxScaler` instead of standardizing it.

In [7]:
# Create a MinMaxScaler with its default settings.
min_max_scaler = preprocessing.MinMaxScaler()

In [8]:
# Fit the scaler on X and transform X in a single step.
X_train_minmax = min_max_scaler.fit_transform(X)

In [9]:
# Display the min-max scaled matrix.
X_train_minmax

array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])

In [10]:
# Multiply the min-max scaled matrix by 255 (the recorded output spans 0..255).
# NOTE(review): this rebinds X_scaled, which an earlier cell bound to the
# standardized matrix; re-running the earlier mean/std cells after this one
# would give different results. Consider a distinct name.
X_scaled = X_train_minmax * 255
X_scaled

array([[ 127.5,    0. ,  255. ],
       [ 255. ,  127.5,   85. ],
       [   0. ,  255. ,    0. ]])

In [11]:
import os
from scipy import misc

In [12]:
%pylab inline --no-import-all
from PIL import Image 



Populating the interactive namespace from numpy and matplotlib


In [13]:
# Record the notebook's current working directory and its parent directory.
# print(...) with a single argument produces the same output on Python 2 and 3.
cwd = os.getcwd()
print(cwd)

# Derive the parent from the value already captured instead of querying the
# OS again, so both names always describe the same location.
one_folder_up = os.path.dirname(cwd)
print(one_folder_up)

/Users/peterhirt/datascience/ild-cnn/notebooks
/Users/peterhirt/datascience/ild-cnn


In [32]:
# Split the working directory into its parent path and its last component.
(cwdtop, tail) = os.path.split(cwd)

print(cwdtop)
print('taken out  ' + tail)

# The patch folders are looked up next to the notebook directory.
patch_dir = os.path.join(cwdtop, 'patches_may28')
if not os.path.isdir(patch_dir):
    # Without this check a missing folder surfaces as a bare StopIteration.
    raise IOError('patch directory not found: ' + patch_dir)

# next(...) works on both Python 2 and 3 (generator.next() is Python-2 only).
# Sorting makes the order independent of the filesystem, which matters
# because a later cell selects a category by its index in this list.
category_list = sorted(next(os.walk(patch_dir))[1])
print(category_list)

/Users/peterhirt/datascience/ild-cnn
taken out  notebooks
['consolidation', 'fibrosis', 'ground_glass', 'healthy', 'micronodules', 'reticulation']


In [33]:
# For every category folder, list its sub-category folders and report how many
# directory entries each one contains, followed by the total for the category.
ind = 0  # number of sub-category folders visited across all categories

for category in category_list:
    category_dir = os.path.join(patch_dir, category)
    print(category)

    # Keep only directories (a stray file such as .DS_Store would make the
    # os.listdir call below fail) and sort them for a reproducible order.
    sub_category_list = sorted(
        name for name in os.listdir(category_dir)
        if os.path.isdir(os.path.join(category_dir, name))
    )
    print(sub_category_list)

    category_count = 0

    for sub_category in sub_category_list:
        sub_category_dir = os.path.join(category_dir, sub_category)
        attribute_list = os.listdir(sub_category_dir)

        # This counts every entry in the folder, not only image files.
        print('%s %d' % (sub_category, len(attribute_list)))

        category_count += len(attribute_list)
        ind += 1

    print('the count of the entire category is :  %d' % category_count)
    print('')

consolidation
['apical', 'diffuse', 'non-relevant', 'peripheral_sub_pleural']
apical 6
diffuse 23
non-relevant 14
peripheral_sub_pleural 15
the count of the entire category is :  58

fibrosis
['apical', 'basal', 'diffuse', 'non-relevant', 'perihilar', 'peripheral_sub_pleural']
apical 4
basal 367
diffuse 365
non-relevant 359
perihilar 38
peripheral_sub_pleural 11
the count of the entire category is :  1144

ground_glass
['apical', 'basal', 'diffuse', 'non-relevant', 'peripheral_sub_pleural']
apical 12
basal 186
diffuse 140
non-relevant 486
peripheral_sub_pleural 38
the count of the entire category is :  862

healthy
['apical', 'non-relevant']
apical 59
non-relevant 1571
the count of the entire category is :  1630

micronodules
['diffuse', 'non-relevant', 'peripheral_sub_pleural']
diffuse 1036
non-relevant 1160
peripheral_sub_pleural 131
the count of the entire category is :  2327

reticulation
['apical', 'basal', 'non-relevant', 'peripheral_sub_pleural']
apical 39
basal 109
non-relevant

In [34]:
# choose the dataset to look at

# The category is picked by position in category_list ('fibrosis' in the
# recorded run), so the result depends on the order of that list.
category_chosen = category_list[1]
print(category_chosen)

category_dir = os.path.join(patch_dir, category_chosen)

# Keep only sub-folders and sort them so that index 1 always refers to the
# same sub-category ('basal' in the recorded run), whatever order the
# filesystem returns and whether or not hidden files are present.
sub_category_list = sorted(
    name for name in os.listdir(category_dir)
    if os.path.isdir(os.path.join(category_dir, name))
)

sub_category_chosen = sub_category_list[1]
print(sub_category_chosen)

fibrosis
basal


In [35]:
# Load every .bmp patch of the chosen category / sub-category into memory.
images = []     # one array per patch, as returned by misc.imread
filenames = []  # file name of each patch, parallel to `images`

category = category_chosen
category_dir = os.path.join(patch_dir, category)

sub_categories_dir_list = os.listdir(category_dir)

subCategory = sub_category_chosen
subCategory_dir = os.path.join(category_dir, subCategory)

# Sorted so that images[i] refers to the same file on every run and filesystem.
subCat = sorted(os.listdir(subCategory_dir))

for file_name in subCat:
    # Test the real extension: a substring search would also accept names
    # such as 'patch.bmp.bak', and the comparison is case-insensitive so
    # '.BMP' files are not silently skipped.
    if os.path.splitext(file_name)[1].lower() != '.bmp':
        continue

    # NOTE(review): scipy.misc.imread was deprecated in SciPy 1.0 and removed
    # in SciPy 1.2; on newer SciPy versions this call must be replaced
    # (e.g. by imageio.imread or PIL.Image.open).
    image = misc.imread(os.path.join(subCategory_dir, file_name), flatten=False)

    images.append(image)
    filenames.append(file_name)

# The patch count is simply the number of images loaded.
num_patches = len(images)
print('found %d patches' % num_patches)

found 367 patches


In [36]:
# images[0][1] is index 1 along the first axis of the first loaded patch, so
# this is the mean of that slice only.
# NOTE(review): if the mean of the whole patch was intended, that would be
# images[0].mean() — confirm.
images[0][1].mean()

162.0

In [26]:
# Standard deviation of the same slice (index 1 along the first axis of the
# first loaded patch), not of the whole patch.
# NOTE(review): use images[0].std() if the whole patch was intended — confirm.
images[0][1].std()

40.174120537200317

In [28]:
# Convert the list of loaded patches into a single NumPy array.
npimages = np.array(images)

In [29]:
# Dimensions of the stacked array; the first axis indexes the patches.
npimages.shape

(367, 32, 32)

In [30]:
# Largest value found anywhere in the stacked patches.
npimages.max()

255

In [31]:
# Smallest value found anywhere in the stacked patches.
npimages.min()

0