# Data Preparation

### Input:
##### 1) CSV of Targets for each image
##### 2) All folders of train images from Kaggle in a new folder called train in a local only SML_Project_Data folder (38.1 GB total). The SML_Project_Data folder is in the same folder as the GitHub folder of DS-5220-Supervised-Machine-Learning-Project
### Output:
##### A high-level directory called data is created. Inside it are two major directories, train and val. Each major directory has a separate folder for each of the 5 classes, which results in 10 folders with resized images in them.
### Next Steps:
##### Move high level directory and all contents next to script which will train model. The script which trains the model can create all of the folders, even though they would be empty of course.

In [1]:
import numpy as np
import cv2
import os

# Input Main Folder of Data



In [2]:
# Root folder of the project data, relative to this notebook; the train images and trainLabels.csv are read from it
main_data_folder = '../SML_Project_Data'

# Analysis of a Single Image

In [3]:
# Load a sample image as a 3-channel colour array (flag 1 is cv2.IMREAD_COLOR)
sample_image_path = main_data_folder + '/train/10_left.jpeg'
img = cv2.imread(sample_image_path, 1)
# cv2.imread does not raise on a missing or unreadable file, it returns None,
# so fail here with a clear message instead of an AttributeError on img.shape
if img is None:
    raise FileNotFoundError('Could not read image: ' + sample_image_path)
print(img.shape)
img

(3168, 4752, 3)


array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       ...,

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]], dtype=uint8)

### Create JPEG of Image Object

In [4]:
# Re-encode the loaded array as a new JPEG next to the notebook; the displayed value is cv2.imwrite's success flag
cv2.imwrite('10_left_new.jpeg',img) # Confirmed manually the exported image is the same number of bytes as the original image

True

### Inspect Image Object

In [5]:
# Dimensions of the image array
np.shape(img)

(3168, 4752, 3)

In [6]:
# Value at row 1500, column 2300, channel index 1
img[1500, 2300, 1]

146

In [7]:
# Largest value anywhere in the image array
img.max()

255

In [8]:
# Smallest value anywhere in the image array
img.min()

0

In [9]:
# Compare the in-memory size of the full-size array with a 100 x 100 downscaled copy
print('Original Size', img.shape)

original_bytes = img.size * img.itemsize
print("%d bytes in original image" % original_bytes)

# 1 million bytes in a MB
print('MB in the original array image is', original_bytes / 1000000, 'versus the original image JPEG has 1.5 MB')

# Shrink to 100 x 100 with area interpolation and save the result as a JPEG
resized = cv2.resize(img, (100, 100), interpolation = cv2.INTER_AREA)
print('Resized Size', resized.shape)
# Author's note: the file on disk goes from 1.5 MB to 3 KB
cv2.imwrite('10_left_reshaped_new.jpeg', resized)

print("%d bytes in compressed image" % (resized.size * resized.itemsize))

# Element counts (height * width * channels) of both arrays
original_elements = np.prod(img.shape)
resized_elements = np.prod(resized.shape)
print('Original pixel size is', original_elements)
print('New pixel size is', resized_elements)
print('Factor of reduction is by', original_elements / resized_elements)


Original Size (3168, 4752, 3)
45163008 bytes in original image
MB in the original array image is 45.163008 versus the original image JPEG has 1.5 MB
Resized Size (100, 100, 3)
30000 bytes in compressed image
Original pixel size is 45163008
New pixel size is 30000
Factor of reduction is by 1505.4336


# Downscale Images to 500 x 500 x 3 and Export

In [10]:
from pathlib import Path
import pandas as pd

In [11]:
# Read the label table (one row per image) and preview it
train_target = pd.read_csv(f'{main_data_folder}/trainLabels.csv', sep=',')
print(train_target.shape[0])
train_target.head()

35126


Unnamed: 0,image,level
0,10_left,0
1,10_right,0
2,13_left,0
3,13_right,0
4,15_left,1


### Count Images

In [12]:
# Count how many images listed in the label table are present in the train folder.
# The file name is the image id plus '.jpeg'; is_file() gives a plain True/False,
# so no exception handling or throwaway variables are needed.
total_images = sum(
    Path(main_data_folder + '/train/' + image_id + '.jpeg').is_file()
    for image_id in train_target['image']
)
total_images

35126

## Downsizes all images and moves to subfolders in the main Data folder

In [16]:
# Make the folder that will hold the downsized images and report whether it was new
downsized_data_path = f'{main_data_folder}/downsized_data'
folder_status = 'Creating Data Folder \n'
try:
    os.mkdir(downsized_data_path)
except FileExistsError:
    # Already there from an earlier run; nothing to create
    folder_status = 'Data Folder Already Exists \n'
print(folder_status)

Data Folder Already Exists 



In [19]:
# One resized_<class> folder per disease class inside downsized_data
for disease in range(5):
    path = f'{downsized_data_path}/resized_{disease}'
    print('Looking to create folder of', path)
    outcome = 'Creating Folder \n'
    try:
        os.mkdir(path)
    except FileExistsError:
        # Folder is left over from an earlier run
        outcome = 'Folder Already Exists \n'
    print(outcome)

Looking to create folder of ../SML_Project_Data/downsized_data/resized_0
Folder Already Exists 

Looking to create folder of ../SML_Project_Data/downsized_data/resized_1
Folder Already Exists 

Looking to create folder of ../SML_Project_Data/downsized_data/resized_2
Folder Already Exists 

Looking to create folder of ../SML_Project_Data/downsized_data/resized_3
Folder Already Exists 

Looking to create folder of ../SML_Project_Data/downsized_data/resized_4
Folder Already Exists 



# Downsizing
Downsizes all images in the train folder and writes each one to the folder for its class inside downsized_data

In [22]:
# Resize every labelled training image and write it into the folder of its class
counter = 0

# Report progress about 20 times; max() keeps the modulo below valid when
# there are fewer than 20 images (int(total_images / 20) would be 0)
status = max(1, int(total_images / 20))
print('Will give a status every', status, 'images')

new_shape = (500, 500)

for index, row in train_target.iterrows():
    image_name = row['image'] + '.jpeg'
    train_image_path = main_data_folder + '/train/' + image_name

    # cv2.imread returns None (it does not raise) for a missing or unreadable file
    img2 = cv2.imread(train_image_path, 1)
    if img2 is None:
        print('File not found', train_image_path)
        continue

    # Downscale with area interpolation
    resized = cv2.resize(img2, new_shape, interpolation = cv2.INTER_AREA)

    # Write the resized copy into the folder named after the image's level
    path = downsized_data_path + '/resized_' + str(row['level']) + '/' + image_name
    # cv2.imwrite signals failure through its return value rather than an exception
    if not cv2.imwrite(path, resized):
        print('Could not write', path)
        continue

    counter = counter + 1
    # Checked only right after the counter advances, so a skipped file cannot
    # repeat a status line and nothing is printed while the counter is still 0
    if counter % status == 0:
        print('At', counter, 'images out of', total_images)

print('Processed Imnage Count', counter)

Will give a status every 1756 images
At 1756 images out of 35126
At 3512 images out of 35126
At 5268 images out of 35126
At 7024 images out of 35126
At 8780 images out of 35126
At 10536 images out of 35126
At 12292 images out of 35126
At 14048 images out of 35126
At 15804 images out of 35126
At 17560 images out of 35126
At 19316 images out of 35126
At 21072 images out of 35126
At 22828 images out of 35126
At 24584 images out of 35126
At 26340 images out of 35126
At 28096 images out of 35126
At 29852 images out of 35126
At 31608 images out of 35126
At 33364 images out of 35126
At 35120 images out of 35126
Processed Imnage Count 35126


# Counting Downsized Images

In [55]:
import os
import os, shutil

# Tally the files in each class folder and keep track of the smallest class
min_file_count = np.inf
for disease_type in range(5):
    class_folder = downsized_data_path + '/resized_' + str(disease_type)
    # The first item yielded by os.walk describes class_folder itself
    _root, _subdirs, class_files = next(os.walk(class_folder))
    file_count = len(class_files)
    min_file_count = min(min_file_count, file_count)
    print('Disease Class of', disease_type, 'has count of images of', file_count)
print('Mininum count by class is', min_file_count)

# Split the smallest class size 80/20 into training and validation counts
train_file_count = int(min_file_count * .80)
print('Training set will have', train_file_count)
val_file_count = min_file_count - train_file_count
print('Validation/Test set will have', val_file_count)

Disease Class of 0 has count of images of 25811
Disease Class of 1 has count of images of 2444
Disease Class of 2 has count of images of 5292
Disease Class of 3 has count of images of 873
Disease Class of 4 has count of images of 708
Mininum count by class is 708
Training set will have 566
Validation/Test set will have 142


## Creating Folders for Training and Validation Set

In [24]:
# Display the data root that the train and val folders below are created under
main_data_folder

'../SML_Project_Data'

In [71]:
# Create <main_data_folder>/data/train and <main_data_folder>/data/val.
# os.makedirs also creates the missing parent `data` folder, and each split is
# handled on its own so an existing train folder cannot skip the creation of val.
split_roots = {split: main_data_folder + '/data/' + split for split in ('train', 'val')}
if any(os.path.isdir(root) for root in split_roots.values()):
    print('Train and/or Val Folder Already Exists \n')
for root in split_roots.values():
    os.makedirs(root, exist_ok=True)

# Create one folder per disease class inside both train and val
for disease in (0, 1, 2, 3, 4):
    path = 'resized_' + str(disease)
    print('Looking to create folders of', path)

    for split, label in (('train', 'Train'), ('val', 'Val')):
        try:
            os.mkdir(split_roots[split] + '/' + path)
        except FileExistsError:
            print(label + ' Folder Already Exists in ' + path)

    print('')

Train and/or Val Folder Already Exists 

Looking to create folders of resized_0
Train Folder Already Exists in resized_0

Looking to create folders of resized_1
Train Folder Already Exists in resized_1

Looking to create folders of resized_2
Train Folder Already Exists in resized_2

Looking to create folders of resized_3
Train Folder Already Exists in resized_3

Looking to create folders of resized_4
Train Folder Already Exists in resized_4



# Move Data from downsized_data to the Validation Folder

In [76]:
# Move a class-balanced validation sample (val_file_count images per class)
# from downsized_data into data/val.
counter = 0   # images moved by this run
missing = 0   # listed images found in neither downsized_data nor data/val

# Number of validation images accounted for per disease class
track = {disease: 0 for disease in (0, 1, 2, 3, 4)}

# Shuffle with a fixed seed so the same images are selected on every run
# https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
train_target_shuffled = train_target.sample(frac=1, random_state = 14).reset_index(drop=True)
print(train_target_shuffled.head())

for image_id, level in zip(train_target_shuffled['image'], train_target_shuffled['level']):
    level = int(level)

    # Skip classes whose validation quota is already filled
    if track[level] >= val_file_count:
        continue

    image_name = image_id + '.jpeg'
    resized_path = downsized_data_path + '/resized_' + str(level) + '/' + image_name
    output_path = main_data_folder + '/data/val/resized_' + str(level) + '/' + image_name

    if os.path.isfile(output_path):
        # Already moved by an earlier run: count it towards the quota so that
        # re-running this cell does not move additional images into val
        track[level] += 1
    elif os.path.isfile(resized_path):
        shutil.move(src=resized_path, dst=output_path)
        track[level] += 1
        counter = counter + 1
    else:
        # The file exists in neither place; do not let it use up the class quota
        missing = missing + 1

print('Processed Image Count', counter)

print(track, missing )

         image  level
0  33008_right      0
1   20325_left      0
2  21262_right      0
3  32291_right      0
4   29321_left      0
Processed Image Count 710
{0: 142, 1: 142, 2: 142, 3: 142, 4: 142} 0


In [None]:
# Some unzipping errors to google colab 1
# Each file below was reported as "Upload failed". The names are kept in a list
# so this cell is valid Python and does not raise when the notebook is run.
COLAB_FAILED_UPLOADS = [
    '1544_right.jpeg',
    '1669_right.jpeg',
    '2802_right.jpeg',
    '4533_left.jpeg',
    '4614_right.jpeg',
    '4711_right.jpeg',
    '4852_left.jpeg',
    '4885_right.jpeg',
    '6947_left.jpeg',
    '9404_right.jpeg',
]