# Project 3: 'Image Analysis'
## First Year Project  
### ITU, Spring 2021

This notebook contains all the code developed to explore, wrangle and analyse the raw data sets for our project, 'Image Analysis'.

Contributors:  
- Andy Bao Nguyen (anbn)
- Florian Micliuc (flmi)
- Mattias Wohlert 
- Sofia Elena Terenziani (sote)

Created: 06-04-2021 

Last modified:

### Imports

In [None]:
import csv
import os
import timeit
from collections import Counter

import cv2
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
from PIL import Image
from scipy.spatial.distance import cdist
from scipy.stats.stats import mode
from skimage import filters
from skimage import morphology
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit

### Functions

Sanity check functions

In [None]:
def check_null_values(file, string):
    """Print whether a table contains any null value.

    Parameters
    ----------
    file : object exposing ``isnull().values.any()`` (the notebook passes
        pandas DataFrames)
    string : label inserted into the printed message

    Returns
    -------
    None; the result is only printed.
    """
    if file.isnull().values.any():
        template = 'There are null values in {} dataset'
    else:
        template = 'There are no null values in {} dataset'
    print(template.format(string))

In [None]:
def dataset_checker_values(dataset,value):
    """Draw a missingno matrix in which every occurrence of ``value`` counts as missing.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table to inspect. It is not modified.
    value : scalar
        Sentinel that encodes missing data (the notebook passes -1).

    Returns
    -------
    None; the function only draws the plot.
    """
    # replace() without inplace returns a new frame, so the caller's data is
    # left untouched without an explicit copy.
    marked = dataset.replace(value, np.nan)
    msno.matrix(marked)

Colour analysis functions

In [None]:
def crop(image):
    """Crop an image to the bounding box of its non-zero pixels.

    Parameters
    ----------
    image : numpy.ndarray
        Array with at least two dimensions (rows, columns[, channels]).
        A pixel counts as non-zero when any of its values is non-zero.

    Returns
    -------
    numpy.ndarray
        Slice of ``image`` covering every non-zero row and column, including
        the last ones.

    Raises
    ------
    ValueError
        If ``image`` contains no non-zero value, so no bounding box exists.
    """
    # Only the first two axes (rows, columns) define the box; taking them by
    # position also makes 2-D grayscale images work.
    nonzero_indices = np.nonzero(image)
    y_nonzero, x_nonzero = nonzero_indices[0], nonzero_indices[1]
    if y_nonzero.size == 0:
        raise ValueError('cannot crop an image that has no non-zero pixels')
    # Slice ends are exclusive, so +1 keeps the last non-zero row/column.
    return image[y_nonzero.min():y_nonzero.max() + 1,
                 x_nonzero.min():x_nonzero.max() + 1]

In [None]:
def RGB2HEX(color):
    """Format the first three entries of ``color`` as a ``#rrggbb`` string.

    Each of ``color[0]``, ``color[1]`` and ``color[2]`` is truncated with
    ``int()`` and written as lowercase hexadecimal, zero-padded to two digits.
    """
    channels = (int(color[position]) for position in range(3))
    return "#" + "".join(format(channel, "02x") for channel in channels)

### Loading the data

Data loading description pending

In [None]:
# Locations of the raw inputs, relative to the notebook's working directory.
# Folder holding the example images (the trailing slash is part of the path).
image_folder = '../data/raw/example_image/'
# Folder holding the segmentation masks.
segmentation_folder = '../data/raw/example_segmentation/'
# CSV paths. NOTE(review): the loading cell rebinds these two names to the
# DataFrames it reads, so they only hold path strings until that cell runs.
ground_truth = '../data/raw/example_ground_truth.csv'
features = '../data/features/features.csv'

In [None]:
# os.listdir() returns entries in arbitrary order. Later cells slice and zip
# these listings by position, so sort them to make every run reproducible.
image_files = sorted(os.listdir(image_folder))
segmentation_files = sorted(os.listdir(segmentation_folder))

# NOTE: ground_truth / features are rebound from path strings to DataFrames
# here; re-run the cell that defines the paths before re-running this one.
ground_truth = pd.read_csv(ground_truth)
features = pd.read_csv(features)

## Task 0 - Data checking and filtering

### CSV files sanity checks

In [None]:
# Report, for each loaded table, whether it contains any null value.
for table, label in ((ground_truth, 'ground_truth'), (features, 'features')):
    check_null_values(table, label)

In [None]:
# Treat -1 as the missing-data marker and draw the missingno matrix of each table.
for table in (ground_truth, features):
    dataset_checker_values(table, -1)

There are no gaps in the plots, thus the value -1 (missing data) does not occur in the dataset

In [None]:
# duplicated() flags every row that repeats an earlier one; any() collapses
# the flags into a single answer for the ground-truth table.
has_duplicates = ground_truth.duplicated().any()
print("Duplicate rows" if has_duplicates else "No duplicate rows")

In [None]:
# Same duplicate-row check as above, applied to the features table.
has_duplicates = features.duplicated().any()
print("Duplicate rows" if has_duplicates else "No duplicate rows")

### Data Manipulation

In [None]:
# Index both tables by their image identifier.
# The guards make the cell safe to re-run: once the column has become the
# index, a second set_index() on the same name would raise KeyError.
if ground_truth.index.name != 'image_id':
    ground_truth = ground_truth.set_index('image_id')
if features.index.name != 'id':
    features = features.set_index('id')

In [None]:
# Colour images are the '<image_id>.jpg' files. Selecting by extension skips
# the superpixel PNGs and any stray or hidden directory entries, instead of
# dropping whichever entry happens to be listed first.
color_images_paths = sorted(
    os.path.join(image_folder, name)
    for name in image_files
    if name.lower().endswith('.jpg')
)

# Segmentation masks are the '<image_id>_segmentation.png' files.
segmentation_suffix = '_segmentation.png'
segmentation = sorted(
    os.path.join(segmentation_folder, name)
    for name in segmentation_files
    if name.endswith(segmentation_suffix)
)

# Pair every colour image with the mask that has the same image id, so the
# tuples stay correct even if one folder has extra or missing files.
mask_by_id = {
    os.path.basename(path)[:-len(segmentation_suffix)]: path
    for path in segmentation
}
both_images = []
for image_path in color_images_paths:
    image_id = os.path.splitext(os.path.basename(image_path))[0]
    if image_id in mask_by_id:
        both_images.append((image_path, mask_by_id[image_id]))

# Splitting the image ids by diagnosis.
keratosis_images = ground_truth.index[ground_truth['seborrheic_keratosis'] == 1.0].tolist()
melanoma_images = ground_truth.index[ground_truth['melanoma'] == 1.0].tolist()
# healthy_images is a DataFrame (not a list); use healthy_images.index.tolist() for the ids.
healthy_images = ground_truth[(ground_truth['seborrheic_keratosis'] == 0.0) & (ground_truth['melanoma'] == 0.0)]
healthy_ids = healthy_images.index.tolist()

# Colour image paths per diagnosis.
keratosis = [os.path.join(image_folder, image_id + '.jpg') for image_id in keratosis_images]
melanoma = [os.path.join(image_folder, image_id + '.jpg') for image_id in melanoma_images]
healthy = [os.path.join(image_folder, image_id + '.jpg') for image_id in healthy_ids]

# Segmentation mask paths per diagnosis.
k_segmentation = [os.path.join(segmentation_folder, image_id + segmentation_suffix) for image_id in keratosis_images]
m_segmentation = [os.path.join(segmentation_folder, image_id + segmentation_suffix) for image_id in melanoma_images]
h_segmentation = [os.path.join(segmentation_folder, image_id + segmentation_suffix) for image_id in healthy_ids]

## Task 1 - Feature analysis

### 1.1 Colour analysis

Reading the coloured and the segmentation files

In [None]:
# Position of the keratosis example used throughout the colour analysis.
example_index = 10
# Load the colour image and the segmentation mask found at that position.
im = plt.imread(keratosis[example_index])
mask = plt.imread(k_segmentation[example_index])

Basic knowledge about the lesion, such as its area and perimeter (not strictly necessary for colour analysis)

In [None]:
# Erode the mask with a radius-1 disk; the difference between the mask and
# its eroded version is the band of pixels removed by the erosion.
struct_el = morphology.disk(1)
mask_eroded = morphology.binary_erosion(mask, struct_el)
image_perimeter = mask - mask_eroded

# Both measures are plain sums over the respective arrays
# (pixel counts, assuming the mask only holds 0/1 values - TODO confirm).
area = mask.sum()
perimeter = image_perimeter.sum()
# Optional inspection:
#plt.imshow(image_perimeter,cmap = 'cool') # really small but visible with cool cmap
#print(' The area is -> ',area,'\\n','The perimeter is -> ',perimeter)

Taking the lesion and displaying it over the segmentation

In [None]:
# Work on a copy so the loaded image stays intact, then blank out every
# pixel that the mask marks as background (mask value 0).
im1 = im.copy()
background = (mask == 0)
im1[background] = 0
# All remaining non-zero channel values, flattened into one array.
new_arr_no_0 = im1[im1 != 0]
# Crop to the lesion so later steps process fewer pixels.
im1 = crop(im1)
#plt.imshow(im1) #not best crop but manageable

Getting every pixel's coordinates

In [None]:
# One index triple per array element; the comparison presumably holds for
# every element (non-negative pixel values), so nothing is filtered out.
index_rows = np.column_stack(np.where(im1 >= 0))
# Reverse the column order of each triple: (row, col, channel) -> (channel, col, row).
xy_coords = np.flip(index_rows, axis=1)
# Drop the channel column and keep every third row, leaving one (x, y) pair
# per pixel. Assumes im1 has exactly three channels - the stride relies on it.
a_del = xy_coords[::3, 1:].copy()

Getting every pixel's RGB values and converting them to HEX codes

In [None]:
# Convert the cropped array to a PIL image so pixels can be read by (x, y).
image = Image.fromarray(im1)
rgb_image = image.convert('RGB')
# RGB tuple of every coordinate pair, then its HEX code.
rgb1 = [rgb_image.getpixel((int(x), int(y))) for x, y in a_del]
hex_codes = [RGB2HEX(rgb) for rgb in rgb1]
counted_colours = Counter(hex_codes)
# Black is the masked-out background, not a lesion colour. The default keeps
# the cell from raising KeyError when the crop contains no black pixel.
counted_colours.pop('#000000', None)
#print(len(rgb1)) #3,379,770

In [None]:
#print(counted_colours)

RGB values come as tuples of three numbers (R, G, B). For better performance we convert each tuple into a single HEX code (the same R, G and B values written as one `#RRGGBB` string) and then count how often each code occurs.

## Task 2 - Diagnosis prediction

## Task 3 - Open question

# WORK IN PROGRESS

In [None]:
# Work-in-progress copies of the inputs.
# NOTE(review): `features` is re-read here, replacing any frame of the same
# name created earlier in the notebook.
ex_ground = pd.read_csv("../data/raw/example_ground_truth.csv")  # example ground truth
features = pd.read_csv("../data/features/features.csv")          # features
# Folder paths as plain strings.
images = "../data/raw/example_image/"
segmentations = "../data/raw/example_segmentation/"

In [None]:
# Build a frame restricted to the id column and the two diagnosis columns.
df_exground = pd.DataFrame(ex_ground, columns =['image_id', 'melanoma', 'seborrheic_keratosis'])

# Walk the columns in positional order, printing each position followed by
# the values stored in that column.
for i, (column_name, columnSeriesObj) in enumerate(df_exground.items()):

    print('Column Number : ', i)
    print('Column Contents : ', columnSeriesObj.values)

In [None]:
# Display the features table that was re-read in this section.
features

# Loading of data and scripts

In [None]:
# Execute the group's helper script from the Scripts folder.
# NOTE(review): names defined by the script presumably end up in this
# notebook's namespace and could replace same-named variables - confirm.
%run ../Scripts/fyp2021p3_group00_functions.py

In [None]:
# Read the example image from the same folder variable the rest of this
# section uses, so every cell agrees on the path (and its letter case).
im = plt.imread(os.path.join(images, "ISIC_0014310.jpg"))
print(im.shape)

# Use one axes per view: two consecutive plt.imshow calls draw on the same
# axes, which leaves only the last image visible.
fig, (ax_rgb, ax_red) = plt.subplots(1, 2, figsize=(12, 5))
ax_rgb.imshow(im)
ax_rgb.set_title("RGB image")
ax_red.imshow(im[:, :, 0], cmap='gray')
ax_red.set_title("Red channel")
plt.show()

In [None]:
def rgb2gray(rgb):
    """Convert an RGB image to grayscale as a weighted sum of its channels.

    Parameters
    ----------
    rgb : numpy.ndarray
        Array whose last axis holds the red, green and blue channels.

    Returns
    -------
    numpy.ndarray
        ``0.2989 * R + 0.5870 * G + 0.1140 * B`` with the channel axis removed,
        on the same intensity scale as the input.
    """
    r, g, b = rgb[..., 0], rgb[..., 1], rgb[..., 2]
    # The three weighted channels are added. An earlier version multiplied the
    # red and green terms, so thresholds tuned on that output (e.g. 4500)
    # must be re-derived on the input's own scale.
    gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
    return gray

# Convert the example image with rgb2gray and show the result with a gray colormap.
grey = rgb2gray(im)
plt.imshow(grey,  cmap = "gray")

# The lesion is darker than the skin, so we extract a mask of it

In [None]:
# Flatten the image first: given a 2-D array, plt.hist treats every column as
# a separate dataset, which is what made this cell run for minutes.
plt.hist(grey.ravel(), bins=256);

In [None]:
# The lesion is darker than the surrounding skin, so it is the part of the
# image below the threshold. Otsu's method derives the threshold from the
# intensity histogram of `grey` itself, so it does not depend on the numeric
# scale of the grayscale values or on one particular image.
lesion_threshold = filters.threshold_otsu(grey)
im_lim = grey < lesion_threshold
plt.imshow(im_lim, cmap = "gray") # There can be a little noise within the lesion

## Some messed-up edge detections
The test on the resized image below gives odd-looking results. If you run it, you will get the image in grayscale together with an edge detection computed with the Laplacian method.

In [None]:
# Read the example image in grayscale from the folder used by the rest of
# this section. cv2.imread returns None instead of raising, so check it.
test_image_path = os.path.join(images, "ISIC_0014310.jpg")
test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE)
if test_image is None:
    raise FileNotFoundError("Could not read image: {}".format(test_image_path))
imageS = cv2.resize(test_image, (960, 540))

laplacian_image = cv2.Laplacian(imageS, cv2.CV_64F, ksize=13) #Change the ksize to get different edge detections, only odd numbers work
canny = cv2.Canny(imageS, 20, 30) #Change the two thresholds to control what counts as an edge

# Show the results inline rather than in OpenCV windows: cv2.imshow needs a
# GUI and cv2.waitKey(0) blocks the kernel until a key is pressed. cv2.imshow
# also maps float images from [0, 1] to [0, 255], which saturates the float64
# Laplacian; matplotlib rescales to the data range, and the absolute value
# shows edge strength regardless of sign.
panels = (
    ("Image", imageS),
    ("Laplacian", np.abs(laplacian_image)),
    ("Canny", canny),
)
fig, axes = plt.subplots(1, len(panels), figsize=(18, 4))
for ax, (title, panel) in zip(axes, panels):
    ax.imshow(panel, cmap="gray")
    ax.set_title(title)
    ax.axis("off")
plt.show()

# Loading a folder of images in multiple ways

In [None]:
# Collect the names of the regular files inside the image folder, then read
# each one with OpenCV into an object array, one entry per file.
onlyfiles = [
    entry
    for entry in os.listdir(images)
    if os.path.isfile(os.path.join(images, entry))
]
image_s = np.empty(len(onlyfiles), dtype = object)
for n, file_name in enumerate(onlyfiles):
    image_s[n] = cv2.imread(os.path.join(images, file_name))

In [None]:
# Summarise what was loaded instead of dumping raw pixel arrays: one line per
# file with its array shape, or None when OpenCV could not read the file.
for file_name, loaded in zip(onlyfiles, image_s):
    print(file_name, None if loaded is None else loaded.shape)

In [None]:
def load_images_from_folder(folder):
    """Read every image OpenCV can decode from ``folder``.

    Parameters
    ----------
    folder : str
        Directory to scan (not recursive).

    Returns
    -------
    list
        The arrays returned by ``cv2.imread``, ordered by file name. Entries
        OpenCV cannot read (it returns ``None`` for them) are skipped.
    """
    loaded = []
    # os.listdir() has no guaranteed order; sorting makes the result reproducible.
    for filename in sorted(os.listdir(folder)):
        img = cv2.imread(os.path.join(folder, filename))
        if img is not None:
            loaded.append(img)
    return loaded

In [None]:
# Keep the result in a variable and show only how many images were read;
# a bare call would render every pixel array as the cell output.
loaded_images = load_images_from_folder(images)
len(loaded_images)