# Dataset concatenation

This notebook converts the figshare `.mat` files into `.jpg` images and merges the four source archives into a single archive for later use. It also exports the resized images, together with their labels, to a CSV file.

### Imports

In [None]:
from zipfile import ZipFile, ZIP_DEFLATED
import os
import io

import numpy as np
import h5py
from PIL import Image
import pandas as pd
import cv2
from google.colab import drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# shared-drive data folder used by the rest of the notebook is reachable.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Conversion

In [None]:
# Shared-drive folder that holds the source figshare archives; the converted
# JPG archive and the CSV export are written back into the same folder.
data_path = "/content/drive/Shareddrives/CS539 Group 6/data"

In [None]:
def mat_to_jpg(f):
  """Convert one opened figshare ``.mat`` (MATLAB v7.3 / HDF5) file to an image.

  Parameters
  ----------
  f : h5py.File-like
      Open handle exposing a ``cjdata`` group with ``image`` and ``label``
      members. The handle is always closed before this function returns,
      including when reading fails.

  Returns
  -------
  tuple
      ``(im, label)``: ``im`` is a ``PIL.Image`` built from the scan after
      min-max scaling to the 0-255 ``uint8`` range, and ``label`` is the
      scalar stored at ``cjdata/label[0, 0]``.
  """
  # Reading v7.3 .mat files in Python:
  # https://stackoverflow.com/questions/17316880/reading-v-7-3-mat-file-in-python
  try:
    cjdata = f['cjdata']  # <HDF5 group "/cjdata" (5 members)>
    image = np.array(cjdata.get('image')).astype(np.float64)  # In MATLAB: image = cjdata.image
    label = cjdata.get('label')[0, 0]  # [0, 0] indexing converts the label to a scalar
  finally:
    # Close even if a member is missing or unreadable so the handle never leaks.
    f.close()

  # Min-max scale to 0..255. A constant image has zero range, and dividing by
  # it would produce NaNs whose uint8 cast is undefined, so map it to zeros.
  lo = np.min(image)
  hi = np.max(image)
  span = hi - lo
  if span > 0:
    scaled = ((image - lo) / span) * 255
  else:
    scaled = np.zeros_like(image)
  image = scaled.astype(np.uint8)

  # Saving a numpy array as an image:
  # https://stackoverflow.com/questions/902761/saving-a-numpy-array-as-an-image
  im = Image.fromarray(image)
  return (im, label)

In [None]:
# Running count of images written per label; the current value is used as the
# numeric prefix of the next JPG name for that label, then incremented.
tumor_type_counter = {
    1.0 : 1,
    2.0 : 1,
    3.0 : 1,
}

# Label value stored in the .mat file -> tumor type used in the JPG file name.
label_dict = {
    1.0 : "meningioma",
    2.0 : "glioma",
    3.0 : "pituitary",
}

new_archive_path = f"{data_path}/figshare_jpg.zip"

with ZipFile(new_archive_path, "w", ZIP_DEFLATED) as new_zf:
  # Loop through all of our figshare archives.
  # sorted(): os.listdir returns entries in arbitrary order, and the
  # counter-based JPG names depend on the order archives are processed.
  for archive in sorted(os.listdir(data_path)):
    # Only the source .zip archives qualify. This skips the JPG archive being
    # written here as well as other figshare-named files in the folder (e.g.
    # the figshare.csv export), which would otherwise fail to open as a zip
    # when the notebook is re-run.
    is_source_archive = (
        archive.endswith(".zip")
        and "figshare" in archive
        and "jpg" not in archive
    )
    if not is_source_archive:
      continue

    curr_archive_path = f"{data_path}/{archive}"

    # Opening the current archive
    print(f"Beginning the converstion of {curr_archive_path.split('/')[-1]}")
    with ZipFile(curr_archive_path) as zf:
      for mat_file in zf.namelist():

        # Converting from .mat to .jpg, code adapted from:
        # https://stackoverflow.com/questions/59208896/converting-mat-file-extension-image-to-jpg-via-python
        if mat_file.endswith(".mat"):
          # h5py needs a seekable file object, so buffer the member in memory.
          byte_file = io.BytesIO(zf.read(mat_file))
          f = h5py.File(byte_file, 'r')
          im, label = mat_to_jpg(f)  # mat_to_jpg closes f
          jpg_filename = f"{tumor_type_counter[label]}_{label_dict[label]}.jpg"
          tumor_type_counter[label] += 1

          # Writing Image to ZipFile. Code adapted from:
          # https://stackoverflow.com/questions/21734313/in-memory-image-to-zipfile
          with io.BytesIO() as image_file:
            im.save(image_file, format="JPEG")
            new_zf.writestr(jpg_filename, image_file.getvalue())
    print("Converstion complete\n")

Beginning the converstion of figshare_1-766.zip
Converstion complete

Beginning the converstion of figshare_1533-2298.zip
Converstion complete

Beginning the converstion of figshare_2299-3064.zip
Converstion complete

Beginning the converstion of figshare_767-1532.zip
Converstion complete



## Checking Accuracy of Conversion

In [None]:
# Sanity check: the initial dataset contains 3064 images, so the converted
# archive should list the same number of entries, all with distinct names.
with ZipFile(new_archive_path) as zf:
  namelist = zf.namelist()

total_entries = len(namelist)
unique_entries = len(pd.Series(namelist).unique())

print(f"Initial dataset: 3064 images\nNew archive:     {total_entries} images\n")
print(f"Initial dataset: 3064 unique images\nNew archive:     {unique_entries} unique images")

Initial dataset: 3064 images
New archive:     3064 images

Initial dataset: 3064 unique images
New archive:     3064


## Converting JPG to CSV

In [None]:
image_size = 150
images = []

# Read every JPG back from the converted archive and collect
# (label, resized array) pairs.
with ZipFile(new_archive_path) as zf:
    for name in zf.namelist():
        # The text between the last "_" and the first following "." of the
        # entry name is the tumor type; the label is "<type>_tumor".
        tumor_type = name.rsplit('_', 1)[-1].split('.')[0]
        label = f"{tumor_type}_tumor"

        jpg_bytes = zf.read(name)
        decoded = np.asarray(Image.open(io.BytesIO(jpg_bytes)))

        # Replicate the decoded array three times along a new last axis,
        # then resize to image_size x image_size.
        rgb = np.stack([decoded, decoded, decoded], axis=-1)
        img = cv2.resize(rgb, (image_size, image_size))

        images.append((label, img))

In [None]:
# One row per image: the label first, followed by every value of the
# flattened image array.
data = [
    [tumor_label, *pixels.flatten().tolist()]
    for tumor_label, pixels in images
]
df = pd.DataFrame(data)
df.to_csv(f"{data_path}/figshare.csv")

In [None]:
# Column 0 of df holds the label; report how many rows carry each one.
print("Class Distributions")
labels = df[0]
for t_type in labels.unique():
  n_rows = len(df.loc[labels == t_type])
  print(f"\t{t_type}: {n_rows}")

Class Distributions
	meningioma_tumor: 708
	glioma_tumor: 1426
	pituitary_tumor: 930
