In this note bookbook, we will create and save images by reading parquet files

In [1]:
%matplotlib inline 
# If we don't do this then image will open as pop-up and not in notebook

In [2]:
#!pip3 install tensorflow

In [3]:
!pip3 show tensorflow

Name: tensorflow
Version: 1.14.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /home/ubuntu/.local/lib/python3.6/site-packages
Requires: six, astor, keras-applications, keras-preprocessing, numpy, wrapt, grpcio, tensorflow-estimator, absl-py, wheel, termcolor, google-pasta, gast, tensorboard, protobuf


In [4]:
# !export PYTHONPATH=/home/ubuntu/.local/lib/python3.6/site-packages:$PYTHONPATH.
# ^ This makes all other libraries inaccessible

In [7]:
import tensorflow as tf

In [2]:
import pyarrow.parquet as pq 
import pandas as pd
import numpy as np
from PIL import Image as im
import matplotlib as plt
from matplotlib.pyplot import imshow

In [3]:
import wget
import time
import os
import copy
from skimage import io

In [6]:
!ls -l --block=M /home/ubuntu/datasets/mammography/

total 2951M
drwxr-xr-x 2 ubuntu ubuntu    1M Jan 28 21:05 cv10_data
-rw-r--r-- 1 ubuntu ubuntu    1M Sep 30 23:37 cv10_labels.npy
-rw-rw-r-- 1 ubuntu ubuntu 2951M Jan 28 20:52 ddsm-mammography.zip
drwxr-xr-x 2 ubuntu ubuntu    1M Jan 28 21:05 test10_data
-rw-r--r-- 1 ubuntu ubuntu    1M Sep 30 23:38 test10_labels.npy
drwxr-xr-x 2 ubuntu ubuntu    1M Jan 28 21:05 training10_0
drwxr-xr-x 2 ubuntu ubuntu    1M Jan 28 21:05 training10_1
drwxr-xr-x 2 ubuntu ubuntu    1M Jan 28 21:05 training10_2
drwxr-xr-x 2 ubuntu ubuntu    1M Jan 28 21:05 training10_3
drwxr-xr-x 2 ubuntu ubuntu    1M Jan 28 21:05 training10_4


As tensorflow could not be loaded we are going to use tfrecord library:
https://github.com/vahidk/tfrecord

In [8]:
# make sure everything was written properly by reading it back out
def read_and_decode_single_example(filenames):
    filename_queue = tf.train.string_input_producer(filenames, num_epochs=1)
    
    reader = tf.TFRecordReader()
    
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            'label_normal': tf.FixedLenFeature([], tf.int64),
            'image': tf.FixedLenFeature([], tf.string)
        })
    
    # now return the converted data
    label = features['label_normal']
    image = tf.decode_raw(features['image'], tf.uint8)
    image = tf.reshape(image, [299, 299, 1])
    
    return label, image

In [9]:
label, image = read_and_decode_single_example(["~/datasets/mammography/training10_0/training10_0.tfrecords", "~/datasets/mammography/training10_1/training10_1.tfrecords"])
images_batch, labels_batch = tf.train.batch([image, label], batch_size=16, capacity=2000)
global_step = tf.Variable(0, trainable=False)

Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(string_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(input_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensors(tensor).repeat(num_epochs)`.
Instructions for updating:
Prefer Dataset.range instead.
Instructions for updating:
Prefer Dataset.range instead.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
Queue-based inp

In [11]:
labels_batch.get_shape

<bound method Tensor.get_shape of <tf.Tensor 'batch:1' shape=(16,) dtype=int64>>

### The Train Files

In [6]:
def show_image_resized(dataframe, n):
    img_name = dataframe.iloc[n, 0]
    img_data = np.array(dataframe.iloc[n, 1:])
    img_data = img_data.astype('float').reshape(137,236)
    #img_data = im.resize img_data
    img_data = cv2.resize(img_data, dsize=(128, 128), interpolation=cv2.INTER_CUBIC)
    # normalizing the image
    img_data = (img_data*(255.0/img_data.max())).astype(np.uint8)
    # show
    imshow(img_data, cmap='gray')

In [1]:
# show_image_resized(train, 69)

In [7]:
import cv2

In [8]:
img_dim = 224

### Reading and converting training data

In [None]:
train_0 = pq.read_pandas('/home/ubuntu/datasets/bengali-ai/train_image_data_0.parquet').to_pandas()

In [None]:
dataframe = train_0

for row in range(len(dataframe)):
    img_name = dataframe.iloc[row, 0]
    img_data = np.array(dataframe.iloc[row, 1:])
    img_data = img_data.astype('float').reshape(137,236)
    img_data = cv2.resize(img_data, dsize=(img_dim, img_dim), interpolation=cv2.INTER_CUBIC)
    # normalizing the image
    img_data = (img_data*(255.0/img_data.max())).astype(np.uint8)
    # save
    plt.image.imsave('/home/ubuntu/datasets/bengali-ai/training_images/train_0/'+img_name+'.png', img_data, cmap='gray')

In [94]:
del train_0

In [10]:
train_1 = pq.read_pandas('/home/ubuntu/datasets/bengali-ai/train_image_data_1.parquet').to_pandas()

In [11]:
dataframe = train_1

for row in range(len(dataframe)):
    img_name = dataframe.iloc[row, 0]
    img_data = np.array(dataframe.iloc[row, 1:])
    img_data = img_data.astype('float').reshape(137,236)
    img_data = cv2.resize(img_data, dsize=(img_dim, img_dim), interpolation=cv2.INTER_CUBIC)
    # normalizing the image
    img_data = (img_data*(255.0/img_data.max())).astype(np.uint8)
    # save
    plt.image.imsave('/home/ubuntu/datasets/bengali-ai/training_images/train_1/'+img_name+'.png', img_data, cmap='gray')

In [None]:
del train_1

### Reading the test data

In [None]:
test_0 = pq.read_pandas('/home/ubuntu/datasets/bengali-ai/test_image_data_0.parquet').to_pandas()

In [13]:
test_0.shape

(3, 32333)

In [10]:
dataframe = test_0

for row in range(len(dataframe)):
    img_name = dataframe.iloc[row, 0]
    img_data = np.array(dataframe.iloc[row, 1:])
    img_data = img_data.astype('float').reshape(137,236)
    img_data = cv2.resize(img_data, dsize=(img_dim, img_dim), interpolation=cv2.INTER_CUBIC)
    # normalizing the image
    img_data = (img_data*(255.0/img_data.max())).astype(np.uint8)
    # save
    plt.image.imsave('/home/ubuntu/datasets/bengali-ai/testing_images/test_0/'+img_name+'.png', img_data, cmap='gray')    

In [23]:
del test_0