<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Image-processing-with-Keras" data-toc-modified-id="Image-processing-with-Keras-1">Image processing with Keras</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Get-images" data-toc-modified-id="Get-images-1.0.1">Get images</a></span></li><li><span><a href="#Proccess-images-as-arrays" data-toc-modified-id="Proccess-images-as-arrays-1.0.2">Proccess images as arrays</a></span></li><li><span><a href="#Exploratory-analysis" data-toc-modified-id="Exploratory-analysis-1.0.3">Exploratory analysis</a></span></li><li><span><a href="#Build-the-model" data-toc-modified-id="Build-the-model-1.0.4">Build the model</a></span></li></ul></li></ul></li></ul></div>

__Disclaimer__:

This lesson could be significantly improved. It does not run as is.

# Image processing with Keras

Keras is a deep learning library built on top of TensorFlow. We can use it to convert our image data to arrays. Image processing is often done with deep learning. In this example, I will use Naive Bayes instead, as a baseline to later show how much better deep learning does on the same task.

### Get images

In [9]:
from selenium import webdriver
import os
import time
import requests

# Scrape Google image-search thumbnails for `searchterm` and save every image
# that downloads successfully into a folder named after the search term.

# Set up Google search url with term
searchterm = 'memes'
url = "https://www.google.co.in/search?q="+searchterm+"&source=lnms&tbm=isch"

# Make sure ChromeDriver is installed: https://chromedriver.chromium.org/getting-started
browser = webdriver.Chrome()

# Set up variables to count attempted and successful downloads
counter = 0
succounter = 0

try:
    browser.get(url)
    browser.execute_script("window.scrollBy(0,10000)")

    # "class name" is the locator-strategy string behind By.CLASS_NAME. Passing it
    # to find_elements() works whether or not the installed Selenium still ships
    # the find_elements_by_class_name() shortcut.
    elements = browser.find_elements("class name", 'rg_i')
    print(len(elements))

    # Makes the folder if it doesn't already exist
    os.makedirs(searchterm, exist_ok=True)

    for x in elements:
        src = None
        try:
            x.click()
            time.sleep(1)  # give the preview panel time to load
            element = browser.find_elements("class name", 'v4dQwb')

            print("Total Count:", counter)
            print("Succsessful Count:", succounter)

            # The first click uses the first preview panel; every later click
            # uses the second one (presumably the page keeps the previous panel).
            panel = element[0] if counter == 0 else element[1]
            img = panel.find_element("class name", 'n3VNCb')
            src = img.get_attribute("src")

            # Saves the image
            r = requests.get(src, timeout=10)
            if r.status_code == 200:
                with open(searchterm+"/image_"+str(counter)+".png", 'wb') as f:
                    f.write(r.content)
                # Count a success only when the image was actually written
                succounter = succounter + 1

        except Exception as e:
            # str() is required: `src` may be None and cannot be concatenated directly
            print("could not load : "+str(src))
            print(e)

        counter = counter + 1
finally:
    # quit() ends the whole WebDriver session even if the scrape failed midway
    browser.quit()

print(succounter, "pictures succesfully downloaded")

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


In [None]:
import os
from subprocess import check_output
import sys
from time import time, sleep

import numpy as np 
import pandas as pd 
import seaborn as sns

from IPython.display import display
from IPython.display import Image as _Imgdis
from PIL import Image
from scipy import ndimage

from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

### Proccess images as arrays

In [10]:
# Create a dataset of memes
folder_1 = "memes"

# List the regular files in the folder. os.listdir() returns names in arbitrary
# order, so sort them to keep the file order stable between runs.
meme_files = sorted(
    f for f in os.listdir(folder_1) if os.path.isfile(os.path.join(folder_1, f))
)

print("Working with {0} images".format(len(meme_files)))
print("Image examples: ")

# Show two examples using display(_Imgdis()), which can read the image files.
# Slicing never raises on a short list; fall back to the first two files when the
# folder holds fewer than 151 images.
example_files = meme_files[150:152] or meme_files[:2]
for example_name in example_files:
    print(example_name)
    display(_Imgdis(filename=folder_1 + "/" + example_name, width=240, height=320))

FileNotFoundError: [Errno 2] No such file or directory: 'memes'

In [None]:
# Create also a dataset of non-memes, pulled from this dataset:
# http://press.liacs.nl/mirflickr/mirdownload.html

folder_0 = "non_memes"

# List the regular files in the folder. os.listdir() returns names in arbitrary
# order, so sort them to keep the file order stable between runs.
non_meme_files = sorted(
    f for f in os.listdir(folder_0) if os.path.isfile(os.path.join(folder_0, f))
)

print("Working with {0} images".format(len(non_meme_files)))
print("Image examples: ")

# Show two examples using display(_Imgdis()), which can read the image files.
# Slicing never raises on a short list; fall back to the first two files when the
# folder holds fewer than 151 images.
example_files = non_meme_files[150:152] or non_meme_files[:2]
for example_name in example_files:
    print(example_name)
    display(_Imgdis(filename=folder_0 + "/" + example_name, width=240, height=320))

In [None]:
# Prepare the containers that the image-processing loops fill in.

# Dimensions to standardize the images to
image_height = 120
image_width = 160
channels = 3

# Labels: 1 for every meme file, followed by 0 for every non-meme file
y_data = [1]*len(meme_files) + [0]*len(non_meme_files)

# One row of per-channel values for each image. np.zeros is used instead of
# np.ndarray(shape=...) because the latter leaves the memory uninitialized, so any
# row that is never written would hold arbitrary values.
meme_colors = np.zeros(shape=(len(meme_files), channels), dtype=np.float32)
non_meme_colors = np.zeros(shape=(len(non_meme_files), channels), dtype=np.float32)

# Original image areas, collected before the images are resized
image_size_areas = []

# Make a 4-D array: one (channels, height, width) block per image
dataset = np.zeros(shape=(len(y_data), channels, image_height, image_width), dtype=np.float32)

In [None]:
# Add an array of each image to our dataset: memes first, then non-memes, so the
# row order matches the label order in y_data.
def _load_images_into_dataset(folder, files, colors, first_row):
    """Load, resize and store every image of one folder in ``dataset``.

    Parameters
    ----------
    folder : str
        Directory that contains the image files.
    files : list of str
        File names inside ``folder``, in the order they are stored.
    colors : numpy.ndarray
        Array with one row per file; row ``j`` receives the per-channel pixel
        sums of file ``j``.
    first_row : int
        Index of the ``dataset`` row that receives the first image.

    Returns
    -------
    int
        Index of the ``dataset`` row following the last one used.
    """
    row = first_row
    for j, filename in enumerate(files):
        try:
            img = load_img(os.path.join(folder, filename))  # this is a PIL image
            # Save initial dimensions before resizing
            image_size_areas.append(img.size[0] * img.size[1])
            # PIL's resize() expects (width, height), in that order
            img = img.resize((image_width, image_height))
            # Ask for (height, width, channels) explicitly so the layout does not
            # depend on the Keras backend configuration
            x = img_to_array(img, data_format="channels_last")
            # Sum over both pixel axes to get one total per colour channel
            colors[j] = x.sum(axis=(0, 1))
            # Move the channel axis to the front; a plain reshape would scramble
            # the pixels instead of reordering the axes
            dataset[row] = x.transpose(2, 0, 1)
        except Exception as e:
            # Keep the row (labels are positional) but make sure it holds zeros
            # rather than stale data from a previous run
            colors[j] = 0
            dataset[row] = 0
            print("failed on %d" % (row + 1), e)
        row += 1
        if row % 250 == 0:
            print("%d images to array" % row)
    return row


# Start from an empty list so re-running this cell does not record the sizes twice
image_size_areas.clear()

i = _load_images_into_dataset(folder_1, meme_files, meme_colors, 0)
i = _load_images_into_dataset(folder_0, non_meme_files, non_meme_colors, i)

### Exploratory analysis

In [None]:
# Report the dataset size and the share of each class (label 1 = meme, 0 = non-meme)
total_images = len(y_data)
meme_count = sum(y_data)
print(total_images)
print("memes:", meme_count/total_images, "non-memes:", (total_images-meme_count)/total_images)

In [None]:
# Histogram of the original image areas (size[0] * size[1]) recorded before resizing
original_areas = pd.Series(data=image_size_areas)
original_areas.hist()

In [None]:
# Overlay the distributions of the three values stored per meme image in
# meme_colors, drawing column 0 in red, column 1 in green and column 2 in blue
first_col, second_col, third_col = meme_colors.T
sns.distplot(first_col, color='r')
sns.distplot(second_col, color='g')
sns.distplot(third_col, color='b')

In [None]:
# Overlay the distributions of the three values stored per non-meme image in
# non_meme_colors, drawing column 0 in red, column 1 in green and column 2 in blue
first_col, second_col, third_col = non_meme_colors.T
sns.distplot(first_col, color='r')
sns.distplot(second_col, color='g')
sns.distplot(third_col, color='b')

### Build the model

In [None]:
# The model expects a 2-D array, so flatten the 4-D dataset to 2-D: every channel
# of every image becomes its own row of image_height * image_width values.
dataset_flattened = dataset.reshape(len(y_data) * channels, image_height * image_width)

# Repeat each image's label once per channel row. Deriving the labels from y_data
# and `channels` (instead of a hard-coded 3 and the file lists) keeps them aligned
# with the rows of dataset_flattened.
# NOTE(review): rows from the same image can end up in both the train and the test
# split later on - confirm this is acceptable for the experiment.
y_data_flattened = [label for label in y_data for _ in range(channels)]

In [None]:
# Sanity check: the flattened dataset should hold three rows per image
flattened_rows = len(dataset_flattened)
expected_rows = len(dataset)*3
print(flattened_rows, expected_rows)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    dataset_flattened,
    y_data_flattened,
    test_size=0.2,
    random_state=33,
)

In [None]:
# Compare the share of positive (label 1) rows in the train and test splits
train_positive_share = sum(y_train)/len(y_train)
test_positive_share = sum(y_test)/len(y_test)
print(train_positive_share, test_positive_share)

In [None]:
# Fit scikit-learn's multinomial Naive Bayes classifier on the training split
nb = MultinomialNB()
nb.fit(X=X_train, y=y_train)

In [None]:
# Predict a label for every row of the held-out test split
preds = nb.predict(X_test)

# Element-wise comparison: True wherever the prediction matches the true label
accuracy = (preds == y_test)

# Show the share of correct predictions as a percentage
n_correct, n_total = accuracy.sum(), len(accuracy)
'Accuracy : {:.2%}'.format(n_correct / n_total)

This is pretty poor performance, considering that the baseline accuracy from simply labeling every picture as a meme would be ~40%.

Source: https://www.kaggle.com/lgmoneda/from-image-files-to-numpy-arrays