In [1]:
%env JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64



# This is a notebook for exploring the results of image-analyzer
# after it has run all steps of the example with default configuration.



from pyspark import SparkConf
from pyspark import SparkContext 
from StringIO import StringIO
import yaml
import numpy as np
import os
from pprint import pprint
import operator 
import sys
# set up Spark
conf = SparkConf()
conf.set('spark.executor.instances', 10)
sc = SparkContext()

def load_image(image):
    """Load one image, where image = (key, blob)"""
    from StringIO import StringIO
    from PIL import Image
    img_quads = []
    img = Image.open(StringIO(image[1]))
    image_object = np.asarray(img, dtype=np.uint8)
    return  image_object


env: JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64


In [20]:

print("Example of measurements done on each image or image selection\n\n")
example = sc.pickleFile('hdfs:///t1/map_each_image/measures').take(1)[0]
print("Keys:", example[1].keys())
print("Centroids in 1 image flattened:", example[1]['cen'])
print("Histogram flattened:", example[1]['histo'])
print("Perceptive hash (abbrev.):", example[1]['phash'][:5])
print("Ward cluster hash (abbrev.):", example[1]['ward'][:5])
print('PCA factors and variance', example[1]['pca_fac'], example[1]['pca_var'])
print("\n\nAnd the candidate images being searched have the same measurements")
cand_example = sc.pickleFile('hdfs:///t1/candidates/c1/measures').take(1)[0]
print("candidate's keys:", cand_example[1].keys())

Example of measurements done on each image or image selection


('Keys:', ['cen', 'pca_var', 'meta', 'phash', 'histo', 'ward', 'pca_fac', 'id'])
('Centroids in 1 image flattened:', array([27, 27, 27,  0,  0,  0, 43, 43, 43, 16, 16, 16,  8,  8,  8], dtype=int32))
('Histogram flattened:', array([ 0,  0,  0,  0,  0,  6, 40, 43, 44, 46, 46,  0,  0,  0,  0,  0,  6,
       40, 43, 44, 46, 46,  0,  0,  0,  0,  0,  6, 40, 43, 44, 46, 46], dtype=int32))
('Perceptive hash (abbrev.):', [6418015912433839526, 4942993201567720280, -7597260727172374833, -3141915621314173119, 386254976385395965])
('Ward cluster hash (abbrev.):', (3483578612713787623, 1888239287327517496, -8917634196879128416, 3483578612713787623, 1888239287327517496))
('PCA factors and variance', array([  1.77710678e-02,   1.77710678e-02,   1.77710678e-02,
        -8.77369024e+14,   4.38684512e+14,   4.38684512e+14,
        -0.00000000e+00,   3.95131881e+30,  -3.95131881e+30]), array([  1.05548410e+03,   8.66052353e-31,   3.20247583e-

In [3]:
print("A kmeans algorithm is run among all images and lookup tables are made between clusters and hashes")

print("Lookup of kmeans cluster in all images to perceptive hashes in one image (abbrev.):\n\n")
pprint(sc.pickleFile("hdfs:///t1/km/cluster_to_phash").take(2))

print("\n\nLookup of kmeans cluster in all images to ward cluster in one image  (abbrev.):\n\n")
pprint(sc.pickleFile("hdfs:///t1/km/cluster_to_phash").take(2))

A kmeans algorithm is run among all images and lookup tables are made between clusters and hashes
Lookup of kmeans cluster in all images to perceptive hashes in one image (abbrev.):


[(3, (6418015912433839526, 4942993201567720280)),
 (3, (4942993201567720280, -7597260727172374833))]


Lookup of kmeans cluster in all images to ward cluster in one image  (abbrev.):


[(3, (6418015912433839526, 4942993201567720280)),
 (3, (4942993201567720280, -7597260727172374833))]


In [4]:
print("Same hash to cluster mapping idea but with hashed ward clusters. \n\n")
print("Lookup of one image's perceptive hash to kmeans cluster within all images (abbrev.):\n\n")
pprint(sc.pickleFile("hdfs:///t1/km/phash_to_cluster").take(2))

print("\n\n\nLookup of one image's ward cluster hashes to kmeans cluster within all images (abbrev.):\n\n")
pprint(sc.pickleFile("hdfs:///t1/km/ward_to_cluster").take(2))

Same hash to cluster mapping idea but with hashed ward clusters. 


Lookup of one image's perceptive hash to kmeans cluster within all images (abbrev.):


[((6418015912433839526, 4942993201567720280), 3),
 ((4942993201567720280, -7597260727172374833), 3)]



Lookup of one image's ward cluster hashes to kmeans cluster within all images (abbrev.):


[(3483578612713787623, 3), (1888239287327517496, 3)]


In [15]:
print("During each iteration of kmeans algorithm among all images,\n the top N hash chunks are counted within each kmeans cluster.")
print("The most common hashes of ward clusters within individual images of the kmeans clsuter 0.\n\n")
clust0_ward = sc.pickleFile('hdfs:////t1/km/ward_unions').take(1)[0]
pprint({k:v for k,v in clust0_ward.items() if v > 2})

During each iteration of kmeans algorithm among all images,
 the top N hash chunks are counted within each kmeans cluster.
The most common hashes of ward clusters within individual images of the kmeans clsuter 0.


{-8917634196879128416: 5,
 -8800265721242882188: 6,
 -8238206338916862869: 4,
 -6538646933861875957: 5,
 -6371097860695599930: 4,
 -5434838886404571572: 6,
 -5013803343762532022: 3,
 -4669024366043471636: 3,
 -4035475343318357777: 4,
 -2699164510210933549: 5,
 -2213663990102495809: 5,
 -2098177597065484460: 5,
 -1967460664227047773: 4,
 -1215487010474602740: 5,
 281200309521801431: 4,
 502896730504143507: 5,
 1675627364594718983: 7,
 1888239287327517496: 3,
 1969635022121237368: 5,
 2470317628725495884: 6,
 3466675275644008814: 6,
 5239582709862753648: 8,
 6276995473865984280: 6,
 6808367820307410009: 3,
 7306362158214069439: 7,
 7393274562421598044: 3,
 7497007922500697532: 6,
 7544431165752988482: 3,
 8391636610095633345: 3,
 8792051771637971719: 3}


In [5]:
print("The most common hashes of perceptive hash chunks within individual images of the kmeans cluster 0.\n\n")
cluster_0 = sc.pickleFile('hdfs:////t1/km/phash_unions').take(1)[0]
pprint(cluster_0)

The most common hashes of perceptive hash chunks within individual images of the kmeans cluster 0.


{-9207030593366120554: 1,
 -9160729706283187099: 1,
 -9134634260714918065: 1,
 -9065867973970490965: 1,
 -9002885334171243746: 1,
 -8985433723281079263: 1,
 -8964603192568826419: 1,
 -8951287419636932573: 1,
 -8944010668599428209: 1,
 -8905142883192741634: 1,
 -8755874599093647724: 1,
 -8676440398739998034: 1,
 -8663852602604624138: 1,
 -8594210990454501429: 1,
 -8555685088220152735: 1,
 -8480300397091823366: 1,
 -8463904529031336148: 1,
 -8457498263555334304: 1,
 -8370670216263085183: 1,
 -8366583772102580628: 1,
 -8360027961837531581: 1,
 -8357665396323878717: 1,
 -8308195073582477461: 1,
 -8268030327445981211: 1,
 -8180684610229367531: 1,
 -8150078092241151397: 1,
 -8125781973210080626: 1,
 -8122404218998938704: 1,
 -8106848623974677867: 1,
 -8102947245783820102: 1,
 -8058069629144039474: 1,
 -8047639015399360126: 1,
 -8034018258211072512: 1,
 -7854216928884826527: 1,
 -7805260430411

In [22]:
print("Kmeans saves a number of lookups. The final ones used are named ward_to_key, phash_to_key\n\n")
print('ward_to_key (ward cluster to training image key name)\n\n')
print(sc.pickleFile('hdfs:///t1/km/ward_to_key').take(2))
print('\n\nphash_to_key (perceptive hash chunk to training image key name)\n\n')
print(sc.pickleFile('hdfs:///t1/km/phash_to_key').take(2))

Kmeans saves a number of lookups. The final ones used are named ward_to_key, phash_to_key


ward_to_key (ward cluster to training image key name)


[(3483578612713787623, 'hdfs://ip-10-237-187-224:9000/imgs/newtest_J-L_Picard.Baldy.gif'), (1888239287327517496, 'hdfs://ip-10-237-187-224:9000/imgs/newtest_J-L_Picard.Baldy.gif')]


phash_to_key (perceptive hash chunk to training image key name)


[((6418015912433839526, 4942993201567720280), 'hdfs://ip-10-237-187-224:9000/imgs/newtest_J-L_Picard.Baldy.gif'), ((4942993201567720280, -7597260727172374833), 'hdfs://ip-10-237-187-224:9000/imgs/newtest_J-L_Picard.Baldy.gif')]


In [21]:
print('The find_similar function in search.py joins images on hashes and kmeans cluster.')
print('Here is a joined table example showing ward cluster hashes that joined similar files.')
ward_join = sc.pickleFile('hdfs:///t1/candidates/c1/ward_matches_full_join').take(2)
pprint(ward_join)

The find_similar function in search.py joins images on hashes and kmeans cluster.
Here is a joined table example showing ward cluster hashes that joined similar files.
[(6276995473865984280,
  ((3,
    ((6276995473865984280,
      'hdfs://ip-10-237-187-224:9000/fuzzy/newtest_bwolen.gif'),
     3483578612713787623)),
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_ew-friends.gif')),
 (6276995473865984280,
  ((3,
    ((6276995473865984280,
      'hdfs://ip-10-237-187-224:9000/fuzzy/newtest_bwolen.gif'),
     3483578612713787623)),
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_ew-friends.gif'))]


In [15]:
print("Finally the join above is condensed to image name pairs.\n")
print('In the example search, we should see image keys with /fuzzy/ filenames lining up same filename in /imgs/.')
ward_matches = sc.pickleFile('hdfs:///t1/candidates/c1/ward_matches_key_counts')
phash_matches = sc.pickleFile('hdfs:///t1/candidates/c1/phash_matches_key_counts')
#joined = ward_matches.join(phash_matches)
print("Ward")
wm = ward_matches.take(1)
pprint(wm)
print('Note these counts need to be de-duplicated first, but they show the idea of voting based on hash.')

Finally the join above is condensed to image name pairs.

In the example search, we should see image keys with /fuzzy/ filenames lining up same filename in /imgs/.
Ward
[('hdfs://ip-10-237-187-224:9000/fuzzy/newtest_gpripe.gif',
  ('hdfs://ip-10-237-187-224:9000/imgs/newtest_gpripe.gif', 31920),
  {'hdfs://ip-10-237-187-224:9000/imgs/newtest_J-L_Picard.Baldy.gif': 1140,
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_addams-family.gif': 2280,
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_aeon1a.gif': 3420,
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_audrey1.gif': 2280,
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_audrey2.gif': 3420,
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_audrybt1.gif': 12540,
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_bksomels.gif': 4560,
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_bttf206.gif': 10260,
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_bwolen.gif': 4560,
   'hdfs://ip-10-237-187-224:9000/imgs/newtest_cfb.gif': 1140,
   'hdfs://ip-10-237

In [10]:
print("Top 5 perceptive hash matched image keys generated with stats batch t1 and candidate batch c1\n\n")
phash_matches = sc.pickleFile('hdfs:///t1/candidates/c1/phash_matches_key_counts').take(5)
pprint(phash_matches)

Top 5 perceptive hash matched image keys generated with stats batch t1 and candidate batch c1


[('hdfs://ip-10-237-187-224:9000/fuzzy/newtest_tress-photo-2.gif',
  ('hdfs://ip-10-237-187-224:9000/imgs/newtest_tress-photo-2.gif', 4500),
  {'hdfs://ip-10-237-187-224:9000/imgs/newtest_tress-photo-2.gif': 4500}),
 ('hdfs://ip-10-237-187-224:9000/fuzzy/newtest_uprooted-tree.gif',
  ('hdfs://ip-10-237-187-224:9000/imgs/newtest_uprooted-tree.gif', 1734),
  {'hdfs://ip-10-237-187-224:9000/imgs/newtest_uprooted-tree.gif': 1734}),
 ('hdfs://ip-10-237-187-224:9000/fuzzy/newtest_rehg-thanksgiving-1994.gif',
  ('hdfs://ip-10-237-187-224:9000/imgs/newtest_rehg-thanksgiving-1994.gif',
   17340),
  {'hdfs://ip-10-237-187-224:9000/imgs/newtest_rehg-thanksgiving-1994.gif': 17340}),
 ('hdfs://ip-10-237-187-224:9000/fuzzy/newtest_tress-photo.gif',
  ('hdfs://ip-10-237-187-224:9000/imgs/newtest_tress-photo.gif', 9936),
  {'hdfs://ip-10-237-187-224:9000/imgs/newtest_tress-photo.gif': 9936}),
 ('hdfs://ip-1