In [1]:
import glob
import os
import time
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tensorflow as tf
import argparse
import sys
import time

### Code Snippets from TensorFlow for Poets instructions: 
(https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2).

In [None]:
def load_graph(model_file):
    """Deserialize a frozen TensorFlow graph from disk.

    Args:
        model_file: Path to a binary, serialized ``GraphDef`` file.

    Returns:
        A new ``tf.Graph`` into which the parsed graph definition has been
        imported via ``tf.import_graph_def``.
    """
    # Read the serialized protobuf bytes first, then parse them.
    with open(model_file, "rb") as model_fh:
        serialized = model_fh.read()

    parsed_def = tf.GraphDef()
    parsed_def.ParseFromString(serialized)

    # Import into a dedicated graph so the global default graph is untouched.
    loaded = tf.Graph()
    with loaded.as_default():
        tf.import_graph_def(parsed_def)
    return loaded

In [None]:
def read_tensor_from_image_file(file_name, input_height=299, input_width=299, input_mean=0, input_std=255):
    """Load one image file and return it as a normalized float32 batch of size 1.

    The decoder is picked from the file extension (matched case-insensitively):
    ``.png``, ``.gif`` and ``.bmp`` use their dedicated decoders; anything else
    is decoded as JPEG. The decoded image is cast to float32, given a leading
    batch dimension, bilinearly resized, and normalized as
    ``(pixel - input_mean) / input_std``.

    Args:
        file_name: Path of the image file to read.
        input_height: Target height passed to the bilinear resize.
        input_width: Target width passed to the bilinear resize.
        input_mean: Value subtracted from every pixel.
        input_std: Value every pixel is divided by after mean subtraction.

    Returns:
        The evaluated tensor as returned by ``Session.run``. For PNG and JPEG
        inputs (decoded with ``channels=3``) the shape is
        ``(1, input_height, input_width, 3)``.
    """
    input_name = "file_reader"
    lowered_name = file_name.lower()

    # Build the preprocessing ops in a private, throwaway graph. Adding them to
    # the global default graph on every call makes that graph grow without
    # bound when this function is called once per image.
    with tf.Graph().as_default() as preprocess_graph:
        file_reader = tf.read_file(file_name, input_name)
        if lowered_name.endswith(".png"):
            image_reader = tf.image.decode_png(file_reader, channels=3,
                                               name='png_reader')
        elif lowered_name.endswith(".gif"):
            image_reader = tf.squeeze(tf.image.decode_gif(file_reader,
                                                          name='gif_reader'))
        elif lowered_name.endswith(".bmp"):
            image_reader = tf.image.decode_bmp(file_reader, name='bmp_reader')
        else:
            image_reader = tf.image.decode_jpeg(file_reader, channels=3,
                                                name='jpeg_reader')
        float_caster = tf.cast(image_reader, tf.float32)
        dims_expander = tf.expand_dims(float_caster, 0)
        resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
        normalized = tf.divide(tf.subtract(resized, [input_mean]), [input_std])

    # The context manager closes the session (and frees its resources) even if
    # decoding fails.
    with tf.Session(graph=preprocess_graph) as sess:
        return sess.run(normalized)

In [None]:
def load_labels(label_file):
    """Read class labels from a text file, one label per line.

    Args:
        label_file: Path to the labels file, opened through ``tf.gfile.GFile``.

    Returns:
        A list of strings, one per line of the file, with trailing whitespace
        (including the newline) stripped.
    """
    # Use a context manager so the file handle is closed once the lines are read.
    with tf.gfile.GFile(label_file) as label_fh:
        return [line.rstrip() for line in label_fh.readlines()]

### Process each image through the trained model to predict a classification. 

The following cells first create a new dictionary object to save classifications to. Next, a loop iterates over each filename in the subsequent 'test_files' folder (change this part of the path to the eventual correct place). 

Next it loads the graph.pb file we built while training the model with inception V3. 

We then predict each file's class and add the count to the dictionary created first. 

In [2]:
# Maps each image's parent folder name to a {label: count} dict (filled in below).
test_dict = {}

In [3]:
# Class names read from the retrained model's labels file, one per line.
labels = load_labels("tf_files/retrained_labels.txt")

In [4]:
# Seed a zeroed per-label counter for every folder that contains at least one image.
for im in glob.glob('tf_files/test_files2/*/*'):
    # Use os.path rather than splitting on '/' so the parent folder name is
    # extracted correctly regardless of the platform's path separator.
    test_dict[os.path.basename(os.path.dirname(im))] = {label: 0 for label in labels}

In [22]:
# orient='index' turns the outer keys of test_dict (folder names) into rows and
# the inner keys (labels) into columns; every cell starts at 0.
image_classification_df_1 = pd.DataFrame.from_dict(test_dict, orient='index')

Trial with the `tf.Session` created once, outside of the for loop:

In [24]:
# Load the retrained model and its labels once, up front.
graph = load_graph("tf_files/retrained_graph.pb")
labels = load_labels("tf_files/retrained_labels.txt")

# Preprocessing parameters handed to read_tensor_from_image_file.
input_height = 224
input_width = 224
input_mean = 128
input_std = 128

# Operation names are looked up under the "import/" scope of the loaded graph.
input_layer = "input"
output_layer = "final_result"
input_name = "import/" + input_layer
output_name = "import/" + output_layer

# A single session is created here and reused for every image in the loop.
sess = tf.Session(graph=graph)
input_operation = graph.get_operation_by_name(input_name)
output_operation = graph.get_operation_by_name(output_name)

for im in glob.glob('tf_files/test_files2/*/*'):
    start = time.time()
    try:
        t = read_tensor_from_image_file(im,
                                        input_height=input_height,
                                        input_width=input_width,
                                        input_mean=input_mean,
                                        input_std=input_std)

        results = sess.run(output_operation.outputs[0],
                           {input_operation.outputs[0]: t})
        results = np.squeeze(results)

        # Index of the highest-scoring class.
        i = int(np.argmax(results))

        # Tally the prediction in the row of the image's parent folder.
        user_folder = os.path.basename(os.path.dirname(im))
        image_classification_df_1.loc[user_folder, labels[i]] += 1

    except Exception as exc:
        # Best effort: keep processing the remaining images, but report what
        # was skipped instead of hiding the failure. Catching Exception (not a
        # bare except) lets KeyboardInterrupt stop a long run.
        print('Skipped {}: {!r}'.format(im, exc))
    end = time.time()
    print('Evaluation time (1-image): {:.3f}s'.format(end - start))

Evaluation time (1-image): 0.796s
Evaluation time (1-image): 0.296s
Evaluation time (1-image): 0.471s
Evaluation time (1-image): 0.308s
Evaluation time (1-image): 0.290s
Evaluation time (1-image): 0.297s
Evaluation time (1-image): 0.430s
Evaluation time (1-image): 0.321s
Evaluation time (1-image): 0.296s
Evaluation time (1-image): 0.313s
Evaluation time (1-image): 0.296s
Evaluation time (1-image): 0.288s
Evaluation time (1-image): 0.296s
Evaluation time (1-image): 0.290s
Evaluation time (1-image): 0.300s
Evaluation time (1-image): 0.333s
Evaluation time (1-image): 0.459s
Evaluation time (1-image): 0.305s
Evaluation time (1-image): 0.296s
Evaluation time (1-image): 0.319s
Evaluation time (1-image): 0.423s
Evaluation time (1-image): 0.378s
Evaluation time (1-image): 0.299s
Evaluation time (1-image): 0.314s
Evaluation time (1-image): 0.307s
Evaluation time (1-image): 0.292s
Evaluation time (1-image): 0.295s
Evaluation time (1-image): 0.293s
Evaluation time (1-image): 0.299s
Evaluation tim

As a checkpoint in this long routine, I save the classification counts to a .csv file. If the following processes need to be run again, I can start here by simply loading the csv rather than reprocessing the image files.

In [None]:
# Checkpoint the classification counts so later steps can reload them from disk
# instead of re-running inference.
# NOTE(review): the frame built in this notebook is image_classification_df_1;
# the name image_classification_df is never defined, so it raised NameError on a
# fresh kernel. The output path is kept unchanged — confirm it matches what the
# loading cells expect.
image_classification_df_1.to_csv('image_classification_df.csv')

### Cosine Similarity

Finally, we compute the cosine similarity between every pair of users, based on the classification counts of their photos. 

Due to computational time, I was forced to break up each classification into bins of 2000. The following cells append each group of 2000 rows into a single dataframe. 

In [28]:
# Load classification checkpoints for batches 1-10. The file name differs only
# by the batch number, so it is built from a single template instead of ten
# copy-pasted calls. The first CSV column is used as the index.
df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10 = [
    pd.read_csv('../2000/image_csv/image_classification_df_{}.csv'.format(batch), index_col=[0])
    for batch in range(1, 11)
]

In [38]:
# Load classification checkpoints for batches 11-20. The file name differs only
# by the batch number, so it is built from a single template instead of ten
# copy-pasted calls. The first CSV column is used as the index.
df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18, df_19, df_20 = [
    pd.read_csv('../2000/image_csv/image_classification_df_{}.csv'.format(batch), index_col=[0])
    for batch in range(11, 21)
]

In [39]:
df_list = [df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10, df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18, df_19, df_20]
# Stack all batches row-wise in a single pass. Growing a frame with
# DataFrame.append inside a loop copies the data on every iteration, and
# DataFrame.append was removed in pandas 2.0.
df_final = pd.concat(df_list)

In [40]:
# Summary statistics per label column, as a sanity check on the combined frame.
df_final.describe()

Unnamed: 0,beauty,group,jeans,abs,architecture,baby,bag,bikini,car,close,...,menswear,ocean,outfit,pet,plant,selfie,shoes,sun,text,wedding
count,1948.0,1948.0,1948.0,1948.0,1948.0,1948.0,1948.0,1948.0,1948.0,1948.0,...,1948.0,1948.0,1948.0,1948.0,1948.0,1948.0,1948.0,1948.0,1948.0,1948.0
mean,0.433265,0.583676,0.51848,0.115503,0.850103,0.942505,0.14117,0.682752,0.185832,0.800821,...,0.948665,0.561088,1.856776,0.531828,0.532341,0.501027,0.191992,0.079055,1.255647,0.219713
std,1.594856,1.056533,1.010559,0.522239,1.778464,1.60052,0.451089,1.466969,0.73322,1.558435,...,1.854888,1.424658,2.479401,1.94757,1.047158,1.074282,0.517885,0.361064,2.695341,0.64737
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
max,20.0,10.0,11.0,11.0,15.0,20.0,6.0,15.0,14.0,19.0,...,17.0,12.0,16.0,20.0,11.0,17.0,5.0,6.0,20.0,9.0


In [41]:
# Pairwise cosine similarity between the rows of df_final, labelled on both
# axes with df_final's index so each cell can be looked up by a pair of row ids.
cos_sim_images = pd.DataFrame(
    data=cosine_similarity(df_final, dense_output=False),
    index=df_final.index,
    columns=df_final.index,
)

In [42]:
# Preview the first rows of the similarity matrix.
cos_sim_images.head()

Unnamed: 0,5592,56040,56085,5610,5620,56207,56221,5627,56304,5658,...,67893,67919,68023,68071,6816,68289,68379,68395,68433,6846
5592,1.0,0.553912,0.37037,0.202113,0.368932,0.060193,0.398962,0.132099,0.596913,0.255155,...,0.226805,0.153522,0.411302,0.160817,0.150616,0.600099,0.309142,0.161416,0.300557,0.174603
56040,0.553912,1.0,0.287213,0.134343,0.506024,0.200049,0.44198,0.125436,0.475,0.226134,...,0.276385,0.048593,0.403075,0.257337,0.166856,0.423057,0.402911,0.22759,0.399556,0.070338
56085,0.37037,0.287213,1.0,0.754555,0.175682,0.42135,0.356966,0.207584,0.277438,0.170103,...,0.181444,0.690849,0.110735,0.178685,0.365782,0.272772,0.181848,0.278809,0.525974,0.674603
5610,0.202113,0.134343,0.754555,1.0,0.0,0.394171,0.213896,0.192232,0.012234,0.0,...,0.330049,0.941503,0.126613,0.247024,0.25049,0.052926,0.066157,0.053385,0.75447,0.883523
5620,0.368932,0.506024,0.175682,0.0,1.0,0.114208,0.478091,0.107417,0.271177,0.677772,...,0.193649,0.010403,0.390195,0.203419,0.020412,0.20702,0.327781,0.167054,0.08554,0.037646


Finally, I save this cosine similarity file to a csv, to be joined with NLP data and explicit user input to make a final prediction on similarity. 

In [43]:
# Persist the similarity matrix (index and column labels included) as CSV.
cos_sim_images.to_csv('../2000/2000_cos_sim_images.csv')