In [162]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [163]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import shuffle
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
import os
from functools import reduce
from functions import fetch_image_from_s3_to_array, prepare_image
from pyspark.ml.recommendation import ALSModel
from scipy.spatial.distance import cdist

In [165]:
img_size = 160 # All images will be resized to 160x160
img_shape = (160, 160, 3)

# Create the base model from the pre-trained model MobileNet V2
base_model = MobileNetV2(input_shape=img_shape, include_top=False, weights='imagenet')

global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

neural_network = tf.keras.Sequential([
  base_model,
  global_average_layer,
])

In [None]:
#Load ALS model

In [166]:
#Initialize Pyspark

import pyspark
spark = (pyspark.sql
                .SparkSession
                .builder
                .master("local[*]")
                .getOrCreate())

In [167]:
als_model = ALSModel.load('als')

In [168]:
img_features = als_model.userFactors.toPandas()
hashtag_features = als_model.itemFactors.toPandas()

In [169]:
img_features.head()

Unnamed: 0,id,features
0,2,"[0.4584904909133911, 0.4471224844455719, 0.370..."
1,12,"[0.9679425358772278, 0.5206466913223267, 0.273..."
2,22,"[0.7548763751983643, -0.18160919845104218, 0.1..."
3,32,"[0.7369276881217957, -0.05160527303814888, 0.2..."
4,42,"[0.09432432055473328, 0.04093838483095169, 0.4..."


In [170]:
#Load deep features

In [171]:
deep_features = pd.read_csv('deep_features.csv', )

In [172]:
deep_features.index = deep_features['name']

In [173]:
deep_features.head()

Unnamed: 0_level_0,Unnamed: 0,deep_features,hashtag,name,pic
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
91b02603-8267-47fa-acd8-af383f865ee9.jpg,0,[0. 0. 0.4694226 ... 2.7348056 0...,babies,91b02603-8267-47fa-acd8-af383f865ee9.jpg,tf.Tensor(\n[[[ 0.5862745 0.5392157 0.4294...
cdc2e321-8b86-426b-bcf6-ec55f94e3642.jpg,1,[0.22494766 0.07435133 2.8670275 ... 0. ...,babies,cdc2e321-8b86-426b-bcf6-ec55f94e3642.jpg,tf.Tensor(\n[[[-0.25686273 -0.36666664 -0.4686...
b3c53d46-1c6f-4a59-aa23-1227058757cf.jpg,2,[0.03928627 0. 0. ... 0.511356...,babies,b3c53d46-1c6f-4a59-aa23-1227058757cf.jpg,tf.Tensor(\n[[[-0.17058823 -0.48039216 -0.6117...
376d45c7-2a68-4f0d-9b4f-2436d396b35f.jpg,3,[1.525151 0.1844926 0.01535643 ... 0.085905...,babies,376d45c7-2a68-4f0d-9b4f-2436d396b35f.jpg,tf.Tensor(\n[[[-0.83137256 -0.83137256 -0.8313...
bbaae8bd-96e1-4c9a-8d52-6d260ff67645.jpg,4,[0. 0. 0.00435753 ... 0.917396...,animals,bbaae8bd-96e1-4c9a-8d52-6d260ff67645.jpg,tf.Tensor(\n[[[ 2.00000048e-01 5.29412329e-02...


In [75]:
#Extract recs dataframe
recs = pd.read_pickle('recs.pkl')

In [76]:
recs.head()

Unnamed: 0_level_0,recommendations,recommended_hashtags,hashtags,image_local_name,search_hashtag
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1580,"[(11307, 1.089327335357666), (2452, 1.07650852...","[#jdm, #car, #cars, #carporn, #carlifestyle, #...","[#gt, #toyota, #ej, #r, #wrx, #impreza, #bmw, ...",98b75f05-52d5-460d-9580-f95f0f86f353.jpg,cars
1591,"[(3212, 0.7940346598625183), (2452, 0.66501080...","[#cars, #car, #auto, #bmw, #audi, #toyota, #me...","[#bpautomobilemachtsm, #bpautomobile, #weilwir...",ba04590d-a8f1-495b-aa9e-b3cc575048fc.jpg,cars
463,"[(4811, 0.959574818611145), (8661, 0.927076160...","[#follow, #like, #selfie, #instagood, #food, #...","[#instagood, #instadaily, #koreanfood, #selca,...",e4ec8a33-df23-4d5e-b8b0-52168ba37a06.jpg,selfie
833,"[(2304, 0.918540358543396), (6029, 0.788918375...","[#art, #foodie, #architecture, #summer, #photo...","[#arte, #interiordesign, #archilovers, #argent...",535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,travel
1645,"[(3212, 0.719140350818634), (2452, 0.540755271...","[#cars, #car, #auto, #bmw, #toyota, #jdm, #mer...","[#toyota, #minicarros, #vendas, #vendasonline,...",3cd627fc-a80d-4ed0-86c9-33f5c80f562d.jpg,cars


In [77]:
#add deep features information to recs dataframe
recs_deep = recs.join(deep_features, on='image_local_name', how='inner')

In [78]:
recs_deep.head()

Unnamed: 0_level_0,recommendations,recommended_hashtags,hashtags,image_local_name,search_hashtag,Unnamed: 0,deep_features,hashtag,name,pic
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
833,"[(2304, 0.918540358543396), (6029, 0.788918375...","[#art, #foodie, #architecture, #summer, #photo...","[#arte, #interiordesign, #archilovers, #argent...",535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,travel,985,[0.40183023 0. 0.02886293 ... 0.790806...,travel,535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,tf.Tensor(\n[[[0.99215686 0.99215686 0.9921568...
1829,"[(4068, 0.7660794258117676), (6029, 0.61289584...","[#food, #foodie, #instafood, #foodporn, #yummy...","[#bhcidadecriativadagastronomia, #prato, #come...",56018a9a-8def-48d1-9a7d-03136314b0d9.jpg,food,31,[0. 0. 0. ... 0. 0...,food,56018a9a-8def-48d1-9a7d-03136314b0d9.jpg,tf.Tensor(\n[[[ 0.7882353 -0.19215685 -0.4274...
1959,"[(745, 1.0295597314834595), (6029, 1.029177784...","[#foodporn, #foodie, #instafood, #foodstagram,...","[#hagerstownmd, #washingtoncountymd, #hswhmdbu...",2b98b605-8df1-4fcb-addf-887751caaaaa.jpg,food,797,[0.255129 0.6026034 0. ... 1.761153...,food,2b98b605-8df1-4fcb-addf-887751caaaaa.jpg,tf.Tensor(\n[[[-1. -0.99215686 -1. ...
2659,"[(2771, 0.8910551071166992), (7418, 0.86865395...","[#pets, #animals, #animal, #pet, #dogstagram, ...","[#sportdogminsk, #dogminsk, #labrador, #labrad...",c62f34b0-da93-485c-94a0-d56be4b72018.jpg,animals,1717,[0. 1.7648932 0. ... 1.359356...,animals,c62f34b0-da93-485c-94a0-d56be4b72018.jpg,tf.Tensor(\n[[[ 0.07843143 0.5568628 0.9803...
1990,"[(4811, 0.9960041046142578), (8661, 0.98882216...","[#follow, #like, #followme, #instadaily, #inst...","[#follow, #photooftheday, #love, #instalike, #...",7263d6f8-422f-4484-a306-cf9984bfa8bc.jpg,food,611,[0. 0. 0.2469749 ... 0. 0...,food,7263d6f8-422f-4484-a306-cf9984bfa8bc.jpg,tf.Tensor(\n[[[-0.9079044 -0.9157475 -0.9471...


In [79]:
#only use certain columns
recs_deep_clean = recs_deep[['image_local_name', 'hashtags', 'deep_features']]

In [81]:
img_features.index = img_features['id']

In [82]:
img_features.drop(['id'], axis=1)

Unnamed: 0_level_0,features
id,Unnamed: 1_level_1
2,"[0.4584904909133911, 0.4471224844455719, 0.370..."
12,"[0.9679425358772278, 0.5206466913223267, 0.273..."
22,"[0.7548763751983643, -0.18160919845104218, 0.1..."
32,"[0.7369276881217957, -0.05160527303814888, 0.2..."
42,"[0.09432432055473328, 0.04093838483095169, 0.4..."
52,"[0.8289591073989868, 0.2017640918493271, -0.17..."
62,"[1.0785496234893799, -0.30856823921203613, -0...."
72,"[0.8945685029029846, -0.4905155301094055, -0.0..."
82,"[0.6198602914810181, 0.38461706042289734, 0.18..."
92,"[0.8192901015281677, -0.25275376439094543, 0.3..."


In [84]:
#add image feature into dataframe

recommender_df = recs_deep_clean.join(img_features, how='inner')

In [85]:
from scipy.spatial.distance import cosine

In [87]:
#code used for testing purposes

from imp import reload
import functions
reload(functions)
prepare_image = functions.prepare_image

In [129]:
#function that finds k nearest neighbors by cosine similarity

def find_neighbor_vectors(image_path, k=5, recommender_df=recommender_df):
    """Find image features (user vectors) for similar images."""
    prep_image = functions.prepare_image(image_path, where='local')
    deep_features = functions.extract_features_for_one_image(prep_image, neural_network)
    print(deep_features)
    rdf = recommender_df.copy()
    rdf['dist'] = rdf['deep_features'].apply(lambda x: cosine(x, deep_features))
    rdf = rdf.sort_values(by='dist')
    return rdf.head(k)['features']

def equal_len(a, b): #a, b: lists
    '''Fill with 0 the vector that has smaller length'''
    while len(a) != len(b):
        if len(a) > len(b):
            b.append(0)
        else:
            a.append(0)
    return a, b
    

In [176]:
#test the function on a local image

find_neighbor_vectors('/Users/Anna/Documents/pictures/alpujarra/portada.jpg', k=5, recommender_df=recommender_df)

[1.0703073  0.08364    0.2391143  ... 0.1678103  0.00501854 2.0710332 ]


TypeError: ufunc 'multiply' did not contain a loop with signature matching types dtype('<U71') dtype('<U71') dtype('<U71')

### The error seems to come from the fact that x is a list wrapped in a string

### Next step to do: after identifying 5 nearest neighbors, look at ALS's image features (user features), get the average for 5. Then compute dot product with every hashtag features vector (item features). Take hashtags woth the highest reasult as recommendations.