In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import shuffle
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
import os
from functools import reduce
from functions import fetch_image_from_s3_to_array, prepare_image
from pyspark.ml.recommendation import ALSModel
from scipy.spatial.distance import cdist

In [3]:
img_size = 160 # All images will be resized to 160x160
# Derive the input shape from img_size so the two settings cannot drift apart.
img_shape = (img_size, img_size, 3)

# Create the base model from the pre-trained model MobileNet V2
# (include_top=False leaves out the ImageNet classification head)
base_model = MobileNetV2(input_shape=img_shape, include_top=False, weights='imagenet')

# Average each feature map over its spatial dimensions, giving one flat vector per image
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

# Feature extractor: image batch -> pooled MobileNetV2 features
neural_network = tf.keras.Sequential([
  base_model,
  global_average_layer,
])

In [4]:
#Load ALS model

In [5]:
# Start (or reuse) a Spark session running locally; "local[*]" uses all available cores

import pyspark
spark = pyspark.sql.SparkSession.builder.master("local[*]").getOrCreate()

In [6]:
als_model = ALSModel.load('als')

In [7]:
img_features = als_model.userFactors.toPandas()
hashtag_features = als_model.itemFactors.toPandas()

In [8]:
img_features.head()

Unnamed: 0,id,features
0,2,"[0.4584904909133911, 0.4471224844455719, 0.370..."
1,12,"[0.9679425358772278, 0.5206466913223267, 0.273..."
2,22,"[0.7548763751983643, -0.18160919845104218, 0.1..."
3,32,"[0.7369276881217957, -0.05160527303814888, 0.2..."
4,42,"[0.09432432055473328, 0.04093838483095169, 0.4..."


In [9]:
#Load deep features

In [51]:
#deep_features = pd.read_csv('deep_features.csv', )
# Load the stored deep-feature table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source.
deep_features = pd.read_pickle("df_deep_features.pkl")

In [52]:
deep_features.index = deep_features['name']

In [53]:
deep_features.head()

Unnamed: 0_level_0,deep_features,hashtag,name,pic
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c408b61a-7986-4cfb-878b-4fc9dda16108.jpg,"[2.2193785, 0.1846841, 0.9348104, 0.015347958,...",babies,c408b61a-7986-4cfb-878b-4fc9dda16108.jpg,"(((tf.Tensor(0.70392156, shape=(), dtype=float..."
b6316377-299c-4f5a-b17e-c7879dce5220.jpg,"[0.49179092, 0.0, 0.7123833, 0.0, 0.25765607, ...",travel,b6316377-299c-4f5a-b17e-c7879dce5220.jpg,"(((tf.Tensor(-0.37843138, shape=(), dtype=floa..."
6bd88a68-95ba-41a0-9f92-b0ecc8d27b26.jpg,"[0.0, 0.6780646, 0.0, 0.0, 1.1511155, 0.443687...",food,6bd88a68-95ba-41a0-9f92-b0ecc8d27b26.jpg,"(((tf.Tensor(0.28627455, shape=(), dtype=float..."
e41dda23-d612-42a2-937b-e5520e1aeacf.jpg,"[0.0, 1.6623932, 0.2608316, 0.38453564, 0.4322...",travel,e41dda23-d612-42a2-937b-e5520e1aeacf.jpg,"(((tf.Tensor(0.20133343, shape=(), dtype=float..."
8b1dda25-42df-4129-a053-f6eabeff6202.jpg,"[0.64541465, 1.4916105, 0.28248668, 0.0, 0.0, ...",animals,8b1dda25-42df-4129-a053-f6eabeff6202.jpg,"(((tf.Tensor(-0.42745095, shape=(), dtype=floa..."


In [55]:
deep_features.iloc[0, 0]

array([2.2193785 , 0.1846841 , 0.9348104 , ..., 0.12125196, 0.64810914,
       0.        ], dtype=float32)

In [56]:
#Extract recs dataframe
recs = pd.read_pickle('recs.pkl')

In [57]:
recs.head()

Unnamed: 0_level_0,recommendations,recommended_hashtags,hashtags,image_local_name,search_hashtag
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1580,"[(11307, 1.089327335357666), (2452, 1.07650852...","[#jdm, #car, #cars, #carporn, #carlifestyle, #...","[#gt, #toyota, #ej, #r, #wrx, #impreza, #bmw, ...",98b75f05-52d5-460d-9580-f95f0f86f353.jpg,cars
1591,"[(3212, 0.7940346598625183), (2452, 0.66501080...","[#cars, #car, #auto, #bmw, #audi, #toyota, #me...","[#bpautomobilemachtsm, #bpautomobile, #weilwir...",ba04590d-a8f1-495b-aa9e-b3cc575048fc.jpg,cars
463,"[(4811, 0.959574818611145), (8661, 0.927076160...","[#follow, #like, #selfie, #instagood, #food, #...","[#instagood, #instadaily, #koreanfood, #selca,...",e4ec8a33-df23-4d5e-b8b0-52168ba37a06.jpg,selfie
833,"[(2304, 0.918540358543396), (6029, 0.788918375...","[#art, #foodie, #architecture, #summer, #photo...","[#arte, #interiordesign, #archilovers, #argent...",535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,travel
1645,"[(3212, 0.719140350818634), (2452, 0.540755271...","[#cars, #car, #auto, #bmw, #toyota, #jdm, #mer...","[#toyota, #minicarros, #vendas, #vendasonline,...",3cd627fc-a80d-4ed0-86c9-33f5c80f562d.jpg,cars


In [58]:
#add deep features information to recs dataframe
recs_deep = recs.join(deep_features, on='image_local_name', how='inner')

In [59]:
recs_deep.head()

Unnamed: 0_level_0,recommendations,recommended_hashtags,hashtags,image_local_name,search_hashtag,deep_features,hashtag,name,pic
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
833,"[(2304, 0.918540358543396), (6029, 0.788918375...","[#art, #foodie, #architecture, #summer, #photo...","[#arte, #interiordesign, #archilovers, #argent...",535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,travel,"[0.40183023, 0.0, 0.028862935, 0.0, 0.13101277...",travel,535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,"(((tf.Tensor(0.99215686, shape=(), dtype=float..."
1829,"[(4068, 0.7660794258117676), (6029, 0.61289584...","[#food, #foodie, #instafood, #foodporn, #yummy...","[#bhcidadecriativadagastronomia, #prato, #come...",56018a9a-8def-48d1-9a7d-03136314b0d9.jpg,food,"[0.0, 0.0, 0.0, 2.0659456, 0.060433093, 0.0328...",food,56018a9a-8def-48d1-9a7d-03136314b0d9.jpg,"(((tf.Tensor(0.7882353, shape=(), dtype=float3..."
1959,"[(745, 1.0295597314834595), (6029, 1.029177784...","[#foodporn, #foodie, #instafood, #foodstagram,...","[#hagerstownmd, #washingtoncountymd, #hswhmdbu...",2b98b605-8df1-4fcb-addf-887751caaaaa.jpg,food,"[0.255129, 0.6026034, 0.0, 0.0, 0.2649025, 0.1...",food,2b98b605-8df1-4fcb-addf-887751caaaaa.jpg,"(((tf.Tensor(-1.0, shape=(), dtype=float32), t..."
2659,"[(2771, 0.8910551071166992), (7418, 0.86865395...","[#pets, #animals, #animal, #pet, #dogstagram, ...","[#sportdogminsk, #dogminsk, #labrador, #labrad...",c62f34b0-da93-485c-94a0-d56be4b72018.jpg,animals,"[0.0, 1.7648932, 0.0, 0.014228115, 0.11418705,...",animals,c62f34b0-da93-485c-94a0-d56be4b72018.jpg,"(((tf.Tensor(0.07843143, shape=(), dtype=float..."
1990,"[(4811, 0.9960041046142578), (8661, 0.98882216...","[#follow, #like, #followme, #instadaily, #inst...","[#follow, #photooftheday, #love, #instalike, #...",7263d6f8-422f-4484-a306-cf9984bfa8bc.jpg,food,"[0.0, 0.0, 0.2469749, 0.0, 0.0, 0.025176544, 0...",food,7263d6f8-422f-4484-a306-cf9984bfa8bc.jpg,"(((tf.Tensor(-0.9079044, shape=(), dtype=float..."


In [60]:
#only use certain columns
recs_deep_clean = recs_deep[['image_local_name', 'hashtags', 'deep_features']]

In [61]:
img_features.index = img_features['id']

In [62]:
# NOTE: the result is only displayed, not assigned back - img_features itself keeps its 'id' column.
img_features.drop(['id'], axis=1)

Unnamed: 0_level_0,features
id,Unnamed: 1_level_1
2,"[0.4584904909133911, 0.4471224844455719, 0.370..."
12,"[0.9679425358772278, 0.5206466913223267, 0.273..."
22,"[0.7548763751983643, -0.18160919845104218, 0.1..."
32,"[0.7369276881217957, -0.05160527303814888, 0.2..."
42,"[0.09432432055473328, 0.04093838483095169, 0.4..."
52,"[0.8289591073989868, 0.2017640918493271, -0.17..."
62,"[1.0785496234893799, -0.30856823921203613, -0...."
72,"[0.8945685029029846, -0.4905155301094055, -0.0..."
82,"[0.6198602914810181, 0.38461706042289734, 0.18..."
92,"[0.8192901015281677, -0.25275376439094543, 0.3..."


In [63]:
#add image feature into dataframe

recommender_df = recs_deep_clean.join(img_features, how='inner')

In [64]:
recommender_df.head()

Unnamed: 0,image_local_name,hashtags,deep_features,id,features
833,535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,"[#arte, #interiordesign, #archilovers, #argent...","[0.40183023, 0.0, 0.028862935, 0.0, 0.13101277...",833,"[-0.334102600812912, 0.3247702717781067, -0.35..."
1829,56018a9a-8def-48d1-9a7d-03136314b0d9.jpg,"[#bhcidadecriativadagastronomia, #prato, #come...","[0.0, 0.0, 0.0, 2.0659456, 0.060433093, 0.0328...",1829,"[-0.10602129250764847, 0.004272814840078354, -..."
1959,2b98b605-8df1-4fcb-addf-887751caaaaa.jpg,"[#hagerstownmd, #washingtoncountymd, #hswhmdbu...","[0.255129, 0.6026034, 0.0, 0.0, 0.2649025, 0.1...",1959,"[-0.2808643877506256, 0.5266156792640686, -0.0..."
2659,c62f34b0-da93-485c-94a0-d56be4b72018.jpg,"[#sportdogminsk, #dogminsk, #labrador, #labrad...","[0.0, 1.7648932, 0.0, 0.014228115, 0.11418705,...",2659,"[0.02834787406027317, -0.5530852675437927, -0...."
1990,7263d6f8-422f-4484-a306-cf9984bfa8bc.jpg,"[#follow, #photooftheday, #love, #instalike, #...","[0.0, 0.0, 0.2469749, 0.0, 0.0, 0.025176544, 0...",1990,"[-0.033458560705184937, -0.372518926858902, 0...."


In [65]:
from scipy.spatial.distance import cosine

In [66]:
#code used for testing purposes

# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the supported equivalent.
from importlib import reload
import functions
reload(functions)
# Rebind the name so it points at the freshly reloaded implementation.
prepare_image = functions.prepare_image

In [71]:
#function that finds k nearest neighbors by cosine similarity

def find_neighbor_vectors(image_path, k=5, recommender_df=recommender_df):
    """Return the k rows of ``recommender_df`` closest to the given image.

    The image at ``image_path`` is prepared with ``functions.prepare_image``
    (``where='local'``) and turned into a deep-feature vector by
    ``functions.extract_features_for_one_image`` using ``neural_network``.
    Every row's ``deep_features`` entry is then compared to that vector with
    scipy's cosine distance.

    Parameters
    ----------
    image_path : path of the query image, passed to ``functions.prepare_image``.
    k : number of nearest rows to return (default 5).
    recommender_df : DataFrame with a ``deep_features`` column; it is not
        modified.

    Returns
    -------
    DataFrame
        A copy of the ``k`` rows with the smallest distance, sorted ascending,
        with the distance stored in an extra ``dist`` column.
    """
    query_image = functions.prepare_image(image_path, where='local')
    query_vector = functions.extract_features_for_one_image(query_image, neural_network)

    def distance_to_query(stored_vector):
        # cosine() is a distance, so smaller means more similar
        return cosine(stored_vector, query_vector)

    ranked = (
        recommender_df
        .assign(dist=recommender_df['deep_features'].apply(distance_to_query))
        .sort_values(by='dist')
    )
    return ranked.head(k)

def equal_len(a, b): #a, b: lists
    '''Zero-pad the shorter of two lists until both have the same length.

    The padding is appended in place, so the caller's list is modified.
    Returns the same two list objects as the tuple (a, b).
    '''
    gap = len(a) - len(b)
    if gap > 0:
        # a is longer: pad b
        b.extend([0] * gap)
    elif gap < 0:
        # b is longer: pad a
        a.extend([0] * (-gap))
    return a, b
    

In [130]:
#test the function on a local image

# Resolve the image relative to the current user's home directory instead of
# hard-coding one machine's absolute path.
test_image_path = os.path.expanduser('~/Downloads/test_wedding.jpg')
fnv = find_neighbor_vectors(test_image_path,
                k=5, recommender_df=recommender_df)

In [131]:
fnv

Unnamed: 0,image_local_name,hashtags,deep_features,id,features,dist
763,edf9daee-f603-46b5-bf4a-ae9974bb89ad.jpg,"[#wanderlust, #toddlers, #woods, #forest, #nc,...","[0.74691576, 1.2644926, 0.0, 0.23730333, 2.684...",763,"[0.07949178665876389, 0.34489336609840393, -0....",0.420474
882,c3b7232e-ea7c-4766-91e3-99207f7d8d8b.jpg,"[#photooftheday, #hasekio, #bs, #travel, #phot...","[0.78267944, 1.8151414, 0.3264247, 0.43329898,...",882,"[0.14343763887882233, 0.0062960293143987656, -...",0.437234
682,e1626e8a-eaa8-4812-917b-4a4aeadeb10f.jpg,"[#follow, #repost, #photooftheday, #love, #fas...","[0.17159182, 0.45351338, 0.02343401, 0.1585963...",682,"[0.1731979250907898, 0.0020151834469288588, 0....",0.456072
773,72569f9f-ebea-4b5a-b238-c2b15e9b288d.jpg,"[#musician, #newmusic, #producer, #futuremusic...","[0.0191253, 1.6645195, 0.0, 0.0, 2.546588, 0.0...",773,"[0.345413476228714, 0.31353116035461426, 0.196...",0.461755
737,f2a6f5f1-64c2-4ee3-9609-d409e28fa816.jpg,"[#honolulu, #hawaii, #dream, #travel, #hoomalu...","[0.13907264, 0.9570855, 0.052211456, 0.0, 1.52...",737,"[0.3647169768810272, 0.1129966750741005, -0.06...",0.470721


### Next step to do: after identifying the 5 nearest neighbors, look at ALS's image features (user features) and take their average over the 5 neighbors. Then compute the dot product of that average with every hashtag feature vector (item features). Take the hashtags with the highest results as recommendations.

In [132]:
#check length of the user features
# Select the column by name: a positional column index would silently point at
# a different column if the frame's layout changed.
len(fnv['features'].iloc[3])

10

## Find the average of the 5 user features found based on cosine similarity.

In [133]:
# collect the neighbours' feature vectors from the dataframe into a plain list

features = list(fnv.features.values)

In [134]:
avg_features = np.mean(np.asarray(features), axis=0)

## Find the dot product with each hashtag's feature vector

In [135]:
# take a look at the hashtag features extracted from ALS model

hashtag_features.head()

Unnamed: 0,id,features,dot_product
0,1,"[0.12350701540708542, 0.2328486442565918, -0.0...",0.080287
1,11,"[-0.05738811939954758, -0.0456802099943161, -0...",-0.021376
2,21,"[0.12918414175510406, -0.013316345401108265, -...",0.055002
3,31,"[0.059465594589710236, 0.01814868673682213, 0....",0.008303
4,41,"[-0.01782803051173687, 0.0011641709133982658, ...",-0.002376


In [136]:
# score every hashtag: dot product of its feature vector with the averaged image (user) features,
# stored in a new 'dot_product' column

hashtag_features['dot_product'] = hashtag_features['features'].apply(
    lambda hashtag_vector: np.asarray(hashtag_vector).dot(avg_features)
)

In [137]:
# get 10 ids with the highest dot product

df_ten_highest = hashtag_features.sort_values(by='dot_product', ascending=False).head(10)

In [138]:
rec_hashtag_ids = df_ten_highest.id.values

## Find hashtags that correspond to the ids

In [139]:
# Unpickle hashtag data

hashtags_df = pd.read_pickle("hashtags_df.pkl")

In [140]:
hashtags_df.head()

Unnamed: 0_level_0,hashtag
id,Unnamed: 1_level_1
0,#francecars
1,#AUDIBESTPICTURE
2,#tattoostyle
3,#cakes
4,#hotwheelscollector


In [141]:
# print the hashtag text for each recommended id, in ranking order
for hashtag_id in rec_hashtag_ids:
    id_matches = hashtags_df.index == hashtag_id
    print(hashtags_df.loc[id_matches, 'hashtag'])

id
3066    #steps
Name: hashtag, dtype: object
id
2315    #fresh
Name: hashtag, dtype: object
id
4029    #dreamfood
Name: hashtag, dtype: object
id
5032    #blitzzforza
Name: hashtag, dtype: object
id
7006    #fitnessprofessionals
Name: hashtag, dtype: object
id
6680    #catlady
Name: hashtag, dtype: object
id
519    #lipstick
Name: hashtag, dtype: object
id
5836    #huskyoftheday
Name: hashtag, dtype: object
id
2304    #viajar
Name: hashtag, dtype: object
id
7311    #birdstagram
Name: hashtag, dtype: object
