In [3]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell runs, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
#!pip install 'numpy<1.17'

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import shuffle
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
import os
from functools import reduce
from functions import fetch_image_from_s3_to_array, prepare_image
from pyspark.ml.recommendation import ALSModel
from scipy.spatial.distance import cdist

In [6]:
img_size = 160  # All images will be resized to img_size x img_size (160x160)
# Derived from img_size so the side length lives in exactly one place;
# the trailing 3 is the number of colour channels.
img_shape = (img_size, img_size, 3)

# Create the base model from the pre-trained model MobileNet V2.
# include_top=False drops the ImageNet classification head, leaving only the
# convolutional feature extractor.
base_model = MobileNetV2(input_shape=img_shape, include_top=False, weights='imagenet')

# Averages every feature map over its spatial dimensions so each image ends up
# as one flat feature vector.
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

# Feature extractor used for the "deep features": convolutional base + pooling.
neural_network = tf.keras.Sequential([
    base_model,
    global_average_layer,
])

In [7]:
#Load ALS model

In [8]:
#Initialize Pyspark
# A Spark session is needed before the saved ALS model can be loaded.
# master("local[*]") runs Spark locally using all available cores;
# getOrCreate() reuses a session if this kernel already has one.

import pyspark
spark = (pyspark.sql
                .SparkSession
                .builder
                .master("local[*]")
                .getOrCreate())

In [9]:
# Load the previously saved ALS model from the 'als' path (relative to the
# notebook's working directory).
als_model = ALSModel.load('als')

In [10]:
# Pull both ALS factor tables into pandas. In this notebook the ALS "users"
# are images and the ALS "items" are hashtags.
img_features = als_model.userFactors.toPandas()
hashtag_features = als_model.itemFactors.toPandas()

In [11]:
# Preview the image (user) factors: an `id` column plus one `features` vector per row.
img_features.head()

Unnamed: 0,id,features
0,6,"[-0.18551933765411377, 0.10412514209747314, -0..."
1,16,"[-0.8608502745628357, -0.2844527065753937, -0...."
2,26,"[-0.3587254285812378, 0.09064590185880661, -0...."
3,36,"[-0.13999195396900177, -0.16264992952346802, -..."
4,46,"[-0.04733074828982353, -0.14490081369876862, 0..."


In [12]:
#Load deep features

In [13]:
# Load the pre-computed deep features.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by this project.
deep_features = pd.read_pickle("pkl_files/df_deep_features.pkl")

In [14]:
# Index the frame by image file name while keeping `name` as a regular column.
deep_features = deep_features.set_index('name', drop=False)

In [15]:
# Preview: rows are indexed by image file name; columns are
# deep_features, hashtag, name and pic.
deep_features.head()

Unnamed: 0_level_0,deep_features,hashtag,name,pic
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ff25ba01-800e-4e46-99bd-062fff5b0ee8.jpg,"[0.20390263, 0.35791588, 0.0, 0.11959089, 0.00...",animals,ff25ba01-800e-4e46-99bd-062fff5b0ee8.jpg,"(((tf.Tensor(0.9605086, shape=(), dtype=float3..."
1dd5f33d-86d5-46db-876c-776cd2bcc29e.jpg,"[1.9869195, 0.053759612, 0.0, 0.0658462, 0.0, ...",architecture,1dd5f33d-86d5-46db-876c-776cd2bcc29e.jpg,"(((tf.Tensor(0.6745098, shape=(), dtype=float3..."
6fe315be-a598-4a87-aefb-835b07e55676.jpg,"[0.0, 0.0, 0.047620296, 0.0, 0.043269824, 0.13...",travel,6fe315be-a598-4a87-aefb-835b07e55676.jpg,"(((tf.Tensor(-0.7254902, shape=(), dtype=float..."
a3384a2d-1d24-4108-8f25-cde7ed68f7df.jpg,"[0.0, 0.0, 1.8810314, 0.052851543, 0.0, 3.6374...",cars,a3384a2d-1d24-4108-8f25-cde7ed68f7df.jpg,"(((tf.Tensor(-0.7490196, shape=(), dtype=float..."
852ef348-da2d-4273-9bba-4dbd65b8b327.jpg,"[0.047524728, 0.0241852, 0.13619101, 0.5469168...",nature,852ef348-da2d-4273-9bba-4dbd65b8b327.jpg,"(((tf.Tensor(0.13333338, shape=(), dtype=float..."


In [16]:
# Inspect a single deep feature vector (first row, first column = deep_features).
deep_features.iloc[0, 0]

array([0.20390263, 0.35791588, 0.        , ..., 0.5969461 , 0.06213484,
       0.01105122], dtype=float32)

In [17]:
# Load the recommendations dataframe from its pickle.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by this project.
recs = pd.read_pickle('pkl_files/recs.pkl')

In [18]:
# Preview: rows are indexed by image_id; image_local_name holds the image file name.
recs.head()

Unnamed: 0_level_0,recommendations,recommended_hashtags,hashtags,image_local_name,search_hashtag
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1580,"[(10406, 1.146733283996582), (6872, 1.06506204...","[#luxury, #cars, #car, #lifestyle, #race, #aut...","[#checkeredflag, #atv, #flagmanracing, #dirtbi...",88039d35-d189-4db3-926e-a120d88a3efc.jpg,cars
471,"[(12028, 1.152119755744934), (13442, 1.1455943...","[#photo, #me, #ootd, #selfie, #daily, #like, #...","[#followme, #fashiondaily, #summer, #food, #ma...",fdc6c8d4-5a13-4330-9098-07e09e4858d2.jpg,selfie
1591,"[(6872, 0.9910037517547607), (12076, 0.7656257...","[#cars, #car, #luxury, #carsofinstagram, #carl...","[#bidlemanchevrolet, #suvs, #cars, #cardeals, ...",1b7a8fbc-1797-4418-b8b2-c65500e2bc6f.jpg,cars
463,"[(5798, 0.8138610124588013), (5481, 0.74504423...","[#selfie, #daily, #food, #ootd, #like, #follow...","[#followme, #hotguy, #instaboy, #gay, #instaga...",a78d0e30-6afa-436a-a953-633ef3f64325.jpg,selfie
833,"[(7160, 1.0691499710083008), (6119, 0.91959476...","[#nature, #art, #love, #photography, #architec...","[#arte, #interiordesign, #archilovers, #argent...",535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,travel


In [19]:
# Add deep features information to the recs dataframe.
# `on='image_local_name'` matches that recs column against the index of
# deep_features (the image file name); the inner join keeps only images
# present in both frames.
recs_deep = recs.join(deep_features, on='image_local_name', how='inner')

In [20]:
# Preview the joined frame: the recs columns followed by the deep_features columns.
recs_deep.head()

Unnamed: 0_level_0,recommendations,recommended_hashtags,hashtags,image_local_name,search_hashtag,deep_features,hashtag,name,pic
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1580,"[(10406, 1.146733283996582), (6872, 1.06506204...","[#luxury, #cars, #car, #lifestyle, #race, #aut...","[#checkeredflag, #atv, #flagmanracing, #dirtbi...",88039d35-d189-4db3-926e-a120d88a3efc.jpg,cars,"[0.05277525, 0.008059711, 0.3715715, 1.259724,...",cars,88039d35-d189-4db3-926e-a120d88a3efc.jpg,"(((tf.Tensor(0.845098, shape=(), dtype=float32..."
471,"[(12028, 1.152119755744934), (13442, 1.1455943...","[#photo, #me, #ootd, #selfie, #daily, #like, #...","[#followme, #fashiondaily, #summer, #food, #ma...",fdc6c8d4-5a13-4330-9098-07e09e4858d2.jpg,selfie,"[0.8612818, 0.0, 0.0, 0.0, 0.47858134, 0.28118...",selfie,fdc6c8d4-5a13-4330-9098-07e09e4858d2.jpg,"(((tf.Tensor(0.69411767, shape=(), dtype=float..."
1591,"[(6872, 0.9910037517547607), (12076, 0.7656257...","[#cars, #car, #luxury, #carsofinstagram, #carl...","[#bidlemanchevrolet, #suvs, #cars, #cardeals, ...",1b7a8fbc-1797-4418-b8b2-c65500e2bc6f.jpg,cars,"[0.0, 0.0, 0.0, 2.6671624, 0.11534082, 0.01738...",cars,1b7a8fbc-1797-4418-b8b2-c65500e2bc6f.jpg,"(((tf.Tensor(0.92941177, shape=(), dtype=float..."
463,"[(5798, 0.8138610124588013), (5481, 0.74504423...","[#selfie, #daily, #food, #ootd, #like, #follow...","[#followme, #hotguy, #instaboy, #gay, #instaga...",a78d0e30-6afa-436a-a953-633ef3f64325.jpg,selfie,"[0.11383941, 0.0, 1.6760664, 0.46152607, 0.062...",selfie,a78d0e30-6afa-436a-a953-633ef3f64325.jpg,"(((tf.Tensor(-0.7254902, shape=(), dtype=float..."
833,"[(7160, 1.0691499710083008), (6119, 0.91959476...","[#nature, #art, #love, #photography, #architec...","[#arte, #interiordesign, #archilovers, #argent...",535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,travel,"[0.40183023, 0.0, 0.028862935, 0.0, 0.13101277...",travel,535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,"(((tf.Tensor(0.99215686, shape=(), dtype=float..."


In [21]:
# Keep only the columns used downstream: the image file name, its actual
# hashtags, and its deep feature vector.
recs_deep_clean = recs_deep[['image_local_name', 'hashtags', 'deep_features']]

In [22]:
# Index the ALS image factors by image id so they can be joined on the index.
# The `id` column itself stays in the frame.
img_features.index = img_features['id']

In [23]:
# NOTE: drop() returns a new frame, and the result is only displayed here, not
# assigned. img_features itself therefore still carries the `id` column, which
# shows up again in the joined recommender frame.
img_features.drop(['id'], axis=1)

Unnamed: 0_level_0,features
id,Unnamed: 1_level_1
6,"[-0.18551933765411377, 0.10412514209747314, -0..."
16,"[-0.8608502745628357, -0.2844527065753937, -0...."
26,"[-0.3587254285812378, 0.09064590185880661, -0...."
36,"[-0.13999195396900177, -0.16264992952346802, -..."
46,"[-0.04733074828982353, -0.14490081369876862, 0..."
56,"[-0.5950252413749695, -0.4368746280670166, -0...."
66,"[-0.4571835398674011, -0.26486632227897644, -1..."
76,"[0.02066698856651783, -0.48553407192230225, -0..."
86,"[-0.13193461298942566, 0.04446610435843468, -0..."
96,"[-0.710654079914093, -0.10638371855020523, -0...."


In [24]:
# Add the ALS image features to the dataframe.
# Index-on-index inner join: the image_id index of recs_deep_clean is matched
# with the id index of img_features.

recommender_df = recs_deep_clean.join(img_features, how='inner')

In [25]:
# Preview: each image now has both its deep_features and its ALS `features` vector.
recommender_df.head()

Unnamed: 0,image_local_name,hashtags,deep_features,id,features
1580,88039d35-d189-4db3-926e-a120d88a3efc.jpg,"[#checkeredflag, #atv, #flagmanracing, #dirtbi...","[0.05277525, 0.008059711, 0.3715715, 1.259724,...",1580,"[-0.39753857254981995, 0.03570248559117317, -0..."
471,fdc6c8d4-5a13-4330-9098-07e09e4858d2.jpg,"[#followme, #fashiondaily, #summer, #food, #ma...","[0.8612818, 0.0, 0.0, 0.0, 0.47858134, 0.28118...",471,"[-0.6170209646224976, -0.6904682517051697, 0.3..."
1591,1b7a8fbc-1797-4418-b8b2-c65500e2bc6f.jpg,"[#bidlemanchevrolet, #suvs, #cars, #cardeals, ...","[0.0, 0.0, 0.0, 2.6671624, 0.11534082, 0.01738...",1591,"[0.010120289400219917, -0.14125625789165497, 0..."
463,a78d0e30-6afa-436a-a953-633ef3f64325.jpg,"[#followme, #hotguy, #instaboy, #gay, #instaga...","[0.11383941, 0.0, 1.6760664, 0.46152607, 0.062...",463,"[-0.21066850423812866, -0.1887151151895523, 0...."
833,535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,"[#arte, #interiordesign, #archilovers, #argent...","[0.40183023, 0.0, 0.028862935, 0.0, 0.13101277...",833,"[-0.3970666527748108, 0.7391157150268555, -0.3..."


In [26]:
from scipy.spatial.distance import cosine

In [27]:
#code used for testing purposes

# importlib.reload replaces reload() from the deprecated `imp` module,
# which was removed in Python 3.12.
from importlib import reload
import functions
reload(functions)
# Rebind the name imported at the top of the notebook so it points at the
# freshly reloaded implementation.
prepare_image = functions.prepare_image

In [28]:
#function that finds k nearest neighbors by cosine similarity

def find_neighbor_vectors(image_path, k=5, recommender_df=recommender_df):
    """Return the k rows of ``recommender_df`` closest to the given image.

    The image is prepared with ``functions.prepare_image(..., where='local')``
    and run through ``neural_network`` by
    ``functions.extract_features_for_one_image``. Every row's
    ``deep_features`` entry is then compared with the resulting vector using
    cosine distance.

    Parameters
    ----------
    image_path : location of the query image (presumably a local file path,
        given ``where='local'``).
    k : number of nearest rows to return.
    recommender_df : frame with a ``deep_features`` column. It is not
        modified. The default is the module-level frame as it existed when
        this function was defined.

    Returns
    -------
    A new DataFrame with the k closest rows, nearest first, carrying an extra
    ``dist`` column that holds the cosine distance to the query image.
    """
    query_image = functions.prepare_image(image_path, where='local')
    query_features = functions.extract_features_for_one_image(query_image, neural_network)

    def distance_to_query(candidate):
        # cosine() is a distance: smaller values mean more similar vectors
        return cosine(candidate, query_features)

    distances = recommender_df['deep_features'].apply(distance_to_query)
    ranked = recommender_df.assign(dist=distances).sort_values(by='dist')
    return ranked.head(k)

def equal_len(a, b): #a, b: lists
    '''Zero-pad the shorter of two lists in place until both have the same length.

    The same two list objects that were passed in are returned as a tuple;
    the longer list is left untouched.
    '''
    gap = len(a) - len(b)
    if gap > 0:
        # b is shorter: append the missing zeros to it
        b.extend([0] * gap)
    elif gap < 0:
        # a is shorter: append the missing zeros to it
        a.extend([0] * (-gap))
    return a, b
    

In [29]:
# Try the neighbour search on a local test image, asking for the 5 closest rows.

fnv = find_neighbor_vectors('test_wedding.jpg', k=5, recommender_df=recommender_df)

In [30]:
# Show the nearest neighbours; `dist` is the cosine distance to the query image (smallest first).
fnv

Unnamed: 0,image_local_name,hashtags,deep_features,id,features,dist
279,4098cbe2-7a78-42e2-b91a-56e136309235.jpg,"[#ootd, #cute, #cuteoutfits, #fashionblogger, ...","[0.59664005, 2.841192, 0.0, 1.0908409, 3.63056...",279,"[-0.7807010412216187, -0.023483864963054657, 0...",0.394075
2968,c001507f-02e6-4fb4-aa49-27a139018cd4.jpg,"[#wedding, #gythio, #greece, #MarriedInMani, #...","[0.15775155, 1.9940039, 0.6179221, 0.0, 3.2098...",2968,"[-0.22371426224708557, 0.08788613975048065, -0...",0.416801
2827,7a8c4976-7026-4fc1-acb8-e142e68bc954.jpg,"[#wedding, #instacool, #likelike, #like, #like...","[1.484782, 0.20136833, 0.0, 0.0, 3.6014848, 0....",2827,"[-0.24263130128383636, -0.40929079055786133, -...",0.417307
763,edf9daee-f603-46b5-bf4a-ae9974bb89ad.jpg,"[#wanderlust, #toddlers, #woods, #forest, #nc,...","[0.74691576, 1.2644926, 0.0, 0.23730333, 2.684...",763,"[0.006454684305936098, 0.024608485400676727, -...",0.420474
223,d5c1c965-47e1-492e-842b-118ea059ba32.jpg,"[#instagram, #nehakakkar, #trending, #beauty, ...","[0.0, 0.0, 0.0, 0.9554576, 2.4236367, 0.069097...",223,"[-0.3381338119506836, -0.274901807308197, -0.0...",0.421033


### Next step: after identifying the 5 nearest neighbors, look at their ALS image features (user features) and take the average over the 5. Then compute the dot product of that average with every hashtag feature vector (item features). Take the hashtags with the highest results as recommendations.

In [31]:
#check length of the user features
# Select the column by name. A positional index such as iloc[3, 4] silently
# points at a different column as soon as the column order changes.
len(fnv['features'].iloc[3])

10

## Find the average of the 5 user features found based on cosine similarity.

In [32]:
# Collect the ALS feature vector of every neighbour, keeping the row order of fnv.

features = list(fnv.features.values)

In [33]:
# Element-wise mean across the neighbours (axis 0), giving one averaged vector.
avg_features = np.asarray(features).mean(axis=0)

## Find the dot product with each hashtag features

In [34]:
# Take a look at the hashtag (item) features extracted from the ALS model:
# an `id` column plus one `features` vector per hashtag.

hashtag_features.head()

Unnamed: 0,id,features
0,2,"[-0.02908919006586075, 0.010887430980801582, 0..."
1,12,"[-0.039385922253131866, 0.047334976494312286, ..."
2,22,"[0.12862631678581238, 0.10405722260475159, -0...."
3,32,"[-0.019268620759248734, -0.020667145028710365,..."
4,42,"[0.036955464631319046, -0.0918664038181305, -0..."


In [35]:
# Score every hashtag: the dot product of its ALS feature vector with the
# averaged image (user) vector. The result is stored as a new `dot_product`
# column, which modifies hashtag_features in place.

hashtag_features['dot_product'] = hashtag_features['features'].apply(lambda x: np.asarray(x).dot(avg_features))

In [36]:
# Rank hashtags by score, best first, and keep the top 10 rows.

df_ten_highest = hashtag_features.sort_values('dot_product', ascending=False).iloc[:10]

In [37]:
# ids of the recommended hashtags, highest score first
rec_hashtag_ids = df_ten_highest['id'].values

## Find hashtags that correspond to the ids

In [38]:
# Unpickle the hashtag lookup table (hashtag id -> hashtag text).
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by this project.

hashtags_df = pd.read_pickle("pkl_files/hashtags_df.pkl")

In [39]:
# Preview: the frame is indexed by hashtag id and has a single `hashtag` column.
hashtags_df.head()

Unnamed: 0_level_0,hashtag
id,Unnamed: 1_level_1
0,#instahub
1,#thejourney
2,#mfw
3,#alpacalover
4,#pilatesstudio


In [40]:
# Print the hashtag text for each recommended id, in ranking order.
for hashtag_id in rec_hashtag_ids:
    id_matches = hashtags_df.index == hashtag_id
    print(hashtags_df.loc[id_matches, 'hashtag'])

id
11995    #love
Name: hashtag, dtype: object
id
5798    #selfie
Name: hashtag, dtype: object
id
10858    #fashion
Name: hashtag, dtype: object
id
5440    #instagood
Name: hashtag, dtype: object
id
9866    #picoftheday
Name: hashtag, dtype: object
id
10209    #photography
Name: hashtag, dtype: object
id
5613    #summer
Name: hashtag, dtype: object
id
13695    #happy
Name: hashtag, dtype: object
id
5225    #instagram
Name: hashtag, dtype: object
id
8556    #beautiful
Name: hashtag, dtype: object
