In [1]:
#!pip install 'numpy<1.17'

In [2]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import shuffle
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
import os
from functools import reduce
from functions import fetch_image_from_s3_to_array, prepare_image
from pyspark.ml.recommendation import ALSModel
from scipy.spatial.distance import cdist, cosine

In [4]:
# All images will be resized to img_size x img_size before feature extraction.
img_size = 160
# Derive the input shape from img_size so the two values cannot drift apart
# (previously 160 was hard-coded twice and img_size was never used).
# The trailing 3 is the channel dimension expected by MobileNetV2.
img_shape = (img_size, img_size, 3)

# Create the base model from the pre-trained model MobileNet V2.
# include_top=False leaves out the classification head, so the network
# outputs convolutional feature maps instead of class scores.
base_model = MobileNetV2(input_shape=img_shape, include_top=False, weights='imagenet')

# Average each feature map over its spatial dimensions, turning the
# convolutional output into a single feature vector per image.
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

# Feature extractor used later in the notebook: image -> deep feature vector.
neural_network = tf.keras.Sequential([
    base_model,
    global_average_layer,
])

In [6]:
# Initialize Pyspark: start (or reuse) a session on the local master "local[*]"
import pyspark

session_builder = pyspark.sql.SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

In [None]:
# Load the fitted ALS model from the relative path 'als'
# (resolved against the notebook's working directory).
als_model = ALSModel.load('als')

In [8]:
# Convert the ALS latent factors to pandas frames (columns: id, features).
# In this notebook images play the ALS "user" role and hashtags the "item" role.
img_features = als_model.userFactors.toPandas()
hashtag_features = als_model.itemFactors.toPandas()

In [9]:
# Preview the first rows of the image (user) factors
img_features.head()

Unnamed: 0,id,features
0,5,"[-0.4633004665374756, 0.2045188844203949, 0.04..."
1,15,"[-0.26573067903518677, -0.2782086730003357, 0...."
2,25,"[-0.12432768940925598, 0.043011318892240524, 0..."
3,35,"[0.09362601488828659, -0.31278151273727417, 0...."
4,45,"[-0.6471619606018066, 0.31757891178131104, 0.3..."


In [None]:
# Load the deep features and index them by image file name.
# drop=False keeps 'name' available as a regular column as well.
deep_features = (
    pd.read_pickle("pkl_files/df_deep_features.pkl")
      .set_index('name', drop=False)
)

In [13]:
# Preview the deep-features frame (indexed by image file name)
deep_features.head()

Unnamed: 0_level_0,deep_features,hashtag,name,pic
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c7859ea9-d0d4-491e-af91-d91e9a8c122b.jpg,"[0.09537401, 0.0, 0.0, 1.3010054, 0.06189001, ...",babies,c7859ea9-d0d4-491e-af91-d91e9a8c122b.jpg,"(((tf.Tensor(-0.3705882, shape=(), dtype=float..."
45d6c48c-c25e-42fc-854f-df71bf6de43e.jpg,"[0.0, 0.08503536, 0.9841172, 0.79477465, 2.028...",animals,45d6c48c-c25e-42fc-854f-df71bf6de43e.jpg,"(((tf.Tensor(0.627451, shape=(), dtype=float32..."
0b51899b-dc3c-4f30-9230-3f5223cf79a6.jpg,"[0.0, 0.1421825, 0.29680955, 0.461614, 0.10577...",architecture,0b51899b-dc3c-4f30-9230-3f5223cf79a6.jpg,"(((tf.Tensor(0.52156866, shape=(), dtype=float..."
6ca3e818-6023-43d8-bf47-e958a52f0530.jpg,"[0.2723638, 0.07413437, 1.3867798, 0.0, 0.3256...",animals,6ca3e818-6023-43d8-bf47-e958a52f0530.jpg,"(((tf.Tensor(0.3921569, shape=(), dtype=float3..."
b1d7803e-68cc-4bed-9d7c-ed0590899403.jpg,"[0.0, 0.0, 0.4716914, 0.0, 0.0, 2.7325175, 0.8...",animals,b1d7803e-68cc-4bed-9d7c-ed0590899403.jpg,"(((tf.Tensor(-0.1607843, shape=(), dtype=float..."


In [14]:
# Inspect the value in the first row and first column (the 'deep_features' vector)
deep_features.iloc[0, 0]

array([0.09537401, 0.        , 0.        , ..., 0.29116523, 0.18538919,
       0.20294176], dtype=float32)

In [15]:
# Load the recs dataframe from its pickle file
recs = pd.read_pickle('pkl_files/recs.pkl')

In [16]:
# Preview the recs frame
recs.head()

Unnamed: 0_level_0,recommendations,recommended_hashtags,hashtags,image_local_name,search_hashtag
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1580,"[(5249, 1.1879390478134155), (6758, 1.06600213...","[#luxury, #car, #lifestyle, #cars, #luxurylife...","[#checkeredflag, #atv, #flagmanracing, #dirtbi...",88039d35-d189-4db3-926e-a120d88a3efc.jpg,cars
471,"[(5728, 1.1468442678451538), (1048, 1.14258015...","[#selfie, #amazing, #fashion, #black, #photo, ...","[#followme, #fashiondaily, #summer, #food, #ma...",fdc6c8d4-5a13-4330-9098-07e09e4858d2.jpg,selfie
1591,"[(15043, 0.8404358625411987), (6758, 0.6228702...","[#cars, #car, #bmw, #trucks, #amazing, #carpor...","[#bidlemanchevrolet, #suvs, #cars, #cardeals, ...",1b7a8fbc-1797-4418-b8b2-c65500e2bc6f.jpg,cars
463,"[(5728, 0.781302809715271), (7432, 0.773969054...","[#selfie, #beauty, #smile, #style, #fashion, #...","[#followme, #hotguy, #instaboy, #gay, #instaga...",a78d0e30-6afa-436a-a953-633ef3f64325.jpg,selfie
833,"[(2932, 1.0711742639541626), (11435, 0.8931126...","[#explore, #art, #nature, #photography, #archi...","[#arte, #interiordesign, #archilovers, #argent...",535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,travel


In [17]:
# Add deep features information to recs dataframe.
# The 'image_local_name' column of recs is matched against the index of
# deep_features; how='inner' keeps only rows that have a match on both sides.
recs_deep = recs.join(deep_features, on='image_local_name', how='inner')

In [18]:
# Preview the joined frame (recs columns followed by the deep_features columns)
recs_deep.head()

Unnamed: 0_level_0,recommendations,recommended_hashtags,hashtags,image_local_name,search_hashtag,deep_features,hashtag,name,pic
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
833,"[(2932, 1.0711742639541626), (11435, 0.8931126...","[#explore, #art, #nature, #photography, #archi...","[#arte, #interiordesign, #archilovers, #argent...",535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,travel,"[0.40182838, 0.0, 0.028864231, 0.0, 0.13101286...",travel,535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,"(((tf.Tensor(0.99215686, shape=(), dtype=float..."
1829,"[(8334, 0.6680111885070801), (7957, 0.56452876...","[#food, #fitness, #instafood, #foodporn, #nyc,...","[#bhcidadecriativadagastronomia, #prato, #come...",56018a9a-8def-48d1-9a7d-03136314b0d9.jpg,food,"[0.0, 0.0, 0.0, 2.0659437, 0.06043254, 0.03285...",food,56018a9a-8def-48d1-9a7d-03136314b0d9.jpg,"(((tf.Tensor(0.7882353, shape=(), dtype=float3..."
1959,"[(7436, 1.0163105726242065), (8334, 0.96548527...","[#instafood, #food, #foodporn, #foodie, #yummy...","[#hagerstownmd, #washingtoncountymd, #hswhmdbu...",2b98b605-8df1-4fcb-addf-887751caaaaa.jpg,food,"[0.2551278, 0.6026027, 0.0, 0.0, 0.2649013, 0....",food,2b98b605-8df1-4fcb-addf-887751caaaaa.jpg,"(((tf.Tensor(-1.0, shape=(), dtype=float32), t..."
2659,"[(15161, 0.9028208255767822), (11773, 0.811946...","[#animals, #pets, #animal, #dogs, #dogstagram,...","[#sportdogminsk, #dogminsk, #labrador, #labrad...",c62f34b0-da93-485c-94a0-d56be4b72018.jpg,animals,"[0.0, 1.7648926, 0.0, 0.014228077, 0.114186764...",animals,c62f34b0-da93-485c-94a0-d56be4b72018.jpg,"(((tf.Tensor(0.07843143, shape=(), dtype=float..."
1990,"[(2753, 1.0034533739089966), (12956, 0.9917454...","[#follow, #like, #followme, #likeforlike, #ins...","[#follow, #photooftheday, #love, #instalike, #...",7263d6f8-422f-4484-a306-cf9984bfa8bc.jpg,food,"[0.0, 0.0, 0.24697302, 0.0, 0.0, 0.025176601, ...",food,7263d6f8-422f-4484-a306-cf9984bfa8bc.jpg,"(((tf.Tensor(-0.9079044, shape=(), dtype=float..."


In [None]:
# Only use the columns the recommender needs
recs_deep_clean = recs_deep[['image_local_name', 'hashtags', 'deep_features']]

# Index the ALS image factors by image id so they line up with the index of
# recs_deep_clean in the join below. set_index returns a new frame, and
# drop=False keeps 'id' as a column, so re-running this cell is idempotent.
#
# NOTE: the 'id' column is kept on purpose. The previous code called
# `img_features.drop(['id'], axis=1)` without assigning the result, which is a
# no-op (DataFrame.drop returns a new frame), so 'id' has always been part of
# the joined frame and its column layout. The dead statement was removed
# instead of "fixed" to keep that layout unchanged.
img_features = img_features.set_index('id', drop=False)

# Add image feature into dataframe (index-on-index inner join)
recommender_df = recs_deep_clean.join(img_features, how='inner')

recommender_df.head()

In [25]:
#code used for testing purposes

# Force a fresh import of the local `functions` module and rebind
# prepare_image to the reloaded implementation.
# importlib.reload replaces imp.reload: the `imp` module is deprecated and
# was removed in Python 3.12.
from importlib import reload
import functions
reload(functions)
prepare_image = functions.prepare_image

In [26]:
# Function that finds k nearest neighbors by cosine similarity

def find_neighbor_vectors(image_path, k=5, recommender_df=recommender_df):
    """Return the k rows of recommender_df most similar to a local image.

    The image at image_path is prepared with functions.prepare_image
    (where='local') and passed through neural_network by
    functions.extract_features_for_one_image. The result is compared with
    every entry of the 'deep_features' column using SciPy's cosine distance.

    Returns a new dataframe (the input frame is not modified) holding all
    original columns plus a 'dist' column, sorted by ascending 'dist' and
    truncated to the first k rows.
    """
    query_vector = functions.extract_features_for_one_image(
        functions.prepare_image(image_path, where='local'),
        neural_network,
    )
    distances = recommender_df['deep_features'].apply(
        lambda stored_vector: cosine(stored_vector, query_vector)
    )
    ranked = recommender_df.assign(dist=distances).sort_values(by='dist')
    return ranked.head(k)
    

In [27]:
# Test the function on a local image: fetch the 5 rows of recommender_df
# whose deep features are closest to 'test_wedding.jpg'

fnv = find_neighbor_vectors('test_wedding.jpg', 
                k=5, recommender_df=recommender_df)

In [28]:
# Show the nearest neighbours, including their cosine distance ('dist')
fnv

Unnamed: 0,image_local_name,hashtags,deep_features,id,features,dist
763,edf9daee-f603-46b5-bf4a-ae9974bb89ad.jpg,"[#wanderlust, #toddlers, #woods, #forest, #nc,...","[0.74691504, 1.2644943, 0.0, 0.23730093, 2.684...",763,"[0.4171702265739441, -0.017076052725315094, -0...",0.420474
882,c3b7232e-ea7c-4766-91e3-99207f7d8d8b.jpg,"[#photooftheday, #hasekio, #bs, #travel, #phot...","[0.7826781, 1.8151407, 0.3264235, 0.4332988, 1...",882,"[-0.2053070217370987, -0.8706239461898804, 0.3...",0.437234
682,e1626e8a-eaa8-4812-917b-4a4aeadeb10f.jpg,"[#follow, #repost, #photooftheday, #love, #fas...","[0.17159231, 0.4535152, 0.023434391, 0.1585964...",682,"[0.07097786664962769, -0.32791000604629517, 0....",0.456072
773,72569f9f-ebea-4b5a-b238-c2b15e9b288d.jpg,"[#musician, #newmusic, #producer, #futuremusic...","[0.019124918, 1.6645181, 0.0, 0.0, 2.546588, 0...",773,"[0.1564265638589859, 0.18779335916042328, 0.08...",0.461755
737,f2a6f5f1-64c2-4ee3-9609-d409e28fa816.jpg,"[#honolulu, #hawaii, #dream, #travel, #hoomalu...","[0.13907114, 0.9570846, 0.052211534, 0.0, 1.52...",737,"[0.1638672947883606, 0.05539854243397713, 0.13...",0.470721


### Next step: after identifying the 5 nearest neighbors, take their ALS image features (user features) and average them across the 5 neighbors. Then compute the dot product of that average with every hashtag feature vector (item features), and take the hashtags with the highest results as recommendations.

In [29]:
#check length of the user features
# Select the 'features' column by name: a positional column index silently
# points at a different column whenever the frame's column layout changes.
len(fnv['features'].iloc[3])

10

## Find the average of the 5 user features found based on cosine similarity.

In [30]:
# Collect the ALS feature vector of every neighbour row into a plain list

features = list(fnv['features'].values)

In [31]:
# Element-wise mean across the collected vectors (axis=0 averages over rows,
# giving one vector with the same length as each input vector)
avg_features = np.mean(np.asarray(features), axis=0)

## Find the dot product with each hashtag features

In [32]:
# Take a look at the hashtag (item) features extracted from the ALS model

hashtag_features.head()

Unnamed: 0,id,features
0,1,"[0.0744522213935852, 0.0064070881344377995, 0...."
1,11,"[0.054107919335365295, 0.08664730191230774, 0...."
2,21,"[-0.04549521207809448, -0.06504690647125244, -..."
3,31,"[0.05305836722254753, 0.07028129696846008, -0...."
4,41,"[-0.03151167184114456, 0.029394743964076042, 0..."


In [33]:
# Score every hashtag: dot product of its feature vector with the averaged
# image (user) features, stored in a new 'dot_product' column

hashtag_features['dot_product'] = [
    np.dot(np.asarray(item_vector), avg_features)
    for item_vector in hashtag_features['features']
]

In [34]:
# Get the 10 rows with the highest dot product: sort descending by
# 'dot_product' and keep the first 10

df_ten_highest = hashtag_features.sort_values(by='dot_product', ascending=False).head(10)

In [35]:
# Hashtag ids of the top-scoring rows, in descending score order
rec_hashtag_ids = df_ten_highest.id.values

## Find hashtags that correspond to the ids

In [36]:
# Unpickle the hashtag lookup data (used below to turn ids into hashtag text)

hashtags_df = pd.read_pickle("pkl_files/hashtags_df.pkl")

In [37]:
# Preview the lookup table (index: id, column: hashtag)
hashtags_df.head()

Unnamed: 0_level_0,hashtag
id,Unnamed: 1_level_1
0,#sushiemcasa
1,#retarded
2,#weddingflowersdecor
3,#booster
4,#kumas


In [38]:
# Print the hashtag text for each recommended id, best score first
for hashtag_id in rec_hashtag_ids:
    id_matches = hashtags_df.index == hashtag_id
    print(hashtags_df.loc[id_matches, 'hashtag'])

id
14094    #travel
Name: hashtag, dtype: object
id
4515    #photography
Name: hashtag, dtype: object
id
2638    #nature
Name: hashtag, dtype: object
id
4556    #love
Name: hashtag, dtype: object
id
305    #photooftheday
Name: hashtag, dtype: object
id
14520    #instagood
Name: hashtag, dtype: object
id
3319    #picoftheday
Name: hashtag, dtype: object
id
775    #beautiful
Name: hashtag, dtype: object
id
9326    #summer
Name: hashtag, dtype: object
id
6746    #instagram
Name: hashtag, dtype: object
