In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
#!pip install 'numpy<1.17'

Collecting numpy<1.17
[?25l  Downloading https://files.pythonhosted.org/packages/0f/c9/3526a357b6c35e5529158fbcfac1bb3adc8827e8809a6d254019d326d1cc/numpy-1.16.4-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (13.9MB)
[K    100% |████████████████████████████████| 13.9MB 659kB/s ta 0:00:01
[31mtb-nightly 1.14.0a20190603 has requirement setuptools>=41.0.0, but you'll have setuptools 40.0.0 which is incompatible.[0m
[31mkeras 2.2.1 has requirement keras-applications==1.0.4, but you'll have keras-applications 1.0.8 which is incompatible.[0m
[31mkeras 2.2.1 has requirement keras-preprocessing==1.0.2, but you'll have keras-preprocessing 1.1.0 which is incompatible.[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.16.4
[33mYou are using pip version 19.0.3, however version 19.2.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import shuffle
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
import os
from functools import reduce
from functions import fetch_image_from_s3_to_array, prepare_image
from pyspark.ml.recommendation import ALSModel
from scipy.spatial.distance import cdist

In [5]:
img_size = 160  # All images will be resized to img_size x img_size
# Derive the input shape from img_size so the two values cannot drift apart
# (height, width, 3 channels).
img_shape = (img_size, img_size, 3)

# Create the base model from the pre-trained model MobileNet V2.
# include_top=False leaves out the classification head, so the network
# outputs feature maps instead of class scores.
base_model = MobileNetV2(input_shape=img_shape, include_top=False, weights='imagenet')

# Averages every feature map over its spatial dimensions -> one flat vector per image.
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

# Feature extractor used for the deep features: base model followed by pooling.
neural_network = tf.keras.Sequential([
    base_model,
    global_average_layer,
])

In [6]:
#Load ALS model

In [7]:
# Start (or reuse) a Spark session running locally with master "local[*]".

import pyspark
spark = pyspark.sql.SparkSession.builder.master("local[*]").getOrCreate()

In [8]:
# Load the saved ALS model from the relative path 'als_model'.
als_model = ALSModel.load('als_model')

In [9]:
# Pull the learned latent factors into pandas. In this notebook the ALS "users"
# are images and the ALS "items" are hashtags.
img_features = als_model.userFactors.toPandas()
hashtag_features = als_model.itemFactors.toPandas()

In [10]:
# Preview the image (user) factors: one row per id with its latent feature vector.
img_features.head()

Unnamed: 0,id,features
0,6,"[0.6061490774154663, 0.2454712986946106, -0.22..."
1,16,"[0.29392072558403015, 0.18212854862213135, 0.5..."
2,26,"[0.23866358399391174, 0.14173837006092072, 0.4..."
3,36,"[-0.11467598378658295, 0.2898446023464203, 0.2..."
4,46,"[0.034687578678131104, 0.07074137777090073, 0...."


In [11]:
#Load deep features

In [12]:
#deep_features = pd.read_csv('deep_features.csv', )
# Load the per-image deep features from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
deep_features = pd.read_pickle("df_deep_features.pkl")

In [13]:
# Index by image file name so other frames can join against it.
# The 'name' column itself is kept, so the label exists both as index and as column.
deep_features.index = deep_features['name']

In [16]:
# Preview the deep-features frame (columns: deep_features, hashtag, name, pic).
deep_features.head()

Unnamed: 0_level_0,deep_features,hashtag,name,pic
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cc4ddfb1-9d3b-4429-8736-f4aa4b553abb.jpg,"[0.0, 0.0, 0.0, 0.9403897, 0.013560877, 0.5415...",wedding,cc4ddfb1-9d3b-4429-8736-f4aa4b553abb.jpg,"(((tf.Tensor(-0.44509804, shape=(), dtype=floa..."
47a70842-b5d3-4246-a823-d3b979615e79.jpg,"[1.2038865, 0.1280765, 0.0, 0.08866347, 0.5180...",babies,47a70842-b5d3-4246-a823-d3b979615e79.jpg,"(((tf.Tensor(0.99215686, shape=(), dtype=float..."
0b51899b-dc3c-4f30-9230-3f5223cf79a6.jpg,"[0.0, 0.14218274, 0.29680854, 0.46161124, 0.10...",architecture,0b51899b-dc3c-4f30-9230-3f5223cf79a6.jpg,"(((tf.Tensor(0.52156866, shape=(), dtype=float..."
b861224a-3e23-4726-a786-0dfbdafba3c0.jpg,"[0.049770802, 0.40196633, 0.6088646, 0.5214633...",selfie,b861224a-3e23-4726-a786-0dfbdafba3c0.jpg,"(((tf.Tensor(0.6039216, shape=(), dtype=float3..."
4c537c13-1249-4aa1-9e03-9615b3deeeef.jpg,"[0.7323788, 0.014615268, 0.22212349, 0.6641924...",babies,4c537c13-1249-4aa1-9e03-9615b3deeeef.jpg,"(((tf.Tensor(-0.16274509, shape=(), dtype=floa..."


In [17]:
# Inspect a single deep-feature vector (first row, first column).
deep_features.iloc[0, 0]

array([0.        , 0.        , 0.        , ..., 0.00304344, 0.7174091 ,
       0.0012624 ], dtype=float32)

In [18]:
# Load the recommendations dataframe from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
recs = pd.read_pickle('recs.pkl')

In [19]:
# Preview the recommendations frame (indexed by image_id).
recs.head()

Unnamed: 0_level_0,recommendations,recommended_hashtags,hashtags,image_local_name,search_hashtag
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1580,"[(11726, 1.2205473184585571), (7638, 1.0976204...","[#luxury, #car, #cars, #love, #carsofinstagram...","[#checkeredflag, #atv, #flagmanracing, #dirtbi...",88039d35-d189-4db3-926e-a120d88a3efc.jpg,cars
471,"[(2399, 1.1536394357681274), (6412, 1.14570438...","[#selfie, #beauty, #ootd, #fashion, #girl, #su...","[#followme, #fashiondaily, #summer, #food, #ma...",fdc6c8d4-5a13-4330-9098-07e09e4858d2.jpg,selfie
1591,"[(8394, 0.9458842277526855), (7638, 0.85855448...","[#cars, #car, #luxury, #trucks, #carsofinstagr...","[#bidlemanchevrolet, #suvs, #cars, #cardeals, ...",1b7a8fbc-1797-4418-b8b2-c65500e2bc6f.jpg,cars
463,"[(2399, 0.8559869527816772), (11988, 0.7232627...","[#selfie, #wedding, #photographer, #fashion, #...","[#followme, #hotguy, #instaboy, #gay, #instaga...",a78d0e30-6afa-436a-a953-633ef3f64325.jpg,selfie
833,"[(3723, 0.958864688873291), (8394, 0.948372125...","[#sunset, #cars, #travel, #sun, #photography, ...","[#arte, #interiordesign, #archilovers, #argent...",535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,travel


In [20]:
# Add deep-features information to the recs dataframe: recs.image_local_name is
# matched against the deep_features index (the image file name). The inner join
# keeps only images that appear in both frames.
recs_deep = recs.join(deep_features, on='image_local_name', how='inner')

In [21]:
# Check the join: each row now carries the deep_features columns next to the recs columns.
recs_deep.head()

Unnamed: 0_level_0,recommendations,recommended_hashtags,hashtags,image_local_name,search_hashtag,deep_features,hashtag,name,pic
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1580,"[(11726, 1.2205473184585571), (7638, 1.0976204...","[#luxury, #car, #cars, #love, #carsofinstagram...","[#checkeredflag, #atv, #flagmanracing, #dirtbi...",88039d35-d189-4db3-926e-a120d88a3efc.jpg,cars,"[0.05277525, 0.008059711, 0.3715715, 1.259724,...",cars,88039d35-d189-4db3-926e-a120d88a3efc.jpg,"(((tf.Tensor(0.845098, shape=(), dtype=float32..."
471,"[(2399, 1.1536394357681274), (6412, 1.14570438...","[#selfie, #beauty, #ootd, #fashion, #girl, #su...","[#followme, #fashiondaily, #summer, #food, #ma...",fdc6c8d4-5a13-4330-9098-07e09e4858d2.jpg,selfie,"[0.8612818, 0.0, 0.0, 0.0, 0.47858134, 0.28118...",selfie,fdc6c8d4-5a13-4330-9098-07e09e4858d2.jpg,"(((tf.Tensor(0.69411767, shape=(), dtype=float..."
1591,"[(8394, 0.9458842277526855), (7638, 0.85855448...","[#cars, #car, #luxury, #trucks, #carsofinstagr...","[#bidlemanchevrolet, #suvs, #cars, #cardeals, ...",1b7a8fbc-1797-4418-b8b2-c65500e2bc6f.jpg,cars,"[0.0, 0.0, 0.0, 2.6671624, 0.11534082, 0.01738...",cars,1b7a8fbc-1797-4418-b8b2-c65500e2bc6f.jpg,"(((tf.Tensor(0.92941177, shape=(), dtype=float..."
463,"[(2399, 0.8559869527816772), (11988, 0.7232627...","[#selfie, #wedding, #photographer, #fashion, #...","[#followme, #hotguy, #instaboy, #gay, #instaga...",a78d0e30-6afa-436a-a953-633ef3f64325.jpg,selfie,"[0.11383941, 0.0, 1.6760664, 0.46152607, 0.062...",selfie,a78d0e30-6afa-436a-a953-633ef3f64325.jpg,"(((tf.Tensor(-0.7254902, shape=(), dtype=float..."
833,"[(3723, 0.958864688873291), (8394, 0.948372125...","[#sunset, #cars, #travel, #sun, #photography, ...","[#arte, #interiordesign, #archilovers, #argent...",535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,travel,"[0.40183023, 0.0, 0.028862935, 0.0, 0.13101277...",travel,535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,"(((tf.Tensor(0.99215686, shape=(), dtype=float..."


In [22]:
# Keep only the columns used further down: the image file name, its original
# hashtags and its deep-feature vector.
recs_deep_clean = recs_deep[['image_local_name', 'hashtags', 'deep_features']]

In [23]:
# Index the image factors by id so they can be joined index-to-index with the
# recs-based frame (which is indexed by image_id).
img_features.index = img_features['id']

In [24]:
# NOTE: drop() returns a new frame and the result is not assigned here, so this
# cell only displays the frame without 'id'; img_features itself keeps the column
# (it is still present after the join below).
img_features.drop(['id'], axis=1)

Unnamed: 0_level_0,features
id,Unnamed: 1_level_1
6,"[0.6061490774154663, 0.2454712986946106, -0.22..."
16,"[0.29392072558403015, 0.18212854862213135, 0.5..."
26,"[0.23866358399391174, 0.14173837006092072, 0.4..."
36,"[-0.11467598378658295, 0.2898446023464203, 0.2..."
46,"[0.034687578678131104, 0.07074137777090073, 0...."
56,"[0.09326983243227005, -0.3992665410041809, 0.8..."
66,"[0.08425560593605042, -0.3380923271179199, 0.5..."
76,"[-0.8585888147354126, -0.1069592833518982, 0.2..."
86,"[0.012009835802018642, 0.04350445047020912, 0...."
96,"[0.5050057172775269, 0.09498237818479538, 0.36..."


In [25]:
# Add the ALS image factors to each row. Both frames are joined on their index
# (image id); the inner join keeps only ids present in both.

recommender_df = recs_deep_clean.join(img_features, how='inner')

In [26]:
# Preview the combined frame: deep features and ALS features side by side per image.
recommender_df.head()

Unnamed: 0,image_local_name,hashtags,deep_features,id,features
1580,88039d35-d189-4db3-926e-a120d88a3efc.jpg,"[#checkeredflag, #atv, #flagmanracing, #dirtbi...","[0.05277525, 0.008059711, 0.3715715, 1.259724,...",1580,"[0.9743834137916565, -0.5692329406738281, 0.47..."
471,fdc6c8d4-5a13-4330-9098-07e09e4858d2.jpg,"[#followme, #fashiondaily, #summer, #food, #ma...","[0.8612818, 0.0, 0.0, 0.0, 0.47858134, 0.28118...",471,"[-0.19726665318012238, -0.14174753427505493, 0..."
1591,1b7a8fbc-1797-4418-b8b2-c65500e2bc6f.jpg,"[#bidlemanchevrolet, #suvs, #cars, #cardeals, ...","[0.0, 0.0, 0.0, 2.6671624, 0.11534082, 0.01738...",1591,"[0.6441309452056885, -0.5491575598716736, 0.55..."
463,a78d0e30-6afa-436a-a953-633ef3f64325.jpg,"[#followme, #hotguy, #instaboy, #gay, #instaga...","[0.11383941, 0.0, 1.6760664, 0.46152607, 0.062...",463,"[-0.48255831003189087, -0.1489773690700531, 0...."
833,535bc893-ebbe-46aa-9ae9-d2cd5f4dd426.jpg,"[#arte, #interiordesign, #archilovers, #argent...","[0.40183023, 0.0, 0.028862935, 0.0, 0.13101277...",833,"[-0.03402460366487503, -0.3135524094104767, 0...."


In [27]:
from scipy.spatial.distance import cosine

In [28]:
# Code used for testing purposes: force a fresh reload of the local `functions`
# module and rebind prepare_image to the reloaded version.
# importlib.reload replaces the deprecated `imp.reload` (`imp` is removed in Python 3.12).

from importlib import reload
import functions
reload(functions)
prepare_image = functions.prepare_image

In [29]:
#function that finds the k nearest neighbors by cosine distance (1 - cosine similarity)

def find_neighbor_vectors(image_path, k=5, recommender_df=recommender_df):
    """Return the k rows of recommender_df whose deep features are closest to an image.

    The image is prepared with functions.prepare_image(..., where='local') and run
    through neural_network to obtain its deep-feature vector. Every row of
    recommender_df is then scored by the cosine distance between its
    'deep_features' entry and that vector.

    Parameters
    ----------
    image_path : passed to functions.prepare_image with where='local'.
    k : number of closest rows to return.
    recommender_df : frame with a 'deep_features' column; it is not modified.
        The default is the module-level frame as it existed when this
        function was defined.

    Returns
    -------
    A new DataFrame with the k closest rows (all original columns plus 'dist'),
    sorted by ascending 'dist'.
    """
    query_image = functions.prepare_image(image_path, where='local')
    query_features = functions.extract_features_for_one_image(query_image, neural_network)

    def distance_to_query(candidate_features):
        # scipy's cosine() is a distance: smaller means more similar.
        return cosine(candidate_features, query_features)

    distances = recommender_df['deep_features'].apply(distance_to_query)
    scored = recommender_df.assign(dist=distances)
    return scored.sort_values(by='dist').head(k)

def equal_len(a, b): #a, b: lists
    '''Pad the shorter of two lists with zeros, in place, until both have the same length.

    Returns the two (possibly extended) lists as the tuple (a, b); the longer
    list is left untouched.
    '''
    shortfall = len(a) - len(b)
    if shortfall > 0:
        # b is the shorter list
        b.extend([0] * shortfall)
    elif shortfall < 0:
        # a is the shorter list
        a.extend([0] * -shortfall)
    return a, b
    

In [42]:
# Try the neighbor search on a local test image.

fnv = find_neighbor_vectors('test_wedding.jpg', k=5, recommender_df=recommender_df)

In [43]:
# Show the nearest neighbors found, sorted by ascending cosine distance ('dist').
fnv

Unnamed: 0,image_local_name,hashtags,deep_features,id,features,dist
279,4098cbe2-7a78-42e2-b91a-56e136309235.jpg,"[#ootd, #cute, #cuteoutfits, #fashionblogger, ...","[0.59664005, 2.841192, 0.0, 1.0908409, 3.63056...",279,"[0.0012751646572723985, 0.19889169931411743, 0...",0.394075
2968,c001507f-02e6-4fb4-aa49-27a139018cd4.jpg,"[#wedding, #gythio, #greece, #MarriedInMani, #...","[0.15775155, 1.9940039, 0.6179221, 0.0, 3.2098...",2968,"[-0.16875383257865906, -0.035492222756147385, ...",0.416801
2827,7a8c4976-7026-4fc1-acb8-e142e68bc954.jpg,"[#wedding, #instacool, #likelike, #like, #like...","[1.484782, 0.20136833, 0.0, 0.0, 3.6014848, 0....",2827,"[-0.4109117090702057, -0.07897736877202988, -0...",0.417307
763,edf9daee-f603-46b5-bf4a-ae9974bb89ad.jpg,"[#wanderlust, #toddlers, #woods, #forest, #nc,...","[0.74691576, 1.2644926, 0.0, 0.23730333, 2.684...",763,"[-0.2177232950925827, -0.5143095254898071, 0.1...",0.420474
223,d5c1c965-47e1-492e-842b-118ea059ba32.jpg,"[#instagram, #nehakakkar, #trending, #beauty, ...","[0.0, 0.0, 0.0, 0.9554576, 2.4236367, 0.069097...",223,"[-0.12365098297595978, 0.07588334381580353, 0....",0.421033


### Next step: after identifying the 5 nearest neighbors, look up their ALS image features (user features) and average them over the 5 neighbors. Then compute the dot product of that average with every hashtag feature vector (item features). Take the hashtags with the highest results as recommendations.

In [44]:
# Check the length of one neighbor's ALS user-feature vector.
# The column is selected by name: a positional column index would silently point
# at a different column if the frame's column order ever changed.
len(fnv['features'].iloc[3])

10

## Find the average of the 5 user features found based on cosine similarity.

In [45]:
# Collect the neighbors' ALS feature vectors from the dataframe into a plain list.

features = list(fnv.features.values)

In [46]:
# Element-wise mean of the neighbors' vectors: axis=0 averages across the
# neighbors and keeps one value per latent dimension.
avg_features = np.mean(np.asarray(features), axis=0)

## Find the dot product with each hashtag features

In [47]:
# Take a look at the hashtag (item) features extracted from the ALS model.
# NOTE(review): the saved output already shows a 'dot_product' column, which is
# only created by a later cell - it appears this cell was last run after that one.
# On a fresh top-to-bottom run the column would not be present here yet.

hashtag_features.head()

Unnamed: 0,id,features,dot_product
0,2,"[-0.0018261333461850882, 0.060863085091114044,...",0.020495
1,12,"[0.02017715387046337, -0.005174570716917515, -...",0.016701
2,22,"[-0.3308730125427246, -0.2467443346977234, 0.0...",0.120115
3,32,"[-0.04363624379038811, -0.04164634644985199, 0...",0.054854
4,42,"[0.034835416823625565, -0.08199584484100342, 0...",-0.022905


In [48]:
# Score every hashtag: add a 'dot_product' column holding the dot product of its
# ALS item vector with the averaged image (user) vector.
# This writes the column onto hashtag_features itself.

hashtag_features['dot_product'] = hashtag_features['features'].apply(lambda x: np.asarray(x).dot(avg_features))

In [49]:
# Keep the 10 hashtag rows with the highest dot product, best score first.

df_ten_highest = hashtag_features.sort_values(by='dot_product', ascending=False).head(10)

In [50]:
# Ids of the recommended hashtags, in descending score order.
rec_hashtag_ids = df_ten_highest.id.values

## Find hashtags that correspond to the ids

In [51]:
# Unpickle the hashtag lookup data from a local file.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.

hashtags_df = pd.read_pickle("hashtags_df.pkl")

In [52]:
# Preview the lookup table: indexed by hashtag id, with the text in the 'hashtag' column.
hashtags_df.head()

Unnamed: 0_level_0,hashtag
id,Unnamed: 1_level_1
0,#marche
1,#culinary
2,#RestauranteVirtual
3,#littium
4,#adventurecat


In [53]:
# Print the hashtag text for every recommended id, best score first.
for hashtag_id in rec_hashtag_ids:
    matches_id = hashtags_df.index == hashtag_id
    # Row mask and column picked in a single .loc call (no chained indexing).
    print(hashtags_df.loc[matches_id, 'hashtag'])

id
39    #love
Name: hashtag, dtype: object
id
8809    #summer
Name: hashtag, dtype: object
id
2441    #fashion
Name: hashtag, dtype: object
id
6319    #instagood
Name: hashtag, dtype: object
id
2480    #photooftheday
Name: hashtag, dtype: object
id
2399    #selfie
Name: hashtag, dtype: object
id
4556    #beautiful
Name: hashtag, dtype: object
id
2864    #instagram
Name: hashtag, dtype: object
id
7336    #happy
Name: hashtag, dtype: object
id
10051    #photography
Name: hashtag, dtype: object
