In [1]:
import numpy as np
import os
import pandas as pd
from skimage.transform import resize
from skimage import io
from PosterExtractor import extract_posters
from poster_knn import PosterKNN
from features import color_histogram_hsv, hog_feature

### Download posters of all movies

In [6]:
# Movie metadata lives under <cwd>/data/the-movies-dataset.
data_path = os.path.join(os.getcwd(), 'data', 'the-movies-dataset')
movies = pd.read_csv(
    os.path.join(data_path, 'movies_preprocessed.csv')
)

# extract_posters hands back something with a length that is reported below as
# the movies that were skipped (the captured log shows skips on HTTP errors).
skipped_posters = extract_posters(
    movies,
    to_folder='/Users/admin/Downloads/extracted_posters_small/',
)

# Report how many posters are missing, followed by the skipped entries themselves.
print('skipped a total of {} movies'.format(len(skipped_posters)))
print(skipped_posters)

http exception - skipping poster of movieId: 35810
http exception - skipping poster of movieId: 77621
http exception - skipping poster of movieId: 121351
http exception - skipping poster of movieId: 106605
http exception - skipping poster of movieId: 156415
http exception - skipping poster of movieId: 38585
http exception - skipping poster of movieId: 23022
http exception - skipping poster of movieId: 53571
http exception - skipping poster of movieId: 79968
http exception - skipping poster of movieId: 140470
http exception - skipping poster of movieId: 242115
http exception - skipping poster of movieId: 55602
http exception - skipping poster of movieId: 31772
skipped a total of 13 movies
[35810, 77621, 121351, 106605, 156415, 38585, 23022, 53571, 79968, 140470, 242115, 55602, 31772]


### Load Poster data into raw vector representation

In [2]:
# Folder holding the poster image files; this is the same literal path that the
# download cell passes to extract_posters as `to_folder`.
# The trailing slash matters wherever a file path is formed by plain string
# concatenation with a file name.
posters_dir = '/Users/admin/Downloads/extracted_posters_small/'

def image_to_raw_vector_flattened(image_url):
    """Read a poster and return it as a flat vector of raw pixel intensities.

    The image is forced to three colour channels before resizing: ``resize``
    with a 2-element output shape keeps the input's channel count, so
    grayscale or RGBA posters would otherwise produce vectors of a different
    length than RGB posters and could not be stacked into one array.

    Parameters
    ----------
    image_url : str
        Path or URL understood by ``skimage.io.imread``.

    Returns
    -------
    numpy.ndarray
        1-D array of length 48 * 32 * 3.
    """
    image = io.imread(image_url)
    if image.ndim == 2:
        # Grayscale: replicate the single channel into R, G and B.
        image = np.stack([image] * 3, axis=-1)
    elif image.shape[-1] == 4:
        # RGBA: drop the alpha channel.
        image = image[..., :3]
    # 48x32 keeps roughly the poster aspect ratio; other ratio-preserving
    # sizes would be (56,37), (278,185), ...
    img_resize = resize(image, (48, 32))
    return img_resize.flatten()

def image_to_raw_vector(image_url):
    """Read a poster and return it resized to a 32 x 32 x 3 array.

    The target is square on purpose: per the original note, width and height
    need to be the same for this representation.
    """
    return resize(io.imread(image_url), (32, 32, 3))

def load_movie_posters_data(features=False):
    """Load every poster found in ``posters_dir`` together with its movie id.

    Parameters
    ----------
    features : bool, optional
        If True, each poster is loaded with ``image_to_raw_vector``
        (unflattened); otherwise with ``image_to_raw_vector_flattened``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The image data and the matching integer movie ids, parsed from the
        part of each file name before the first dot. Entries are in sorted
        file-name order.
    """
    loader = image_to_raw_vector if features else image_to_raw_vector_flattened
    images = []
    ids = []
    # os.listdir returns entries in arbitrary order; sorting makes repeated
    # loads line up with each other and keeps runs reproducible.
    for file in sorted(os.listdir(posters_dir)):
        stem = file.split('.')[0]
        path = os.path.join(posters_dir, file)
        # Skip anything that is not a "<movieId>.<ext>" file, e.g. hidden
        # files such as .DS_Store, which would otherwise crash int('').
        if not stem.isdigit() or not os.path.isfile(path):
            continue
        images.append(loader(path))
        ids.append(int(stem))
    return np.array(images), np.array(ids)

# Unflattened posters: input for the feature extraction further down.
image_data_X, image_data_y = load_movie_posters_data(features=True)

# Flattened raw pixels: required by the feature-less kNN recommender below.
# Without this load that cell fails with a NameError on a fresh kernel.
# Holding both representations is memory-hungry for the full poster set.
image_data_X_flat, image_data_y_flat = load_movie_posters_data()

# Both recommenders are trained against image_data_y, so the two loads must
# list the posters in exactly the same order.
assert np.array_equal(image_data_y, image_data_y_flat)

In [3]:
# Sanity check: shape of the loaded poster data, then shape of the id array.
print(image_data_X.shape, image_data_y.shape, sep='\n')
#print(image_data_X_flat.shape)

(44598,)
(44598,)


### Convert from raw pixels into feature representations

In [8]:
# Explicit import instead of `from features import *`, so it is clear where
# extract_features comes from and nothing else is pulled into the namespace.
from features import extract_features

num_color_bins = 10 # Number of bins in the color histogram
feature_fns = [hog_feature, lambda img: color_histogram_hsv(img, nbin=num_color_bins)]
# One row of concatenated HOG + colour-histogram features per poster
# (the code below relies on a 2-D array with one row per image).
image_data_feats = extract_features(image_data_X, feature_fns, verbose=True)

# Preprocessing: Subtract the mean feature
mean_feat = np.mean(image_data_feats, axis=0, keepdims=True)
image_data_feats -= mean_feat

# Preprocessing: Divide by standard deviation. This ensures that each feature
# has roughly the same scale.
std_feat = np.std(image_data_feats, axis=0, keepdims=True)
# A feature that is constant across all posters has zero deviation; dividing
# by it would turn the whole column into NaN. Leave such columns at zero.
std_feat[std_feat == 0] = 1.0
image_data_feats /= std_feat

# Preprocessing: Add a bias dimension
image_data_feats = np.hstack([image_data_feats, np.ones((image_data_feats.shape[0], 1))])

Done extracting features for 1000 / 44598 images
Done extracting features for 2000 / 44598 images
Done extracting features for 3000 / 44598 images
Done extracting features for 4000 / 44598 images
Done extracting features for 5000 / 44598 images
Done extracting features for 6000 / 44598 images
Done extracting features for 7000 / 44598 images
Done extracting features for 8000 / 44598 images
Done extracting features for 9000 / 44598 images
Done extracting features for 10000 / 44598 images
Done extracting features for 11000 / 44598 images
Done extracting features for 12000 / 44598 images
Done extracting features for 13000 / 44598 images
Done extracting features for 14000 / 44598 images
Done extracting features for 15000 / 44598 images
Done extracting features for 16000 / 44598 images
Done extracting features for 17000 / 44598 images
Done extracting features for 18000 / 44598 images
Done extracting features for 19000 / 44598 images
Done extracting features for 20000 / 44598 images
Done extr

In [4]:
# Persist the feature matrix and the matching movie ids so later runs can
# reuse them; np.save appends the ".npy" extension to the given file names.
np.save(file='image_features_X', arr=image_data_feats)
np.save(file='image_ids_y', arr=image_data_y)

##### Note: For all 40k+ images, the raw representations take more than 2 GB of space. The following code was therefore run on only a subset of the posters.

### Initialize the kNN recommenders for posters - with and without extracted features

In [12]:
# Recommender trained on the preprocessed feature rows in image_data_feats.
recommender_feats = PosterKNN()
# image_data_y carries the movie id of each row; presumably these are the ids
# that recommend() hands back later - verify against poster_knn.
recommender_feats.train(image_data_feats,image_data_y)

In [7]:
# Baseline recommender trained on raw pixels instead of extracted features.
recommender = PosterKNN()
# Expects image_data_X_flat to exist: the flattened poster array returned by
# load_movie_posters_data() when called with features=False (its default).
recommender.train(image_data_X_flat,image_data_y)

### Compare results

##### Compare the k nearest neighbours returned by both recommenders and count, for each test movie, how many of the recommended movie ids differ

In [24]:
k = 10
test_ids = [32672, 862, 5833, 15602, 31357, 11862, 108]

# For every test movie, count how many ids recommended by the feature-based
# kNN are absent from the raw-pixel kNN's recommendations.
differences = []
# `movie_id` rather than `id`: the loop variable stays in the notebook
# namespace after the loop and would otherwise shadow the builtin id().
for movie_id in test_ids:
    similar1 = recommender_feats.recommend(movie_id, k)
    similar2 = recommender.recommend(movie_id, k)
    differences.append(len(set(similar1) - set(similar2)))

print(differences)

[0, 9, 0, 9, 9, 7, 9]


### Test the recommender on Toy Story (id: 862) and open the k=8 most similar posters

In [15]:
# Query with movie id 862, the id this section's heading gives for Toy Story.
similar = recommender_feats.recommend(862, 8) # Toy Story, 8 most similar

In [16]:
from matplotlib.pyplot import imshow
from PIL import Image

%matplotlib inline
for v in similar:
    pil_im = Image.open(posters_dir+str(v)+'.jpg', 'r')
    pil_im.show()