# Retrieve from preprocessed data

In [2]:
import csv
import os
from matplotlib import pyplot as plt
import numpy as np
from skimage import io

# Top 10 genres in descending order of frequency.
top_genres = ['Drama', 'Comedy', 'Romance', 'Action', 'Crime', 'Thriller','Horror', 'Adventure', 'Documentary', 'Mystery']
num_of_genres = len(top_genres)

# The useful outputs are collected in the parallel lists below: index k of
# every list refers to the same poster.

# 1-d vectors
poster_ids = []
scores = []   # kept exactly as read from the CSV (strings, not floats)
genres = []   # dimension: n x num_of_genres, multi-hot 0/1 label vectors

# One (height, width, 3) numpy array per accepted poster.
images = []

# flatten_images is used for kNN since it only accepts a 2-d array.
# NOTE(review): this only stacks into a rectangular array if every poster has
# the same pixel dimensions -- presumably guaranteed by preprocessing; confirm.
flatten_images = []
line_count = 0

# Set to 500 first to keep the experiment small.
# NOTE(review): the `>` comparison below lets line_limit + 1 rows through;
# presumably the extra row covers a CSV header line -- confirm before changing.
line_limit = 500

# newline='' is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly.
with open('ProcessedData.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        if line_count > line_limit:
            break
        line_count += 1

        # Blank or truncated rows cannot be unpacked into id/score/genre.
        if len(row) < 3:
            continue

        pid = row[0]
        score = row[1]
        genre = row[2]

        # Create the label vector: 1 where the genre name occurs in the row's
        # genre field (substring match), else 0.
        genre_vector = [1 if name in genre else 0 for name in top_genres]

        filepath = f'./SamplePosters/{pid}.jpg'
        try:
            image = io.imread(filepath)
        except Exception:
            # Best effort: posters that are missing or unreadable are skipped.
            # `Exception` (rather than a bare except) keeps Ctrl-C working.
            #print(f'{pid} not found')
            continue

        # Remove greyscale images. A greyscale poster is loaded as a 2-d array,
        # so the rank must be checked before the channel axis is indexed;
        # anything that is not exactly 3-channel is skipped as well.
        if image.ndim != 3 or image.shape[2] != 3:
            continue

        poster_ids.append(pid)
        scores.append(score)
        genres.append(genre_vector)
        arr = np.asarray(image)
        images.append(arr)
        flatten_images.append(arr.flatten())

# Debug
# print(poster_ids[0], images[0].shape)
# plt.imshow(images[0], interpolation='nearest')
# plt.show()

print('Done')
print(np.array(genres).shape)
print(np.array(flatten_images).shape)

Done
(452, 10)
(452, 146328)


# kNN algorithm from sklearn

In [3]:
import numpy as np
from numpy.linalg import norm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss

# Hold out the tail of the dataset: the first 90% of samples train the model
# and the remaining 10% are used for testing.
# TODO: might need to carve out a validation split as well.
sample_limit = len(genres)
print(f'Sample Size: {sample_limit}')
cutoff = int(sample_limit * 0.9)
train_images, test_images = flatten_images[:cutoff], flatten_images[cutoff:]
train_labels, test_labels = genres[:cutoff], genres[cutoff:]

def kNN_training(start, end):
    """Fit and evaluate a kNN classifier for several neighbor counts.

    For every neighbor count in ``range(start, end, 2)`` a
    ``KNeighborsClassifier`` is fitted on the module-level ``train_images`` /
    ``train_labels`` and evaluated on ``test_images`` / ``test_labels``.
    For each count the function prints the first two true and predicted label
    vectors, the hamming loss, and the micro- and macro-averaged F1 scores.

    Args:
        start: First neighbor count to try.
        end: Exclusive upper bound on the neighbor count.

    Returns:
        None. Results are only printed.
    """
    for num_of_neighbors in range(start, end, 2):
        neigh = KNeighborsClassifier(n_neighbors=num_of_neighbors)
        neigh.fit(train_images, train_labels)

        # Predict the whole test set in one call instead of one sample at a
        # time. list(...) splits the result back into one label array per
        # sample, so the printed preview keeps its original format.
        predictions = list(neigh.predict(test_images))

        print(test_labels[0:2])
        print(predictions[0:2])
        #     print(np.array(test_labels).shape)
        #     print(np.array(predictions).shape)

        # zero_division=1 is passed through to f1_score for labels whose
        # score would otherwise be undefined.
        score = f1_score(test_labels, predictions, average='micro', zero_division=1)
        macro_score = f1_score(test_labels, predictions, average='macro', zero_division=1)
        h_loss = hamming_loss(test_labels, predictions)
        print(f'Average hamming loss, fraction of incorrectly predicted labels: {h_loss}')
        print(f'F1 micro score for {num_of_neighbors} neighbors: {score}')
        print(f'F1 macro score for {num_of_neighbors} neighbors: {macro_score}')
            
# Evaluate the odd neighbor counts 3, 5, 7 and 9 (the end bound 10 is exclusive
# and kNN_training steps by 2).
kNN_training(3, 10)

# Results for Sample Size: 452
# NOTE(review): the 11-neighbor figures below cannot come from the call above,
# which stops at 9 -- presumably they were recorded from an earlier run with a
# larger end bound; confirm or remove.
# F1 micro score for 3 neighbors: 0.4352941176470588
# F1 macro score for 3 neighbors: 0.32395808525839487
# F1 micro score for 5 neighbors: 0.4099378881987578
# F1 macro score for 5 neighbors: 0.2586118056172503
# F1 micro score for 7 neighbors: 0.4026845637583893
# F1 macro score for 7 neighbors: 0.23838656470235414
# F1 micro score for 9 neighbors: 0.3611111111111111
# F1 macro score for 9 neighbors: 0.20873015873015874
# F1 micro score for 11 neighbors: 0.36879432624113473
# F1 macro score for 11 neighbors: 0.21000000000000002


Sample Size: 452
[[1, 0, 0, 1, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0]), array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
Average hamming loss, fraction of incorrectly predicted labels: 0.20869565217391303
F1 micro score for 3 neighbors: 0.4352941176470588
F1 macro score for 3 neighbors: 0.32395808525839487
[[1, 0, 0, 1, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0]), array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
Average hamming loss, fraction of incorrectly predicted labels: 0.20652173913043478
F1 micro score for 5 neighbors: 0.4099378881987578
F1 macro score for 5 neighbors: 0.2586118056172503
[[1, 0, 0, 1, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
Average hamming loss, fraction of incorrectly predicted labels: 0.1934782608695652
F1 micro score for 7 neighbors: 0.4026845637583893
F1 macro score for 7 neighbors: 0.2383865647