# Retrieve from preprocessed data

In [None]:
# NOTE(review): prints the literal 1 and nothing else; looks like a leftover
# kernel smoke test -- confirm and consider deleting this cell.
print(1)

In [1]:
import csv
from matplotlib import pyplot as plt
import numpy as np
from skimage import io
from skimage.transform import rescale, resize


# save numpy array as csv file
from numpy import asarray
from numpy import savetxt


# Genre vocabulary for the multi-label target (original note: top 10 genres,
# listed in descending order). Position i here is column i of each label vector.
top_genres = [
    'Drama', 'Comedy', 'Romance', 'Action', 'Crime',
    'Thriller', 'Horror', 'Adventure', 'Documentary', 'Mystery',
]
num_of_genres = len(top_genres)

# Loader settings.
resized_dim = (100, 100)  # target size handed to resize() for every poster
sample_size = 3000        # stop reading once this many posters are accepted

# Accumulators, one entry per accepted poster.
poster_ids, scores = [], []
genres = []          # one 0/1 list of length num_of_genres per poster
images = []          # unflattened image arrays (currently left empty)
flatten_images = []  # flattened pixel vectors; kNN needs a 2-d input matrix
sample_count = 0
# Read ProcessedData.csv row by row (columns used: 0 = poster id, 1 = score,
# 2 = genre text) and collect up to `sample_size` posters. Every accepted row
# appends to poster_ids, scores, genres and flatten_images in lock-step.
# newline='' is what the csv module documents for files passed to csv.reader.
with open('ProcessedData.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        if sample_count >= sample_size:
            break

        # Blank or truncated rows cannot be indexed safely; skip them.
        if len(row) < 3:
            continue
        pid, score, genre = row[0], row[1], row[2]

        # Label vector: 1 where the genre name occurs in this row's genre text.
        genre_vector = [1 if name in genre else 0 for name in top_genres]

        try:
            filepath = f'./SamplePosters/{pid}.jpg'
            image = io.imread(filepath)
            image_resized = resize(image, resized_dim)
        except Exception:
            # Best-effort loading: a missing or unreadable poster drops the row.
            # Deliberately not a bare `except:` so that KeyboardInterrupt /
            # SystemExit still stop this long-running loop.
            continue

        # Keep only 3-channel colour images. Greyscale arrays are 2-d, and any
        # other channel count would give a feature vector of a different length
        # and make the stacked feature matrix ragged.
        if image_resized.ndim != 3 or image_resized.shape[2] != 3:
            continue
        if len(genres) % 500 == 0:
            print(f'Processed image count: {len(genres)}')

        poster_ids.append(pid)
        scores.append(float(score))
        genres.append(genre_vector)
        arr = np.array(image_resized)
#         images.append(arr)
        flatten_images.append(arr.flatten())
        sample_count += 1


# Debug
# define data
# data = asarray(flatten_images)
# label_data = asarray(genres)
# score_data = asarray(scores)
# save to csv file
# savetxt('knn_data.csv', data, delimiter=',')
# savetxt('knn_score.csv', score_data, delimiter=',')
# savetxt('knn_label.csv', label_data, delimiter=',')

print('Done')
# Report the dimensions of the label matrix, then of the feature matrix.
for collected in (genres, flatten_images):
    print(np.array(collected).shape)

Processed image count: 0
Processed image count: 500
Processed image count: 1000
Processed image count: 1500
Processed image count: 2000
Processed image count: 2500
Done
(3000, 10)
(3000, 30000)


# kNN algorithm from sklearn

In [3]:
import numpy as np
from numpy.linalg import norm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score, hamming_loss

# Work with at most the first 3000 collected samples.
labels, data_input = genres[:3000], flatten_images[:3000]

# 80/10/10 split taken as contiguous slices in collection order (no shuffling).
sample_limit = len(labels)
print(f'Sample Size: {sample_limit}')
cutoff, cutoff2 = int(0.8 * sample_limit), int(0.9 * sample_limit)

train_images, train_labels = data_input[:cutoff], labels[:cutoff]
validation_images, validation_labels = data_input[cutoff:cutoff2], labels[cutoff:cutoff2]
test_images, test_labels = data_input[cutoff2:], labels[cutoff2:]

def kNN_training(start, end):
    """Fit and score one kNN classifier per k in range(start, end, 2).

    Each model is fitted on train_images / train_labels and used to predict
    validation_images. Hamming loss, micro-averaged F1 and accuracy_score
    against validation_labels are printed; nothing is returned.
    """
    for k in range(start, end, 2):
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(train_images, train_labels)
        print(f'Training done for {k} neighbors')

        predictions = model.predict(validation_images)

        f1_micro = f1_score(validation_labels, predictions, average='micro', zero_division=1)
        loss = hamming_loss(validation_labels, predictions)
        acc = accuracy_score(validation_labels, predictions)

        print(f'Hamming loss for {k} neighbors: {loss}')
        print(f'F1 micro score for {k} neighbors: {f1_micro}')
        print(f'Accuracy score: {acc}\n')
            
def kNN_test(start, end):
    """Fit one kNN classifier per k in range(start, end, 2) and score it on the test split.

    Each model is fitted on train_images / train_labels and used to predict
    test_images. Hamming loss, micro-averaged F1 and accuracy_score against
    test_labels are printed; nothing is returned.

    Args:
        start: first neighbour count to evaluate.
        end: exclusive upper bound for the neighbour count (step is 2).
    """
    for i in range(start, end, 2):
        num_of_neighbors = i
        neigh = KNeighborsClassifier(n_neighbors=num_of_neighbors)
        neigh.fit(train_images, train_labels)
        print(f'Training done for {num_of_neighbors} neighbors')

        # Predict on the test images. Scoring test_labels against predictions
        # made for validation_images compares labels of unrelated samples.
        predicted_labels = neigh.predict(test_images)

        score = f1_score(test_labels, predicted_labels, average='micro', zero_division=1)
        h_loss = hamming_loss(test_labels, predicted_labels)
        accuracy = accuracy_score(test_labels, predicted_labels)
        print(f'Hamming loss for {num_of_neighbors} neighbors: {h_loss}')
        print(f'F1 micro score for {num_of_neighbors} neighbors: {score}')
        print(f'Accuracy score: {accuracy}\n')
# Results recorded from an earlier run with Sample Size: 452 (that version also printed macro F1)
# F1 micro score for 3 neighbors: 0.4352941176470588
# F1 macro score for 3 neighbors: 0.32395808525839487
# F1 micro score for 5 neighbors: 0.4099378881987578
# F1 macro score for 5 neighbors: 0.2586118056172503
# F1 micro score for 7 neighbors: 0.4026845637583893
# F1 macro score for 7 neighbors: 0.23838656470235414
# F1 micro score for 9 neighbors: 0.3611111111111111
# F1 macro score for 9 neighbors: 0.20873015873015874
# F1 micro score for 11 neighbors: 0.36879432624113473
# F1 macro score for 11 neighbors: 0.21000000000000002


Sample Size: 3000


In [4]:
# Validation sweep: range(3, 9, 2) evaluates k = 3, 5 and 7.
kNN_training(3, 9)

Training done for 3 neighbors
Hamming loss for 3 neighbors: 0.223
F1 micro score for 3 neighbors: 0.32628398791540786
Accuracy score: 0.08

Training done for 5 neighbors
Hamming loss for 5 neighbors: 0.19833333333333333
F1 micro score for 5 neighbors: 0.34972677595628415
Accuracy score: 0.10333333333333333

Training done for 7 neighbors
Hamming loss for 7 neighbors: 0.19133333333333333
F1 micro score for 7 neighbors: 0.3477272727272727
Accuracy score: 0.09666666666666666



In [5]:
# Larger neighbourhoods: range(13, 16, 2) evaluates k = 13 and 15,
# and range(25, 26, 2) evaluates k = 25 only.
kNN_training(13, 16)
kNN_training(25, 26)

Training done for 13 neighbors
Hamming loss for 13 neighbors: 0.192
F1 micro score for 13 neighbors: 0.3110047846889953
Accuracy score: 0.10333333333333333

Training done for 15 neighbors
Hamming loss for 15 neighbors: 0.18966666666666668
F1 micro score for 15 neighbors: 0.31197097944377267
Accuracy score: 0.09666666666666666

Training done for 25 neighbors
Hamming loss for 25 neighbors: 0.18833333333333332
F1 micro score for 25 neighbors: 0.31680773881499397
Accuracy score: 0.09666666666666666



In [5]:
# Single large neighbourhood: range(91, 92, 2) evaluates k = 91 only.
# The k = 35 and k = 45 runs below are left disabled.
# kNN_training(35, 36)
# kNN_training(45, 46)
kNN_training(91, 92)


Training done for 91 neighbors
Hamming loss for 91 neighbors: 0.18233333333333332
F1 micro score for 91 neighbors: 0.3449101796407186
Accuracy score: 0.11333333333333333



## First attempt at random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier


# Shallow random forest (depth 3, fixed seed) fitted on the training split,
# then used to predict the validation images.
clf_10 = RandomForestClassifier(max_depth=3, random_state=0)
clf_10.fit(train_images, train_labels)
rf_predicted_labels = clf_10.predict(validation_images)
print('Done training RF')

In [None]:
# Eyeball the first predictions next to the first validation labels, then
# score the random forest on the validation split.
print(rf_predicted_labels[:15])
print('Val labels: ', validation_labels[:5])
score = f1_score(
    validation_labels,
    rf_predicted_labels,
    average='micro',
    zero_division=1,
)
accuracy = accuracy_score(validation_labels, rf_predicted_labels)
print(f'F1 micro score for random forest : {score}')
print(f'Accuracy score: {accuracy}')

## First attempt at ridge classifier (Disaster)

In [None]:
# from sklearn.linear_model import RidgeClassifierCV

# clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(train_images, train_labels)

# # print(clf.score(train_images, train_labels))

# predicted_labels = clf.predict(validation_images)

# print(validation_labels[0:3])
# print(predicted_labels)

# # score = f1_score(validation_labels, predicted_labels, average='micro', zero_division=1)
# # print(f'F1 score: {score}')


