# Retrieve from preprocessed data

In [3]:
import csv
from matplotlib import pyplot as plt
import numpy as np
from skimage import io
from skimage.transform import rescale, resize


# Top 10 genres in descending order. Position i in this list is position i in
# every label vector built below.
top_genres = ['Drama', 'Comedy', 'Romance', 'Action', 'Crime', 'Thriller','Horror', 'Adventure', 'Documentary', 'Mystery']
num_of_genres = len(top_genres)

# The useful outputs are saved in the parallel lists below: index k in each list
# refers to the same accepted poster.
poster_ids = []      # CSV column 0
scores = []          # CSV column 1, kept as the raw string produced by csv.reader
genres = []          # one 0/1 vector of length len(top_genres) per poster

# Resized image arrays; only (100, 100, 3) arrays are accepted (see the filter below).
images = []

# Same images flattened to 1-D, because the estimators used later need a 2-D
# (n_samples, n_features) input.
flatten_images = []

line_count = 0
resized_dim = (100, 100)
expected_channels = 3

# Upper bound on the number of CSV rows read; raise it to use more of the data set.
# NOTE(review): the check below runs before the increment and uses `>`, so
# line_limit + 1 rows are admitted. Left unchanged in case the first row is a
# header -- confirm and tighten to `>=` if it is not.
line_limit = 100

with open('ProcessedData.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        if line_count > line_limit:
            break
        line_count += 1

        # A blank or truncated row has no id/score/genre triple to read.
        if len(row) < 3:
            continue

        pid, score, genre = row[0], row[1], row[2]

        # Multi-hot label vector: 1 where the genre name occurs in the genre
        # field (substring test on the raw CSV string), 0 otherwise.
        genre_vector = [1 if name in genre else 0 for name in top_genres]

        try:
            filepath = f'./SamplePosters/{pid}.jpg'
            image = io.imread(filepath)
            image_resized = resize(image, resized_dim)
        except Exception:
            # Best effort: a missing or unreadable poster is skipped. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
            continue

        # Keep only 3-channel colour images. Greyscale images have no channel
        # axis, and any other channel count would flatten to a different length
        # and make flatten_images ragged.
        if image_resized.ndim != 3 or image_resized.shape[2] != expected_channels:
            continue
        if len(genres) % 200 == 0:
            print(f'Processed image count: {len(genres)}')

        poster_ids.append(pid)
        scores.append(score)
        genres.append(genre_vector)
        arr = np.asarray(image_resized)
        images.append(arr)
        flatten_images.append(arr.flatten())

# Debug: dump the first one or two flattened images to knn.csv for manual inspection.
# newline='' is what the csv module documentation asks for on files handed to
# csv.writer, so that the writer alone controls the line endings.
with open('knn.csv', mode='w', newline='') as file:
    debug_writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # Guard each row: if fewer than two posters were loaded, indexing
    # flatten_images[0] / [1] unconditionally would raise IndexError.
    if len(flatten_images) > 0:
        debug_writer.writerow([poster_ids[0], flatten_images[0]])
    if len(flatten_images) > 1:
        debug_writer.writerow([flatten_images[1]])

print('Done')
# Shapes of the label matrix and the feature matrix built from the loaded posters.
print(np.array(genres).shape)
print(np.array(flatten_images).shape)

Processed image count: 0
Processed image count: 200
Processed image count: 400
Processed image count: 600
Processed image count: 800
Processed image count: 1000
Processed image count: 1200
Processed image count: 1400
Processed image count: 1600
Processed image count: 1800
Processed image count: 2000
Processed image count: 2200
Processed image count: 2400
Processed image count: 2600
Done
(2694, 10)
(2694, 30000)


# kNN algorithm from sklearn

In [12]:
import numpy as np
from numpy.linalg import norm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score, hamming_loss

# Split the loaded samples 80:10:10 into train / validation / test, in file
# order (no shuffling is applied here).
sample_limit = len(genres)
print(f'Sample Size: {sample_limit}')

# Slice boundaries: the first 80% is training data, the next 10% is validation
# data and the remainder is test data.
cutoff, cutoff2 = (int(fraction * sample_limit) for fraction in (0.8, 0.9))

train_images, train_labels = flatten_images[:cutoff], genres[:cutoff]
validation_images, validation_labels = flatten_images[cutoff:cutoff2], genres[cutoff:cutoff2]
test_images, test_labels = flatten_images[cutoff2:], genres[cutoff2:]

def kNN_training(start, end):
    """Fit and evaluate a kNN classifier for each k in range(start, end, 2).

    For every k, a new KNeighborsClassifier is fitted on the module-level
    train_images / train_labels and used to predict validation_images. The
    first three expected and predicted label vectors are printed, followed by
    the hamming loss, the micro-averaged F1 score and the accuracy score
    measured against validation_labels.

    Args:
        start: first number of neighbours to try.
        end: exclusive upper bound on the number of neighbours.

    Returns:
        None. All results are reported through print().
    """
    for k in range(start, end, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(train_images, train_labels)
        print('Fitting done')
        predictions = knn.predict(validation_images)

        # Quick visual comparison of expected vs. predicted label vectors.
        print(validation_labels[:3])
        print(predictions[:3])

        micro_f1 = f1_score(validation_labels, predictions, average='micro', zero_division=1)
        h_loss = hamming_loss(validation_labels, predictions)
        acc = accuracy_score(validation_labels, predictions)

        print(f'Average hamming loss, fraction of incorrectly predicted labels: {h_loss}')
        print(f'F1 micro score for {k} neighbors: {micro_f1}')
        print(f'Accuracy score: {acc}')
            
# range(5, 6, 2) contains only 5, so this evaluates a single model with k = 5.
kNN_training(5, 6)

# Results recorded from an earlier run with Sample Size: 452, kept for reference.
# NOTE(review): kNN_training as written prints only the micro score, so the macro
# figures below presumably come from an earlier version of the function -- confirm.
# F1 micro score for 3 neighbors: 0.4352941176470588
# F1 macro score for 3 neighbors: 0.32395808525839487
# F1 micro score for 5 neighbors: 0.4099378881987578
# F1 macro score for 5 neighbors: 0.2586118056172503
# F1 micro score for 7 neighbors: 0.4026845637583893
# F1 macro score for 7 neighbors: 0.23838656470235414
# F1 micro score for 9 neighbors: 0.3611111111111111
# F1 macro score for 9 neighbors: 0.20873015873015874
# F1 micro score for 11 neighbors: 0.36879432624113473
# F1 macro score for 11 neighbors: 0.21000000000000002


Sample Size: 2694
Fitting done
[[0, 1, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0]]
[[0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
Average hamming loss, fraction of incorrectly predicted labels: 0.21486988847583643
F1 micro score for 5 neighbors: 0.2775
Accuracy score: 0.07434944237918216


## First attempt at random forest

In [26]:
from sklearn.ensemble import RandomForestClassifier


# Random forest with max_depth=3 and a fixed random_state, fitted on the
# training split and then used to predict the validation split.
clf_10 = RandomForestClassifier(max_depth=3, random_state=0)
clf_10.fit(train_images, train_labels)
rf_predicted_labels = clf_10.predict(validation_images)
print('Done training RF')

Done training RF


In [27]:
# Checks: show a sample of predicted and expected label vectors, then score the
# random forest predictions against the validation labels.
print(rf_predicted_labels[:15])
print('Val labels: ', validation_labels[:5])

accuracy = accuracy_score(validation_labels, rf_predicted_labels)
score = f1_score(
    validation_labels,
    rf_predicted_labels,
    average='micro',
    zero_division=1,
)

print(f'F1 micro score for random forest : {score}')
print(f'Accuracy score: {accuracy}')

[[1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]]
Val labels:  [[0, 1, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 0, 0, 0], [1, 1, 0, 0, 1, 0, 0, 0, 0, 0]]
F1 micro score for random forest : 0.37440758293838866
Accuracy score: 0.10037174721189591


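## First attempt at ridge classifier (Disaster)

This experiment is kept commented out because it failed: as the saved output below shows, the predictions came back as a single class index per sample, while the validation labels are 10-element multi-hot vectors, so scoring them raised the `ValueError` shown at the end of this section.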

In [10]:
# from sklearn.linear_model import RidgeClassifierCV

# clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(train_images, train_labels)

# # print(clf.score(train_images, train_labels))

# predicted_labels = clf.predict(validation_images)

# print(validation_labels[0:3])
# print(predicted_labels)

# # score = f1_score(validation_labels, predicted_labels, average='micro', zero_division=1)
# # print(f'F1 score: {score}')




[[0, 1, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0]]
[1 0 5 3 3 2 0 6 1 3 7 6 6 0 0 0 1 1 0 6 0 5 1 0 0 1 0 3 3 0 2 6 0 4 1 0 1
 2 0 0 0 3 0 5 4 1 4 6 0 4 0 0 4 0 1 3 2 4 0 3 0 0 0 0 4 0 0 1 0 6 5 4 3 0
 3 1 1 1 1 6 1 0 0 0 0 0 2 1 0 0 0 0 0 1 6 3 0 0 0 2 6 0 6 0 5 0 0 0 2 4 2
 6 3 5 0 0 7 3 0 2 0 1 0 0 1 5 1 0 1 0 0 0 0 5 0 0 1 1 1 0 1 7 2 0 0 0 0 1
 0 0 0 2 0 1 7 0 0 2 1 0 2 1 2 1 0 0 3 1 3 4 0 4 3 1 0 6 0 0 0 4 3 0 0 3 3
 0 0 0 0 1 4 0 0 6 0 3 6 0 0 0 3 0 2 0 0 0 0 0 5 1 0 1 0 0 0 1 2 2 1 0 0 3
 3 1 0 2 0 1 0 1 5 1 0 1 3 0 0 1 0 3 6 0 1 5 0 0 5 0 1 1 1 0 0 0 1 3 4 0 0
 1 1 0 0 1 0 0 0 1 0]


ValueError: Classification metrics can't handle a mix of multilabel-indicator and multiclass targets