In [26]:
import numpy as np
import pandas as pd
import os
from itertools import combinations
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split

### Functions

In [25]:
def get_poster2(movie_id):
    poster_path = os.path.join(f"posters\\{movie_id}.jpg")
    img = mpimg.imread(poster_path)
    return img
def indicator_function_1(data):
    if (data >= 1970) and (data < 1980):
        return 1
    else:
        return 0 
def indicator_function_2(data):
    if (data >= 1980) and (data < 1990):
        return 1
    else:
        return 0 
def indicator_function_3(data):
    if (data >= 1990) and (data < 2000):
        return 1
    else:
        return 0
     
def indicator_function_4(data):
    if (data >= 2000) and (data < 2010):
        return 1
    else:
        return 0    
def indicator_function_5(data):
    if (data >= 2010) and (data < 2020):
        return 1
    else:
        return 0    
def indicator_function_6(data):
    if (data >= 2020) and (data < 2030):
        return 1
    else:
        return 0
def evaluate_auroc(y_pred, y_true, genres_names):
    roc_auc_scores = {}
    for i in range(y_true.shape[1]):  # Iterate over each label
        roc_auc_scores[genres_names[i]] = roc_auc_score(y_true[:, i], y_pred[:, i])
    
    results = pd.DataFrame(roc_auc_scores.items(), columns=["Genre", "ROC AUC"])
    print(results)
    average_roc_auc = np.mean(list(roc_auc_scores.values()))
    print(f"Average ROC AUC: {average_roc_auc:.6f}")
    
    for i in range(y_true.shape[1]):
        fpr, tpr, _ = roc_curve(y_true[:, i], y_pred[:, i])
        plt.plot(fpr, tpr, label=genres_names[i])
    plt.plot([0, 1], [0, 1], color="red", linestyle="--", label="Baseline")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"{genres_names[0]} {genres_names[1]} {genres_names[2]} ROC curve")
    plt.legend()
    plt.show()

### Model

In [24]:
import tensorflow
from tensorflow.keras import layers, models
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, BatchNormalization, GroupNormalization, Dropout, Conv2D, MaxPooling2D, Flatten, Concatenate, Input
n_genre = 3

model = Sequential()
model.add(Conv2D(32, (3, 3), activation ='relu', padding = 'Same', input_shape =(345, 230, 3))) #1
model.add(GroupNormalization())
model.add(Conv2D(32, (3, 3), activation ='relu', padding = 'Same')) #2
model.add(GroupNormalization())
model.add(Conv2D(32, (3, 3), activation ='relu', padding = 'Same')) #3
model.add(GroupNormalization())
model.add(MaxPooling2D((2, 2)))
    
model.add(Conv2D(32, (3, 3), activation ='relu', padding = 'Same')) #4
model.add(GroupNormalization())
model.add(Conv2D(32, (3, 3), activation ='relu', padding = 'Same')) #5
model.add(GroupNormalization())
model.add(Conv2D(32, (3, 3), activation ='relu', padding = 'Same')) #6
model.add(GroupNormalization())
model.add(MaxPooling2D((2, 2)))
    
model.add(Conv2D(32, (3, 3), activation ='relu', padding = 'Same')) #7
model.add(GroupNormalization())
model.add(Conv2D(32, (3, 3), activation ='relu', padding = 'Same')) #8
model.add(GroupNormalization())
model.add(Conv2D(32, (3, 3), activation ='relu', padding = 'Same')) #9
model.add(GroupNormalization())
model.add(MaxPooling2D((2, 2)))
    
model.add(Conv2D(64, (3, 3), activation ='relu', padding = 'Same')) #10
model.add(GroupNormalization())
model.add(Conv2D(64, (3, 3), activation ='relu', padding = 'Same')) #11
model.add(GroupNormalization())
model.add(Conv2D(64, (3, 3), activation ='relu', padding = 'Same')) #12
model.add(GroupNormalization())
model.add(MaxPooling2D((2, 2)))
    
model.add(Flatten())
indicator_inputs = Input(shape=(6,))
concatenated_inputs = Concatenate()([model.output, indicator_inputs])
    
x = BatchNormalization()(concatenated_inputs)
x = Dense(32, activation='relu')(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(n_genre, activation='softmax')(x)

model = Model(inputs=[model.input, indicator_inputs], outputs=output)
model.compile(optimizer='adam', loss=tensorflow.keras.losses.CategoricalCrossentropy(), metrics=['accuracy'])
# model.summary()

### Implementation

In [5]:
np.random.seed(4749)
genre_choices = ['_Action', '_Animation', '_Comedy', '_Documentary', '_Drama', '_Horror', '_Music', '_Romance']
genre_choices_list = list(combinations(genre_choices, 3))
# 17: ["Action", "Drama", "Romance"]
# 42: ["Comedy", "Drama", "Romance"]
# 52: ["Drama", "Horror", "Music"] 
best_models_index = [17, 42, 52]
for i in best_models_index:
    genre_rank_filter = np.array(genre_choices_list[i])
    print(genre_rank_filter)
    
    n_genre = 3
    IMG_SIZE = (345, 230, 3)
    size = 3000
    test_portion = 0.2
    
    genre_df = pd.read_csv("encoded_genres.csv", index_col=0)
    genre_df["date"] = pd.to_datetime(genre_df["date"]) 
    genre_df = genre_df[genre_df["date"].dt.year >= 2000]
    genre_df_filtered = genre_df[genre_rank_filter]
    genre_filtered = genre_df_filtered[np.sum(genre_df_filtered * 1, axis=1) == 1]
    movie_img = []
    movie_genre = []
    year_list = []
    
    for i in range(n_genre):
        genre_i = genre_filtered.loc[genre_filtered.iloc[:, i] == True]
        rand = np.random.randint(0, genre_i.shape[0], size=(size + 10))
        n_image = 0
        j = 0
        while n_image < size:
            index = genre_filtered.index[rand[j]]
            year = datetime.strptime(genre_df["date"].iloc[np.where(genre_df.index == index)[0].item()],
                                     "%Y-%m-%d").year
            temp = get_poster2(index)
            if sum(temp.shape) == sum(IMG_SIZE):
                movie_img.append(temp)
                movie_genre.append(genre_i.iloc[rand[j], :n_genre])
                n_image += 1
                year_list.append(year)
            j = j + 1
    
    year_list = np.array(year_list)
    year_list = np.reshape(year_list, (size*n_genre, 1))
    movie_img = np.array(movie_img, dtype=float)
    movie_genre = np.array(movie_genre, dtype=float).reshape((size * n_genre, n_genre))
    genre_year = np.hstack((movie_genre, year_list))
    print("Image fetched---------------------------------")
    X_train, X_test, y_train, y_test = train_test_split(movie_img, genre_year, test_size=test_portion, random_state=4748)
    X_train = X_train / 255.0
    X_test = X_test / 255.0
    year_list_train = y_train[:, n_genre:].astype(int)
    y_train = y_train[:, :n_genre]
    year_list_test = y_test[:, n_genre:].astype(int)
    y_test = y_test[:, :n_genre]
    print("Train-test Splited---------------------------------")
    train_size = int(size * n_genre * (1 - test_portion))
    indicator_1 = [indicator_function_1(x) for x in year_list_train]
    indicator_1 = np.reshape(indicator_1, (train_size, 1))
    indicator_2 = [indicator_function_2(x) for x in year_list_train]
    indicator_2 = np.reshape(indicator_2, (train_size, 1))
    indicator_3 = [indicator_function_3(x) for x in year_list_train]
    indicator_3 = np.reshape(indicator_3, (train_size, 1))
    indicator_4 = [indicator_function_4(x) for x in year_list_train]
    indicator_4 = np.reshape(indicator_4, (train_size,1))
    indicator_5 = [indicator_function_5(x) for x in year_list_train]
    indicator_5 = np.reshape(indicator_5, (train_size,1))
    indicator_6 = [indicator_function_6(x) for x in year_list_train]
    indicator_6 = np.reshape(indicator_6, (train_size,1))
    
    train_indicators = np.concatenate((indicator_1, indicator_2, indicator_3, indicator_4, indicator_5, indicator_6), axis=1)
    training_model = model.fit([X_train, train_indicators], y_train, batch_size=32, epochs=5, validation_split=0.2)
    
    test_size = int(size*n_genre*test_portion)
    indicator_1 = [indicator_function_1(x) for x in year_list_test]
    indicator_1 = np.reshape(indicator_1, (test_size,1))
    indicator_2 = [indicator_function_2(x) for x in year_list_test]
    indicator_2 = np.reshape(indicator_2, (test_size,1))
    indicator_3 = [indicator_function_3(x) for x in year_list_test]
    indicator_3 = np.reshape(indicator_3, (test_size,1))
    indicator_4 = [indicator_function_4(x) for x in year_list_test]
    indicator_4 = np.reshape(indicator_4, (test_size,1))
    indicator_5 = [indicator_function_5(x) for x in year_list_test]
    indicator_5 = np.reshape(indicator_5, (test_size,1))
    indicator_6 = [indicator_function_6(x) for x in year_list_test]
    indicator_6 = np.reshape(indicator_6, (test_size,1))
    
    test_indicators = np.concatenate((indicator_1, indicator_2, indicator_3, indicator_4, indicator_5, indicator_6), axis=1)
    y_pred = model.predict([X_test,test_indicators])
    pred_y = np.argmax(y_pred, axis=1)
    true_y = np.argmax(y_test, axis=1)
    accuracy = np.mean(true_y == pred_y)
    confusion = confusion_matrix(true_y, pred_y)
    
    print("Accuracy:", accuracy)
    print("Confusion Matrix:")
    print(confusion)
    
    evaluate_auroc(y_pred, y_test, genre_rank_filter)

['_Comedy' '_Drama' '_Romance']
Image fetched---------------------------------
Train-test Splited---------------------------------
Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.41944444444444445
Confusion Matrix:
[[125 288 188]
 [122 334 142]
 [117 188 296]]


NameError: name 'evaluate_auroc' is not defined