In [1]:
import tensorflow as tf
import pandas as pd
import glob
import cv2
import numpy as np
import pymongo

In [None]:
from keras.applications.inception_v3 import InceptionV3

In [3]:
# Connect to MongoDB using the client's default connection settings.
mc = pymongo.MongoClient()

In [4]:
# Database that holds this notebook's collections.
db = mc['model']

In [5]:
# Collection of per-frame documents (label columns + deep features) that the
# storage cell further down inserts into and the loading loop reads back.
deep_features_db = db['labels_deep_features']

In [6]:
# Number of documents currently stored; used below to size the preallocated arrays.
n_frames = deep_features_db.count_documents({})

In [None]:
# List the collections in the database. `Database.collection_names()` was
# deprecated in PyMongo 3.7 and removed in 4.0; `list_collection_names()` is
# the replacement and is available in every version that has `count_documents`.
db.list_collection_names()

In [None]:
# ImageNet-pretrained InceptionV3 without its classification head; with
# pooling='avg' the network outputs one pooled feature vector per input image.
model = InceptionV3(include_top=False, weights='imagenet', input_shape=(299,299,3), pooling='avg')

In [None]:
# Per-movie label table. Later cells rely on it having a 'movie_title' column
# (merge key in create_movie_df, lookup key in store_deep_features).
df = pd.read_csv('movie_genre_matrix.csv')

In [None]:
def png_to_numpy(movie_name):
    """Load the middle portion of a movie's trailer frames as images.

    Frames are the PNG files under ``trailer_test/`` whose names start with
    ``movie_name``, taken in sorted filename order. The first and the last
    ``len // 3`` files are dropped and the remaining ones are read.

    Parameters
    ----------
    movie_name : str
        Filename prefix identifying the movie. It is matched literally.

    Returns
    -------
    list
        One ``cv2.imread`` result per kept file, in filename order. The list
        is empty when no file matches. ``cv2.imread`` yields ``None`` for a
        file it cannot read, so callers should be prepared for that.
    """
    # Escape glob metacharacters so titles containing '[', ']', '?' or '*'
    # match their own files instead of being interpreted as a pattern.
    pattern = f'trailer_test/{glob.escape(movie_name)}*.png'
    frame_paths = sorted(glob.glob(pattern))

    # Use explicit bounds. The previous `[n//3:-n//3]` parsed the stop as
    # `(-n)//3`, which rounds away from zero: it trimmed more from the end
    # than from the start and returned nothing for a single-frame trailer.
    trim = len(frame_paths) // 3
    kept_paths = frame_paths[trim:len(frame_paths) - trim]

    return [cv2.imread(path) for path in kept_paths]
    

In [None]:
def create_movie_df(list_of_movies):
    """Build a frame-level DataFrame joined with each movie's label columns.

    Parameters
    ----------
    list_of_movies : iterable of str
        Movie titles; each is passed to ``png_to_numpy`` to load its frames.

    Returns
    -------
    pandas.DataFrame
        One row per loaded frame with columns ``movie_title`` and ``frames``,
        merged (inner join on ``movie_title``) with the module-level ``df``.
    """
    movie_titles = []
    frames = []
    for movie in list_of_movies:
        # png_to_numpy returns bare images, not (title, frame) pairs, so the
        # title for every frame is the movie currently being processed.
        movie_frames = png_to_numpy(movie)
        movie_titles.extend([movie] * len(movie_frames))
        frames.extend(movie_frames)

    df_numpy = pd.DataFrame({'movie_title': movie_titles, 'frames': frames})
    # Relies on the global `df` holding the per-movie label table.
    return pd.merge(df_numpy, df, on='movie_title')
        

In [None]:
def get_deep_features(frame):
    """Run the global ``model`` on ``frame`` and return its prediction.

    ``frame`` is reshaped to ``(-1, 299, 299, 3)`` before prediction, so it
    must contain a whole multiple of 299*299*3 values; no resizing or pixel
    preprocessing is applied here.
    NOTE(review): cv2.imread returns BGR uint8 data and InceptionV3 is usually
    fed through its own `preprocess_input` -- confirm the raw pixels are intended.
    """
    batch = frame.reshape((-1, 299, 299, 3))
    return model.predict(batch)

In [None]:
def store_deep_features(movie_df):
    """Insert one MongoDB document per trailer frame into ``deep_features_db``.

    For every entry of ``movie_df['movie_title']`` the frames are loaded with
    ``png_to_numpy`` and pushed through ``get_deep_features``. Each stored
    document combines:

    * the columns of the first ``movie_df`` row whose title matches,
    * the first prediction row as string-keyed fields ``'0'``, ``'1'``, ...,
    * ``'frame_id'`` in the form ``<title>_<n>`` with ``n`` counting from 1.

    Returns nothing; the only effect is the ``insert_one`` calls.
    """
    for title in movie_df['movie_title']:
        matching_rows = movie_df[movie_df['movie_title'] == title]
        label_fields = matching_rows.to_dict('records')[0]

        for frame_number, frame in enumerate(png_to_numpy(title), start=1):
            feature_vector = get_deep_features(frame)[0].tolist()
            feature_fields = {
                str(position): value for position, value in enumerate(feature_vector)
            }

            # Later keys win on collision: features override labels, frame_id overrides both.
            document = {
                **label_fields,
                **feature_fields,
                'frame_id': title + "_" + str(frame_number),
            }
            deep_features_db.insert_one(document)

In [None]:
# Extract features for every movie in `df` and insert them into MongoDB
# (one model prediction and one insert per frame).
store_deep_features(df)

In [None]:
# Read back at most 100 stored documents as a quick sanity check.
new_df = pd.DataFrame(list(deep_features_db.find().limit(100)))

In [None]:
new_df.shape

In [7]:
# Genre label names; each is read as a key of the stored documents and becomes
# one column of `frame_genres`, in this order.
genres = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir','Game-Show','History','Horror','Music','Musical','Mystery','News','Reality-TV','Romance','Sci-Fi','Short','Sport','Thriller','War','Western']

In [8]:
# Document keys of the deep features: the strings '0' .. '2047', matching the
# str(i) keys written by store_deep_features.
feature_ids = [str(i) for i in range(2048)]

In [9]:
# One slot per stored frame for its 'frame_id' string. Uses the builtin
# `object` as dtype: the `np.object` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
frame_ids = np.empty(shape=(n_frames,), dtype=object)

In [10]:
# Feature matrix: one float32 row of 2048 values per stored frame.
# np.empty leaves the contents uninitialised until the loading loop fills them.
frame_features = np.empty(shape=(n_frames, 2048), dtype=np.float32)

In [11]:
# Label matrix: one uint8 column per entry of `genres`, one row per stored frame.
frame_genres = np.empty(shape=(n_frames, len(genres)), dtype=np.uint8)

In [12]:
# Copy every stored document into the preallocated arrays, in cursor order:
# features by their numeric key, then the frame id, then one value per genre.
for row_idx, document in enumerate(deep_features_db.find({})):
    feature_row = frame_features[row_idx]  # view: writes land in frame_features
    for key in feature_ids:
        feature_row[int(key)] = document[key]

    frame_ids[row_idx] = document['frame_id']

    genre_row = frame_genres[row_idx]  # view: writes land in frame_genres
    for column, genre in enumerate(genres):
        genre_row[column] = document[genre]

In [14]:
def split_by_movie(frame_ids, frame_features, frame_genres, train_pct=0.8, limit=None, seed=None):
    """Split frames into train/test sets so that no movie appears in both.

    The movie title of a frame is everything before the last underscore of its
    frame id (ids have the form ``<title>_<n>``). Titles are shuffled and the
    first ``int(n_titles * train_pct)`` go to the training set; every frame
    follows its movie.

    Parameters
    ----------
    frame_ids : sequence of str
        One id per frame.
    frame_features : numpy.ndarray
        2-D array with one row per frame, aligned with ``frame_ids``.
    frame_genres : numpy.ndarray
        2-D array with one row per frame, aligned with ``frame_ids``.
    train_pct : float, default 0.8
        Fraction of movies (not frames) assigned to the training set.
    limit : int or None, default None
        If given, only a random subset of at most ``limit`` frames is kept
        across both sets.
    seed : int or None, default None
        Seed for a private random generator, making the split reproducible.
        ``None`` keeps using NumPy's global random state.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Row subsets of ``frame_features`` and ``frame_genres``.
    """
    n = len(frame_ids)
    rng = np.random if seed is None else np.random.RandomState(seed)

    # rsplit keeps underscores that belong to the title itself ("a_b_12" -> "a_b")
    # and returns the whole id unchanged when it has no underscore.
    # `object` replaces the `np.object` alias removed in NumPy 1.24.
    frame_titles = np.array(
        [frame_id.rsplit('_', 1)[0] for frame_id in frame_ids], dtype=object
    )

    # np.unique yields the titles in sorted order, so the shuffle starts from a
    # deterministic sequence (set iteration order of strings varies per process).
    movie_titles = np.unique(frame_titles)
    rng.shuffle(movie_titles)

    train_limit = int(len(movie_titles) * train_pct)
    test_titles = movie_titles[train_limit:]

    test_mask = np.isin(frame_titles, test_titles)
    train_mask = ~test_mask

    if limit is not None:
        # Keep a random subset of frames, applied to both sides of the split.
        keep_mask = np.zeros(n, dtype=bool)
        keep_mask[rng.permutation(n)[:limit]] = True
        test_mask = test_mask & keep_mask
        train_mask = train_mask & keep_mask

    X_train = frame_features[train_mask, :]
    y_train = frame_genres[train_mask, :]
    X_test = frame_features[test_mask, :]
    y_test = frame_genres[test_mask, :]

    return X_train, X_test, y_train, y_test

In [15]:
# Movie-level split: 80% of the titles train the model, the rest are held out.
X_train, X_test, y_train, y_test = split_by_movie(
    frame_ids, frame_features, frame_genres, train_pct=0.8
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest using all available cores.
# NOTE(review): no random_state is passed, so the fitted forest can differ between runs.
rfc = RandomForestClassifier(n_jobs=-1)

In [None]:
# y_train has one column per genre, so this fits a multi-label classifier.
rfc.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, roc_curve, roc_auc_score

In [None]:
# Per-genre precision / recall / F1 on the held-out movies.
print(classification_report(y_test, rfc.predict(X_test)))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# ROC curve for the first genre column. The previous code called `lr`, which is
# never defined in this notebook (NameError on a fresh kernel), so the fitted
# random forest is used instead. For multi-label targets its predict_proba
# returns one (n_samples, n_classes) array per label: [0] selects the first
# genre and [:, 1] the probability of the positive class.
x, y, _ = roc_curve(y_test[:,0], rfc.predict_proba(X_test)[0][:,1])

In [None]:
# Held-out ROC AUC for the first genre column. `lr` is never defined in this
# notebook, so the fitted random forest is used; its multi-label predict_proba
# returns one array per label, hence the [0] before selecting the positive class.
roc_auc_score(y_test[:,0], rfc.predict_proba(X_test)[0][:,1])

In [None]:
# Training-set ROC AUC for the first genre column, for comparison with the
# held-out score. `lr` is never defined in this notebook, so the fitted random
# forest is used; its multi-label predict_proba returns one array per label,
# hence the [0] before selecting the positive class.
roc_auc_score(y_train[:,0], rfc.predict_proba(X_train)[0][:,1])

In [None]:
# Plot the ROC curve (x = false positive rate, y = true positive rate, as
# returned by roc_curve) against the chance diagonal.
fig, ax = plt.subplots()
ax.plot(x, y, label='classifier')

# Diagonal from (0, 0) to (1, 1): the curve of a random classifier.
line_x = np.linspace(0,1)
ax.plot(line_x, line_x, label='chance')

ax.set(xlabel='False positive rate', ylabel='True positive rate', title='ROC curve')
ax.legend();

In [16]:
# Movie title of every frame. Frame ids have the form '<title>_<n>', so split
# on the LAST underscore: splitting on the first one truncated any title that
# itself contains an underscore. Ids without an underscore are kept whole.
list_of_titles = [frame_id.rsplit('_', 1)[0] for frame_id in frame_ids]

In [17]:
# One row per frame holding its movie title.
# NOTE(review): this rebinds `df`, which earlier held the genre matrix read from
# CSV and is used as a global by create_movie_df -- re-running those cells after
# this one would see the wrong frame.
df = pd.DataFrame({'title': list_of_titles})

In [None]:
# Histogram of how many frames each title has.
df['title'].value_counts().hist(bins=20)

In [18]:
# Spot-check: title of the frame at index label 8453.
df['title'][8453]

'Harry Potter and the Prisoner of Azkaban'

In [19]:
# Load a single frame for visual inspection. cv2.imread returns None rather than
# raising when the file cannot be read, so a wrong path only surfaces later.
img = cv2.imread('trailer_test/Harry Potter and the Prisoner of Azkaban0083.png')

In [None]:
# Show the frame in a native OpenCV window and block until a key is pressed.
cv2.imshow('ImageWindow', img)
cv2.waitKey()
# Close the window afterwards; without this it stays open (and unresponsive)
# for as long as the kernel lives.
cv2.destroyAllWindows()