In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pymongo

In [None]:
from keras.applications.inception_v3 import InceptionV3

In [None]:
# Create the MongoDB client. No arguments are passed, so pymongo's default
# connection settings apply.
mc = pymongo.MongoClient()

In [None]:
# Select the 'model' database on the client.
db = mc['model']

In [None]:
# Collection read by the loading loop further down: each document supplies a
# 'frame_id', the feature values keyed '0'..'2047', and one field per genre.
deep_features_db = db['labels_deep_features']

## Set the number of frames and the list of genres

In [None]:
# Number of documents in the collection (the empty filter matches every
# document); used below to size the preallocated arrays.
n_frames = deep_features_db.count_documents({})

In [None]:
# Genre labels, in the column order used for the label matrix.
genres = [
    'Action',
    'Adventure',
    'Animation',
    'Biography',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Family',
    'Fantasy',
    'Film-Noir',
    'Game-Show',
    'History',
    'Horror',
    'Music',
    'Musical',
    'Mystery',
    'News',
    'Reality-TV',
    'Romance',
    'Sci-Fi',
    'Short',
    'Sport',
    'Thriller',
    'War',
    'Western',
]

## The dataset is too large to load into a single DataFrame, so a set of empty NumPy arrays is preallocated and filled in later.

In [None]:
# Document keys of the deep features: the strings '0' through '2047'.
feature_ids = list(map(str, range(2048)))

In [None]:
# One slot per frame for its string ID. The builtin `object` is used as the
# dtype because the `np.object` alias was deprecated in NumPy 1.20 and removed
# in 1.24, where it raises AttributeError.
frame_ids = np.empty(shape=(n_frames,), dtype=object)

In [None]:
# One float32 row per frame, one column per feature ID. The width is derived
# from feature_ids (rather than repeating the literal 2048) so it cannot drift
# from the key list, mirroring how frame_genres is sized from len(genres).
frame_features = np.empty(shape=(n_frames, len(feature_ids)), dtype=np.float32)

In [None]:
# Label matrix: one uint8 row per frame, one column per entry of `genres`.
frame_genres = np.empty((n_frames, len(genres)), dtype=np.uint8)

## Fill the arrays above one MongoDB document at a time. This is done manually so that the whole collection never has to be held in memory at once.

In [None]:
# Stream the collection and copy each document into row i of the preallocated
# arrays. The arrays were created with np.empty, so any row that is not written
# here holds arbitrary values; the document count is therefore checked against
# n_frames instead of letting uninitialised rows reach training.
n_loaded = 0
for i, row in enumerate(deep_features_db.find({})):
    if i >= n_frames:
        raise RuntimeError(
            'Collection returned more than the %d documents counted earlier; '
            're-run the count and allocation cells.' % n_frames
        )
    # feature_ids is '0'..'2047' in order, so position k in this list is
    # column k; one row assignment replaces 2048 scalar writes with int() parsing.
    frame_features[i, :] = [row[feature_id] for feature_id in feature_ids]
    frame_ids[i] = row['frame_id']
    for j, genre in enumerate(genres):
        frame_genres[i, j] = row[genre]
    n_loaded = i + 1

if n_loaded != n_frames:
    raise RuntimeError(
        'Loaded %d documents but arrays were sized for %d; '
        're-run the count and allocation cells.' % (n_loaded, n_frames)
    )

## Each frame ID carries its movie title, so the train/test split must be done by movie title; otherwise frames from the same movie would end up in both the train and test sets.

In [None]:
def split_by_movie(frame_ids, frame_features, frame_genres, train_pct=0.8, limit=None, seed=None):
    """Split frames into train and test sets so that no movie is in both.

    The movie title of a frame is the part of its ID before the first '_'.
    The distinct titles are shuffled; the first ``int(n_titles * train_pct)``
    of them form the training movies and the rest the test movies. Every
    frame follows its movie.

    Parameters
    ----------
    frame_ids : sequence of str
        One ID per frame, formatted as ``'<title>_<rest>'``.
    frame_features : numpy.ndarray
        Feature matrix with one row per frame, aligned with ``frame_ids``.
    frame_genres : numpy.ndarray
        Label matrix with one row per frame, aligned with ``frame_ids``.
    train_pct : float, default 0.8
        Fraction of movies (not frames) assigned to the training set.
    limit : int or None, default None
        If given, only a random subset of at most ``limit`` frames is kept
        (spread over both sets).
    seed : int or None, default None
        If given, a private ``RandomState(seed)`` drives all shuffling so the
        split is reproducible. If None, NumPy's global RNG is used, as before.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Row subsets of ``frame_features`` and ``frame_genres``.

    Raises
    ------
    ValueError
        If ``train_pct`` is outside [0, 1] or ``limit`` is negative.
    """
    if not 0 <= train_pct <= 1:
        raise ValueError('train_pct must be between 0 and 1, got %r' % (train_pct,))
    if limit is not None and limit < 0:
        raise ValueError('limit must be non-negative, got %r' % (limit,))

    # The np.random module and a RandomState instance both expose shuffle().
    rng = np.random if seed is None else np.random.RandomState(seed)

    n = len(frame_ids)
    # Builtin `object`: the np.object alias no longer exists in NumPy >= 1.24.
    frame_titles = np.array([frame_id.partition('_')[0] for frame_id in frame_ids], dtype=object)

    # Sort the distinct titles before shuffling: set iteration order for
    # strings changes between interpreter runs, which would make the split
    # differ even with a fixed seed.
    movie_titles = np.array(sorted(set(frame_titles)))
    rng.shuffle(movie_titles)

    train_limit = int(len(movie_titles) * train_pct)
    test_titles = movie_titles[train_limit:]

    test_mask = np.isin(frame_titles, test_titles)
    train_mask = ~test_mask

    if limit is not None:
        # Keep a random subset of frames, then intersect with both masks.
        idxs = np.arange(n)
        rng.shuffle(idxs)
        keep_mask = np.zeros(n, dtype=bool)
        keep_mask[idxs[:limit]] = True
        test_mask = test_mask & keep_mask
        train_mask = train_mask & keep_mask

    X_train = frame_features[train_mask, :]
    y_train = frame_genres[train_mask, :]
    X_test = frame_features[test_mask, :]
    y_test = frame_genres[test_mask, :]

    return X_train, X_test, y_train, y_test

In [None]:
# Seed NumPy's global RNG, which split_by_movie uses for its shuffling when no
# other source of randomness is supplied, so re-running this cell draws the
# same shuffle.
np.random.seed(42)

X_train, X_test, y_train, y_test = split_by_movie(
    frame_ids,
    frame_features,
    frame_genres,
    train_pct=0.8,
)

## Import the random forest classifier, train it on the dataset, then export the model as a pickle for use in deployment.py

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# n_jobs=-1 parallelises across all available cores. random_state is fixed so
# that fitting on the same training data builds the same forest on every run.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
# Fit the forest on the training split; y_train has one column per genre.
rfc.fit(X_train, y_train)

In [None]:
import pickle

# Serialise the fitted classifier to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises; the original passed an
# anonymous open() handle that was never closed.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rfc, model_file)