In [1]:
import numpy as np

import lab_utils

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim

import PIL
from PIL import Image
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from skimage import io, transform
import os
import sys

import torchvision.transforms as transforms
import torchvision.models as models

import csv

from tqdm import tqdm as tqdm

### Define Style Feature Extraction Network Based on VGG

In [2]:
# Detect the GPU once; the flag is consulted again further down this cell.
use_cuda = torch.cuda.is_available()
if use_cuda:
    dtype = torch.cuda.FloatTensor
else:
    dtype = torch.FloatTensor

# Feature (convolutional) part of a pretrained VGG-19, moved to the GPU
# when one is available.
cnn = models.vgg19(pretrained=True).features
if use_cuda:
    cnn = cnn.cuda()

style_layers = ['conv_1', 'conv_2', 'conv_3', 'conv_4', 'conv_5']

# Each style network is a prefix of the VGG layer list: networks[k] holds the
# first indexs[k] layers, registered under the names '0', '1', ... exactly as
# nn.Sequential numbers positional modules.  The layer objects (and therefore
# their weights) are shared between all prefixes.
indexs = [1, 3, 6, 8, 11]
vgg_layers = list(cnn)
networks = [nn.Sequential(*vgg_layers[:cut]) for cut in indexs]


### Model Definition

In [3]:
class MovieGenreClassifier(nn.Module):
    """Two-branch multi-label genre classifier.

    One branch compresses the concatenated Gram-matrix style features, the
    other compresses the object-detection vector; both produce 80 values that
    are concatenated and mapped to one logit per genre.

    Args:
        nlabel: number of output logits (genres).  Previously this argument
            was accepted but ignored and the output width was always 23.
    """

    def __init__(self, nlabel):
        super(MovieGenreClassifier, self).__init__()
        # Input width is the sum of the flattened Gram matrices: two of
        # 64x64, two of 128x128 and one of 256x256.
        self.stylefeature = nn.Sequential(
            nn.Linear(64 * 64 + 64 * 64 + 128 * 128 + 128 * 128 + 256 * 256, 1024),
            nn.ReLU(),
            nn.Linear(1024, 80),
        )
        # 79 is the length of the object-detection vector this model expects.
        self.objectfeature = nn.Sequential(
            nn.Linear(79, 1024),
            nn.ReLU(),
            nn.Linear(1024, 80))
        self.allfeature = nn.Sequential(
            nn.Linear(160, 64),
            nn.ReLU(),
            nn.Linear(64, nlabel))

    def forward(self, style, obj):
        """Return genre logits for a batch.

        Both inputs must be 3-D (the features are concatenated along
        dimension 2), e.g. ``(batch, 1, features)``.
        """
        x1 = self.stylefeature(style)
        x2 = self.objectfeature(obj)
        x = torch.cat((x1, x2), 2)

        return self.allfeature(x)

def extractObjectFeature(image, id2objects, objectTable):
    """Build the object-detection feature vector for one image.

    Args:
        image: key into ``id2objects`` (the image file name).
        id2objects: dict mapping a file name to a list of strings of the
            form ``"<label>:<confidence>%"``.
        objectTable: dict mapping an object label to its vector index.

    Returns:
        A list of ``len(objectTable)`` numbers; each slot holds the summed
        confidence (as a fraction of 1) of the detections with that label.
        Images missing from ``id2objects`` yield an all-zero vector.

    Raises:
        KeyError: if a non-blank detection names a label that is not in
            ``objectTable``.
    """
    objdetect = [0] * len(objectTable)

    for obj in id2objects.get(image, []):
        # Blank cells carry no detection.  The code that builds objectTable
        # skips them too, so looking them up would raise KeyError.
        if obj == '':
            continue

        parts = obj.split(':')
        objdetect[objectTable[parts[0]]] += float(parts[1].strip('%')) / 100

    return objdetect
    
def extractStyleFeature(image, networks):
    """Concatenate the flattened Gram matrices produced by every network.

    Args:
        image: input passed unchanged to each network.
        networks: sequence of callables; each output is fed to the
            module-level ``gram`` and flattened to a single row.

    Returns:
        A tensor with one row holding the Gram features of all networks, in
        the order the networks are given.  The previous version indexed
        exactly five entries, so it failed for fewer networks and silently
        dropped any beyond the fifth.
    """
    grams = [gram(network(image)).view(1, -1) for network in networks]

    return torch.cat(grams, 1)
    
def extractLabel(image, id2genre, genresTable, nlabel=23):
    """Encode the genres of one image as a multi-hot row vector.

    Args:
        image: key into ``id2genre`` (the image file name).
        id2genre: dict mapping a file name to a ``'|'``-separated genre string.
        genresTable: dict mapping a genre name to its column index.
        nlabel: width of the returned vector.  Defaults to 23, the value that
            used to be hard-coded here.

    Returns:
        A ``1 x nlabel`` float tensor with a 1 in the column of every genre
        found in ``genresTable``; genres missing from the table are ignored.

    Raises:
        KeyError: if ``image`` is not present in ``id2genre``.
    """
    labelVec = torch.zeros(1, nlabel)
    for genre in id2genre[image].split('|'):
        if genre in genresTable:
            labelVec[0][genresTable[genre]] = 1

    return labelVec

### Define Gram Matrix to Calculate Style Feature

In [4]:
# Preprocessing pipeline shared by image_loader, MyDataset and Evaluation:
# rescale the PIL image to a fixed size, then convert it to a tensor.
# NOTE(review): how the (182, 268) tuple is interpreted (width/height order)
# depends on the installed torchvision version, and newer releases rename
# Scale to Resize -- confirm the intended orientation before upgrading.
loader = transforms.Compose([
    transforms.Scale((182, 268)),
    transforms.ToTensor()])  # transform it into a torch tensor

def image_loader(image_name):
    """Load an image file and return it as a batched Variable.

    Args:
        image_name: path (or file object) accepted by ``Image.open``.

    Returns:
        The image after the module-level ``loader`` pipeline, wrapped in a
        Variable with a leading batch dimension of size 1.
    """
    # Force three channels: grayscale, palette or RGBA files would otherwise
    # produce a channel count the VGG-based networks cannot consume.
    image = Image.open(image_name).convert('RGB')
    image = Variable(loader(image))
    # fake batch dimension required to fit network's input dimensions
    image = image.unsqueeze(0)
    return image

class GramMatrix(nn.Module):
    """Module computing the normalised Gram matrix of a 4-D feature tensor."""

    def forward(self, input):
        # input is (batch, feature maps, height, width)
        batch, channels, height, width = input.size()
        rows = batch * channels
        cols = height * width

        # Flatten every feature map into one row, then take all pairwise
        # inner products between rows.
        flat = input.view(rows, cols)
        gram_product = flat.mm(flat.t())

        # Normalise by the total number of elements in the input.
        return gram_product / (rows * cols)

# Shared Gram-matrix module; placed on the GPU whenever CUDA is in use.
gram = GramMatrix().cuda() if use_cuda else GramMatrix()


### Define Custom Dataset

In [5]:
class MyDataset(Dataset):
    """Poster dataset yielding (style features, object features, label).

    Every file in ``root`` is read into memory at construction time.  The CSV
    is parsed with column 0 as the image id, column 4 as a ``'|'``-separated
    genre string and columns 6 onwards as ``"<label>:<confidence>%"`` object
    detections.

    Args:
        root: directory containing the image files.
        csvfile: path of the metadata CSV.
        networks: feature networks handed to ``extractStyleFeature``.
        transform: optional callable applied to the PIL image before the
            module-level ``loader`` pipeline.

    Attributes:
        csvfile: the CSV path.  It used to hold an open file object that was
            never closed.
        id2genre, id2objects, genresTable, objectTable: lookup tables built
            from the CSV, kept so callers need not rebuild them.
    """

    def __init__(self, root, csvfile, networks, transform=None):
        self.root = root
        self.transform = transform
        self.csvfile = csvfile
        self.networks = networks

        # Read the CSV once and close it straight away; both passes below
        # work on the cached rows.  'rb' is the mode Python 2's csv module
        # requires.
        with open(csvfile, 'rb') as handle:
            rows = list(csv.reader(handle))

        id2genre = {}
        id2objects = {}
        for row in rows:
            if row[0] != "":
                id2genre[row[0] + ".jpg"] = row[4]
                id2objects[row[0] + ".jpg"] = row[6:]

        # Count genre and object occurrences.  The insert/delete sequence on
        # these dicts is kept exactly as before because the index tables
        # below follow dict iteration order.
        genres = {}
        objects = {}
        for row in rows:
            for ele in row[4].split('|'):
                if ele != '':
                    genres[ele] = genres.get(ele, 0) + 1
            for obj in row[6:]:
                if obj != '':
                    name = obj.split(':')[0]
                    objects[name] = objects.get(name, 0) + 1

        # Genres seen fewer than 100 times are not predicted.
        for ele in list(genres):
            if genres[ele] < 100:
                del genres[ele]

        objectTable = {}
        for index, ele in enumerate(list(objects)):
            objectTable[ele] = index

        genresTable = {}
        for index, ele in enumerate(list(genres)):
            genresTable[ele] = index

        self.id2genre = id2genre
        self.id2objects = id2objects
        self.genresTable = genresTable
        self.objectTable = objectTable

        self.dataset = []
        self.objs = []
        self.labels = []

        for img in tqdm(os.listdir(self.root)):
            image = io.imread(os.path.join(self.root, img))

            obj = extractObjectFeature(img, id2objects, objectTable)

            self.dataset.append(image)
            self.objs.append(obj)
            self.labels.append(extractLabel(img, id2genre, genresTable))

    def _networks_on_gpu(self):
        """Return True when the feature networks hold CUDA parameters."""
        for network in self.networks:
            for param in network.parameters():
                return param.is_cuda
        return False

    def __len__(self):
        # Count the samples actually loaded instead of re-listing the
        # directory, which is slow and can disagree with self.dataset.
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(style_features, object_vector, label)`` for sample ``idx``."""
        # convert('RGB') guards against grayscale/RGBA files, whose channel
        # count the VGG-based networks cannot consume.
        image = Image.fromarray(self.dataset[idx]).convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        image = Variable(loader(image))

        # Follow the networks' device rather than assuming a GPU exists.
        if self._networks_on_gpu():
            image = image.cuda()
        # fake batch dimension required to fit network's input dimensions
        image = image.unsqueeze(0)

        feature = extractStyleFeature(image, self.networks)

        obj = torch.FloatTensor(self.objs[idx]).view(1, -1)

        return feature.data, obj, self.labels[idx]
                               
# The same metadata CSV describes every split; only the image directory differs.
_genre_csv = '/home/ubuntu/notebooks/Movie-Genre-Classification-from-Movie-Poster/Dataset/NewMovieGenre.csv'

trainset = MyDataset(root='/home/ubuntu/notebooks/dataset/train',
                     csvfile=_genre_csv,
                     networks=networks)
valset = MyDataset(root='/home/ubuntu/notebooks/dataset/validation/',
                   csvfile=_genre_csv,
                   networks=networks)

100%|██████████| 26985/26985 [00:31<00:00, 856.98it/s]
100%|██████████| 1891/1891 [00:02<00:00, 832.51it/s]


### Train the Model

In [31]:
# Training and validation batches are drawn with identical settings.
loader_options = dict(batch_size = 64, shuffle = True, num_workers = 0)

trainLoader = DataLoader(trainset, **loader_options)
valLoader = DataLoader(valset, **loader_options)

def _batch_jaccard_sum(logits, targets):
    """Sum over a batch of the per-sample Jaccard score between the
    thresholded predictions and the multi-hot targets."""
    # torch.sigmoid replaces exp(x) / (exp(x) + 1), which overflows to
    # inf / inf = NaN for large logits.
    predicted = torch.sigmoid(logits).round()
    intersection = (targets * predicted).sum(2).sum(1)
    union = (targets + predicted).clamp(0, 1).sum(2).sum(1)
    # A sample with no true and no predicted label has an empty union; score
    # it 0 instead of letting 0 / 0 = NaN poison the running total.
    return float((intersection / union.clamp(min=1)).sum())


def _run_epoch(network, criterion, loader, desc, use_gpu, optimizer=None):
    """Make one pass over ``loader``; update weights only when ``optimizer`` is given."""
    cum_loss = 0.0
    correct = 0.0
    counter = 0

    progress = tqdm(loader, desc = desc)
    for (i, (input1, input2, labels)) in enumerate(progress):
        # Wrap inputs, and targets into torch.autograd.Variable types.
        input1 = Variable(input1)
        input2 = Variable(input2)
        labels = Variable(labels)

        if use_gpu:
            input1 = input1.cuda()
            input2 = input2.cuda()
            labels = labels.cuda()

        # Forward pass:
        outputs = network(input1, input2)
        loss = criterion(outputs, labels)

        if optimizer is not None:
            # Backward pass and weight/bias update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # logging information.
        # loss.item() exists on newer torch, where loss.data[0] is an error;
        # older releases only support the indexing form.
        cum_loss += loss.item() if hasattr(loss, 'item') else loss.data[0]
        correct += _batch_jaccard_sum(outputs.data, labels.data)
        counter += input1.size(0)
        progress.set_postfix(loss = cum_loss / (1 + i), accuracy = 100 * correct / counter)


def train_model(network, criterion, optimizer, trainLoader, valLoader, n_epochs = 10, use_gpu = False):
    """Train ``network`` and report loss/accuracy on both loaders every epoch.

    Args:
        network: model called as ``network(input1, input2)``.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepping ``network``'s parameters.
        trainLoader: iterable of ``(input1, input2, labels)`` training batches.
        valLoader: iterable of validation batches in the same format.
        n_epochs: number of train + validation passes.
        use_gpu: move the model, criterion and every batch to the GPU.

    The progress bars show the mean batch loss and the mean per-sample
    Jaccard score (as a percentage).  Nothing is returned; ``network`` is
    updated in place.
    """
    if use_gpu:
        network = network.cuda()
        criterion = criterion.cuda()

    # Training loop.
    for epoch in range(0, n_epochs):
        # Make a pass over the training data.
        network.train()  # This is important to call before training!
        _run_epoch(network, criterion, trainLoader,
                   'Training epoch %d' % epoch, use_gpu, optimizer)

        # Make a pass over the validation data.
        network.eval()  # This is important to call before evaluating!
        _run_epoch(network, criterion, valLoader,
                   'Validation epoch %d' % epoch, use_gpu)

train = True

# Whether or not we train, the starting point is the saved ...05 checkpoint.
classifier = torch.load("/home/ubuntu/notebooks/Movie-Genre-Classification-from-Movie-Poster/neural_style_and_object_detection05.model")

if train:
    criterion = nn.MultiLabelSoftMarginLoss()
    optimizer = optim.Adam(classifier.parameters(), lr = 0.0001)
    # Continue training the loaded model, then store the result as ...06.
    train_model(classifier, criterion, optimizer, trainLoader, valLoader, n_epochs = 5, use_gpu = True)
    torch.save(classifier, "/home/ubuntu/notebooks/Movie-Genre-Classification-from-Movie-Poster/neural_style_and_object_detection06.model")

Training epoch 0: 100%|██████████| 422/422 [19:06<00:00,  2.72s/it, accuracy=33.6, loss=0.207]
Validation epoch 0: 100%|██████████| 30/30 [01:18<00:00,  2.60s/it, accuracy=30.5, loss=0.229]
Training epoch 1: 100%|██████████| 422/422 [19:08<00:00,  2.72s/it, accuracy=33.9, loss=0.207]
Validation epoch 1: 100%|██████████| 30/30 [01:18<00:00,  2.60s/it, accuracy=29.7, loss=0.229]
Training epoch 2: 100%|██████████| 422/422 [19:06<00:00,  2.72s/it, accuracy=34.1, loss=0.206]
Validation epoch 2: 100%|██████████| 30/30 [01:18<00:00,  2.60s/it, accuracy=30.6, loss=0.229]
Training epoch 3: 100%|██████████| 422/422 [19:07<00:00,  2.72s/it, accuracy=34.3, loss=0.206]
Validation epoch 3: 100%|██████████| 30/30 [01:17<00:00,  2.60s/it, accuracy=30.4, loss=0.228]
Training epoch 4: 100%|██████████| 422/422 [19:07<00:00,  2.72s/it, accuracy=34.4, loss=0.205]
Validation epoch 4: 100%|██████████| 30/30 [01:18<00:00,  2.61s/it, accuracy=30.7, loss=0.229]


### Evaluation on Test Dataset

In [7]:
def Evaluation(root, img, classifier, networks, id2genre, genresTable, id2objects, objectTable):
    """Classify one image and score the prediction against its CSV genres.

    Args:
        root: directory containing the image.
        img: image file name; also the key into ``id2genre`` / ``id2objects``.
        classifier: model called as ``classifier(style_features, object_features)``.
        networks: feature networks passed to ``extractStyleFeature``.
        id2genre, genresTable: lookup tables passed to ``extractLabel``.
        id2objects, objectTable: lookup tables passed to ``extractObjectFeature``.

    Returns:
        ``(correct, outlabels, labels)``: the intersection-over-union score
        between predicted and true label vectors, the predicted multi-hot
        vector, and the true multi-hot vector.

    NOTE(review): every tensor is moved with ``.cuda()`` unconditionally, so
    this presumably cannot run on a CPU-only machine -- confirm.
    """
    classifier.eval()

    testImg = io.imread(os.path.join(root, img))
    
    image = Image.fromarray(testImg)

    image = Variable(loader(image))

    image = image.cuda()
    # fake batch dimension required to fit network's input dimensions
    image = image.unsqueeze(0)        

    feature = extractStyleFeature(image, networks)
    
    obj = extractObjectFeature(img, id2objects = id2objects, objectTable = objectTable)
    obj = Variable(torch.FloatTensor(obj).view(1, -1)).cuda()
    
    # The classifier concatenates its branches along dimension 2, so both
    # inputs are reshaped to three dimensions.
    feature = feature.view(1, 1, -1)
    obj = obj.view(1, 1, -1)
    
    
    outputs = classifier(feature, obj)
    outputs = outputs.view(1, -1)
    max_scores, max_labels = outputs.data.max(1)
    # exp(x) / (exp(x) + 1) is the logistic sigmoid; rounding it marks every
    # label whose probability exceeds 0.5 (i.e. whose logit is positive).
    if (max_scores > 0).cpu().numpy():
        outlabels = (outputs.data.exp() / (outputs.data.exp() + 1)).round()
    else:
        # No logit is positive: additionally force the best-scoring label on
        # so that at least one genre is predicted.
        outlabels = (outputs.data.exp() / (outputs.data.exp() + 1)).round()
        outlabels[0][max_labels] = 1
        
    # Cap the prediction at three genres: when more are set, keep only the
    # three highest-scoring ones.
    if (outlabels.sum(1).sum() > 3):
        outlabels[0] = torch.zeros(outlabels[0].size()[0])
        outlabels[0][outputs.topk(3)[1][0].data] = 1
    
    labels = extractLabel(img, id2genre, genresTable).cuda()
    
    # Number of labels in both vectors divided by the number in either one.
    correct = ((labels * outlabels).sum(1) / (labels + outlabels).clamp(0, 1).sum(1)).sum()    
    
    return correct, outlabels, labels

csvf = "/home/ubuntu/notebooks/Movie-Genre-Classification-from-Movie-Poster/Dataset/NewMovieGenre.csv"

# Read the CSV once and close it immediately (the file used to be opened
# twice and never closed).  'rb' is the mode Python 2's csv module requires;
# under Python 3 the file would have to be opened in text mode instead.
with open(csvf, 'rb') as csvfile:
    reader = csv.reader(csvfile)
    rows = list(reader)

# Column 0 is the image id, column 4 the '|'-separated genres and columns 6
# onwards the "<label>:<confidence>%" object detections.
id2genre = {}
id2objects = {}
for row in rows:
    if row[0] != "":
        id2genre[row[0] + ".jpg"] = row[4]
        id2objects[row[0] + ".jpg"] = row[6:]

# Count genre and object occurrences.  The insert/delete sequence on these
# dicts is kept exactly as before because the index tables below follow dict
# iteration order and must match the ones the model was trained with.
genres = {}
objects = {}
for row in rows:
    genre = row[4].split('|')
    for ele in genre:
        if ele != '':
            genres[ele] = genres.get(ele, 0) + 1
    objs = row[6:]
    for obj in objs:
        if obj != '':
            objects[obj.split(':')[0]] = objects.get(obj.split(':')[0], 0) + 1

# Genres seen fewer than 100 times are not predicted.
for ele in list(genres):
    if (genres[ele] < 100):
        del genres[ele]

genresTable = {}
objectTable = {}

for count, ele in enumerate(list(objects)):
    objectTable[ele] = count

for count, ele in enumerate(list(genres)):
    genresTable[ele] = count

# List the test directory once so the loop and the average use the same files.
test_root = "/home/ubuntu/notebooks/dataset/test"
test_images = os.listdir(test_root)

correct = 0
# NOTE: torch.load unpickles the whole model object; only load checkpoints
# from a trusted source.
classifier = torch.load("/home/ubuntu/notebooks/Movie-Genre-Classification-from-Movie-Poster/neural_style_and_object_detection06.model")
for img in tqdm(test_images):
    correct += Evaluation(test_root,
                          img,
                          classifier,
                          networks,
                          id2genre,
                          genresTable,
                          id2objects,
                          objectTable)[0]

# Parenthesised single-argument print is valid in both Python 2 and Python 3.
print("Accuracy on Test Dataset : " + str(100 * correct / len(test_images)) + "%")

100%|██████████| 9654/9654 [07:30<00:00, 21.41it/s]

Accuracy on Test Dataset : 34.5561428444%



