In [1]:
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim

import PIL
from PIL import Image
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from skimage import io, transform
import os
import sys

import torchvision.transforms as transforms
import torchvision.models as models

import csv

from tqdm import tqdm as tqdm

### Model Define

In [2]:
class MovieGenreClassifier(nn.Module):
    def __init__(self, nlabel):
        super(MovieGenreClassifier, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(64 * 64 + 64 * 64 + 128 * 128 + 128 * 128 + 256 * 256, 64),
            nn.ReLU(),
            nn.Linear(64, nlabel),
        )

    def forward(self, input):
        return self.main(input)

def extractStyleFeature(image):
    features = []
    for network in networks:
        features.append(network(image))
    for i in range(len(features)):
        features[i] = gram(features[i]).view(1, -1)
    
    return torch.cat((features[0], features[1], features[2], features[3], features[4]), 1)
    
def extractLabel(image, id2genre, genresTable):
    genres = id2genre[image]
    genres = genres.split('|')
    labelVec = torch.zeros(1, 23)
    for genre in genres:
        if genre in genresTable:
            labelVec[0][genresTable[genre]] = 1
    
    return labelVec

### Define Style Feature Extraction Network Based on VGG

In [3]:
use_cuda = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

cnn = models.vgg19(pretrained=True).features

# move it to the GPU if possible:
if use_cuda:
    cnn = cnn.cuda()
    
style_layers = ['conv_1', 'conv_2', 'conv_3', 'conv_4', 'conv_5']

networks = []

for i in range(5):
    model = nn.Sequential()
    networks.append(model)

indexs = [1, 3, 6, 8, 11]

for n in range(5):
    count = 0
    for layer in list(cnn)[:indexs[n]]:
        networks[n].add_module(str(count), layer)
        count += 1


### Define Gram Matrix to Calculate Style Feature

In [4]:
loader = transforms.Compose([
    transforms.Scale((182, 268)),
    transforms.ToTensor()])  # transform it into a torch tensor

def image_loader(image_name):
    image = Image.open(image_name)    
    image = Variable(loader(image))
    # fake batch dimension required to fit network's input dimensions
    image = image.unsqueeze(0)
    return image

class GramMatrix(nn.Module):
    def forward(self, input):
        a, b, c, d = input.size()  # a=batch size(=1)
        # b=number of feature maps
        # (c,d)=dimensions of a f. map (N=c*d)

        features = input.view(a * b, c * d)  # resise F_XL into \hat F_XL

        G = torch.mm(features, features.t())  # compute the gram product

        # we 'normalize' the values of the gram matrix
        # by dividing by the number of element in each feature maps.
        return G.div(a * b * c * d)

gram = GramMatrix()
if use_cuda:
    gram = gram.cuda()


### Define Custom Dataset

In [5]:
class MyDataset(Dataset):
    def __init__(self, root, csvfile, transform=None):
        self.root = root
        self.transform = transform
        self.csvfile = open(csvfile, 'rb')
        
        reader = csv.reader(self.csvfile)

        id2genre = {}

        for row in reader:
            if row[0] != "":
                id2genre[row[0] + ".jpg"] = row[4]
        
        self.csvfile = open(csvfile, 'rb')
        
        reader = csv.reader(self.csvfile)
        
        genres = {}
        for row in reader:
            genre = row[4].split('|')
            for ele in genre:
                if ele != '':
                    genres[ele] = genres.get(ele, 0) + 1

        for ele in list(genres):
            if (genres[ele] < 100):
                del genres[ele]        

        genresTable = {}

        count = 0
        for ele in list(genres):
            genresTable[ele] = count
            count += 1
                
        self.dataset = []
        self.labels = []
        
        count = 0
        for img in tqdm(os.listdir(self.root)):
            image = io.imread(os.path.join(self.root, img))
            count += 1
            
            feature = extractStyleFeature(Variable(loader(Image.fromarray(image))).cuda().unsqueeze(0))
            
            self.dataset.append(feature.data)
            self.labels.append(extractLabel(img, id2genre, genresTable))

    def __len__(self):
        return len(os.listdir(self.root))

    def __getitem__(self, idx):
        feature = self.dataset[idx]
        label = self.labels[idx]
        
#         image = Image.fromarray(image)
        
#         if self.transform is not None:
#             image = self.transform(image)

#         image = Variable(loader(image))
        
#         image = image.cuda()
#         # fake batch dimension required to fit network's input dimensions
#         image = image.unsqueeze(0)        
        
#         feature = extractStyleFeature(image)
        
        return feature, label
                               
trainset = MyDataset(root='/home/ubuntu/notebooks/dataset/train',
                     csvfile='/home/ubuntu/notebooks/Movie-Genre-Classification-from-Movie-Poster/Dataset/NewMovieGenre.csv')
valset = MyDataset(root='/home/ubuntu/notebooks/dataset/validation/',
                   csvfile='/home/ubuntu/notebooks/Movie-Genre-Classification-from-Movie-Poster/Dataset/NewMovieGenre.csv')

 80%|███████▉  | 21523/26987 [15:08<03:50, 23.70it/s]

RuntimeError: cuda runtime error (2) : out of memory at /home/ubuntu/src/pytorch/torch/lib/THC/generic/THCStorage.cu:66

 80%|███████▉  | 21523/26987 [15:20<03:53, 23.37it/s]

### Train the Model

In [None]:
trainLoader = torch.utils.data.DataLoader(trainset, batch_size = 128, 
                                          shuffle = True, num_workers = 0)

valLoader = torch.utils.data.DataLoader(valset, batch_size = 128,
                                        shuffle = True, num_workers = 0)

def train_model(network, criterion, optimizer, trainLoader, valLoader, n_epochs = 10, use_gpu = False):
    if use_gpu:
        network = network.cuda()
        criterion = criterion.cuda()
        
    # Training loop.
    for epoch in range(0, n_epochs):
        correct = 0.0
        cum_loss = 0.0
        counter = 0

        # Make a pass over the training data.
        t = tqdm(trainLoader, desc = 'Training epoch %d' % epoch)
        network.train()  # This is important to call before training!
        for (i, (inputs, labels)) in enumerate(t):
            
            # Wrap inputs, and targets into torch.autograd.Variable types.
            inputs = Variable(inputs)
            labels = Variable(labels)
            
            if use_gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()

            # Forward pass:
            outputs = network(inputs)
            loss = criterion(outputs, labels)

            # Backward pass:
            optimizer.zero_grad()
            # Loss is a variable, and calling backward on a Variable will
            # compute all the gradients that lead to that Variable taking on its
            # current value.
            loss.backward() 

            # Weight and bias updates.
            optimizer.step()
            
            # logging information.
            cum_loss += loss.data[0]
            outlabels = (outputs.data.exp() / (outputs.data.exp() + 1)).round()
            
            correct += ((labels.data * outlabels).sum(2).sum(1) / (labels.data + outlabels).clamp(0, 1).sum(2).sum(1)).sum()
            counter += inputs.size(0)
            t.set_postfix(loss = cum_loss / (1 + i), accuracy = 100 * correct / counter)

        # Make a pass over the validation data.
        correct = 0.0
        cum_loss = 0.0
        counter = 0
        t = tqdm(valLoader, desc = 'Validation epoch %d' % epoch)
        network.eval()  # This is important to call before evaluating!
        for (i, (inputs, labels)) in enumerate(t):
            # Wrap inputs, and targets into torch.autograd.Variable types.
            inputs = Variable(inputs)
            labels = Variable(labels)
            
            if use_gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()

            # Forward pass:
            outputs = network(inputs)
            loss = criterion(outputs, labels)

            # logging information.
            cum_loss += loss.data[0]
            outlabels = (outputs.data.exp() / (outputs.data.exp() + 1)).round()
            
            correct += ((labels.data * outlabels).sum(2).sum(1) / (labels.data + outlabels).clamp(0, 1).sum(2).sum(1)).sum()
            counter += inputs.size(0)
            t.set_postfix(loss = cum_loss / (1 + i), accuracy = 100 * correct / counter)

classifier = MovieGenreClassifier(23)
optimizer = optim.Adam(classifier.parameters())
criterion = nn.MultiLabelSoftMarginLoss()
# Train the previously defined model.
train_model(classifier, criterion, optimizer, trainLoader, valLoader, n_epochs = 10, use_gpu = True)