# Build Model on IMDB data
- The text-processing steps and the trained model are saved to disk so that they can be reused for model deployment.
- The model in this notebook only serves to demonstrate the end-to-end pipeline. More sophisticated models and hyperparameter optimization are not covered here.

## Load IMDB data

In [1]:
from src import imdbdata

In [2]:
# Load the raw IMDB review texts and their labels for the train and test sets.
train_data, train_label, test_data, test_label = imdbdata.get_imdb_raw()

In [3]:
# Inspect the container type of the training corpus.
type(train_data)

list

In [4]:
# Peek at the first two raw reviews.
train_data[:2]

["This movie is Jackie's best. I still cant get enough of watching some of his best stunts ever. I also like the bad guys in this movie (the old man looks like a Chinese version of John Howard). Unlike some of Jackie's other work, this movie has also got a great story line and i recommend it to all of Jackie's fans.",
 'a bit slow and boring, the tale of an old man and his wife living a delapidated building and interacting with a fixed cast of characters like the mailman, the brothers sitting on the porch, the wealthy cigar smoking man. The photography of the river is marvelous, as is the interior period decoration. If you like decoration of Banana Republic stores, this is a must.']

## Simple text processing

In [5]:
from src import textproc

In [6]:
# Build the text processor from the training corpus.
mytextproc = textproc.TextProc(train_data)

In [7]:
# Show the processor's current mode.
mytextproc.mode

'train'

In [8]:
# NOTE(review): presumably the number of most frequent words to keep — confirm in textproc.
top_num = 10000
# Returns the processed reviews (one list of words per review) and the selected words.
train_word_processed, selected_word = mytextproc.process(top_num=top_num)

In [9]:
# Peek at the first two processed reviews.
train_word_processed[:2]

[['movie',
  'jackie',
  'good',
  'not',
  'watch',
  'good',
  'stunt',
  'like',
  'bad',
  'guy',
  'movie',
  'old',
  'man',
  'look',
  'like',
  'chinese',
  'version',
  'john',
  'howard',
  'unlike',
  'jackie',
  'work',
  'movie',
  'get',
  'great',
  'story',
  'line',
  'recommend',
  'jackie',
  'fan'],
 ['bit',
  'slow',
  'boring',
  'tale',
  'old',
  'man',
  'wife',
  'live',
  'building',
  'interact',
  'fix',
  'cast',
  'character',
  'like',
  'brother',
  'sit',
  'wealthy',
  'cigar',
  'smoking',
  'man',
  'photography',
  'river',
  'marvelous',
  'interior',
  'period',
  'decoration',
  'like',
  'decoration',
  'banana',
  'republic',
  'store']]

### Apply same processing steps to test data

In [10]:
# Switch the processor to eval mode before handling text it was not built on.
mytextproc.evalmode()
print('mode {}'.format(mytextproc.mode))
# Only the test corpus is supplied here; no `top_num` is passed.
# NOTE(review): this rebinds `selected_word` from the training cell — confirm
# that eval mode returns the same word list.
test_word_processed, selected_word = mytextproc.process(text_corpus=test_data)

mode eval


## Save text processor

In [11]:
# Persist the text processor. The reload section later in this notebook
# reads this exact path, so the file must be written on a fresh run for the
# notebook to execute end to end.
import os

# Make sure the target directory exists before writing into it.
os.makedirs('./savedspace/torchfullnet', exist_ok=True)
mytextproc.save_wcount('./savedspace/torchfullnet/textproc.json')

## Encode text

In [12]:
from src import textencoder

# A single encoder, built from the selected words, is applied to both
# splits so the train and test encodings are produced the same way.
word_encoder = textencoder.OneHotEncoder(selected_word)
train_encoded, test_encoded = (
    word_encoder.encode(corpus)
    for corpus in (train_word_processed, test_word_processed)
)

### Split data for model fitting and validation

In [13]:
from src import helper
# Randomly split the encoded training data into a fitting part and a validation part.
# NOTE(review): no seed is passed here, so the split may differ between runs — confirm in helper.random_split.
fit_encoded, fit_labels, val_encoded, val_labels = helper.random_split(train_encoded, train_label)

In [14]:
# Shape of the encoded fitting split.
# NOTE(review): the recorded width is top_num + 1; presumably one extra slot for unknown words — confirm in textencoder.
print(fit_encoded.shape)

(17500, 10001)


### Train model

In [15]:
from src import torchnet
from torch.utils.data import DataLoader

# Wrap the fitting split in the project's in-memory dataset, then serve it
# in shuffled mini-batches of 12.
fit_torchdata = torchnet.InMemTorchData(fit_encoded, fit_labels)
fit_dataloader = DataLoader(
    fit_torchdata,
    batch_size=12,
    shuffle=True,
)

In [16]:
import logging

import torch
from torch import optim

# Surface the DEBUG messages that the training loop logs for each epoch.
logging.basicConfig(level=logging.DEBUG)

# Seed torch's global RNG so weight initialisation and the DataLoader's
# shuffling are repeatable across "Restart & Run All".
# NOTE(review): randomness outside torch (e.g. in helper.random_split) is not
# covered by this seed — confirm which RNG that helper uses.
torch.manual_seed(0)

# The input width is taken from the encoded data.
# NOTE(review): 12, 8 and 1 are presumably the hidden-layer widths and the
# output width — confirm against torchnet.FullNet.
fullnet = torchnet.FullNet('relu', fit_encoded.shape[1], 12, 8, 1)
optimizer = optim.RMSprop(fullnet.parameters(), lr=0.01)

### Train model with validation results for 10 epochs

In [17]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
# The displayed return value is a pair of per-epoch lists: validation ROC AUC
# and validation loss (the same values as in the DEBUG log).
torchnet.train_bclassif(fullnet, optimizer, epoch_num=10, fit_dataloader=fit_dataloader, 
                        val_data=val_encoded, val_label=val_labels)

DEBUG:src.torchnet:Epoch 0 starts
DEBUG:src.torchnet:Metric rocauc on validation: 0.9425978115384536
DEBUG:src.torchnet:loss on validation: 0.3076963722705841
DEBUG:src.torchnet:Epoch 1 starts
DEBUG:src.torchnet:Metric rocauc on validation: 0.9395301080761522
DEBUG:src.torchnet:loss on validation: 0.3207019567489624
DEBUG:src.torchnet:Epoch 2 starts
DEBUG:src.torchnet:Metric rocauc on validation: 0.9352360201594991
DEBUG:src.torchnet:loss on validation: 0.4505166709423065
DEBUG:src.torchnet:Epoch 3 starts
DEBUG:src.torchnet:Metric rocauc on validation: 0.9282337880615651
DEBUG:src.torchnet:loss on validation: 0.6330882906913757
DEBUG:src.torchnet:Epoch 4 starts
DEBUG:src.torchnet:Metric rocauc on validation: 0.9231131262676758
DEBUG:src.torchnet:loss on validation: 0.7683960199356079
DEBUG:src.torchnet:Epoch 5 starts
DEBUG:src.torchnet:Metric rocauc on validation: 0.9203694244043323
DEBUG:src.torchnet:loss on validation: 1.058332085609436
DEBUG:src.torchnet:Epoch 6 starts
DEBUG:src.tor

([0.9425978115384536,
  0.9395301080761522,
  0.9352360201594991,
  0.9282337880615651,
  0.9231131262676758,
  0.9203694244043323,
  0.9152026442968073,
  0.9150423502660675,
  0.9180620152999016,
  0.9161879832244545],
 [0.3076963722705841,
  0.3207019567489624,
  0.4505166709423065,
  0.6330882906913757,
  0.7683960199356079,
  1.058332085609436,
  1.2638427019119263,
  1.4613703489303589,
  1.466454267501831,
  1.5525388717651367])

### Retrain model with 1 epoch
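- The validation results over 10 epochs show that 1 epoch is sufficient: validation ROC AUC is highest and validation loss is lowest after the first epoch, and both get worse in later epochs, which indicates overfitting.
- More sophisticated hyperparameter optimization should be considered when the goal is a more accurate model.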

In [18]:
import torch
from torch import optim

# Seed torch's global RNG so the retrained model (weight initialisation and
# batch shuffling) is repeatable across runs.
torch.manual_seed(0)

# Start again from a freshly initialised network and a new optimizer, so that
# nothing learned during the 10-epoch run carries over.
fullnet = torchnet.FullNet('relu', fit_encoded.shape[1], 12, 8, 1)
optimizer = optim.RMSprop(fullnet.parameters(), lr=0.01)

In [19]:
# Train the fresh model for a single epoch; the displayed pair holds the
# validation ROC AUC and validation loss for that epoch.
torchnet.train_bclassif(fullnet, optimizer, epoch_num=1, fit_dataloader=fit_dataloader, 
                        val_data=val_encoded, val_label=val_labels)

DEBUG:src.torchnet:Epoch 0 starts
DEBUG:src.torchnet:Metric rocauc on validation: 0.9381358913624631
DEBUG:src.torchnet:loss on validation: 0.3743900954723358


([0.9381358913624631], [0.3743900954723358])

### Model evaluation on test data

In [20]:
# Score the retrained model on the encoded test set using ROC AUC.
# NOTE(review): the second element of the recorded result is None; presumably a loss that is not computed here — confirm in torchnet.eval_model.
torchnet.eval_model(fullnet, test_encoded, test_label, metric_type='rocauc')

(0.9303255424000001, None)

### Save trained models

In [21]:
# Persist the retrained network (topology and weights). The reload section
# later in this notebook reads these exact paths, so the files must be
# written on a fresh run for the notebook to execute end to end.
import os

# Make sure the target directory exists before writing into it.
os.makedirs('./savedspace/torchfullnet', exist_ok=True)
fullnet.save_modeltopology('./savedspace/torchfullnet/imdb_fullnet_topology.json')
fullnet.save_model_weights('./savedspace/torchfullnet/imdb_fullnet_weights.pt')

# Reload data processing steps and trained models
- This section demonstrates how to reload the saved text processor and model to make predictions.

## Load text processor

In [22]:
from src import textproc
# Rebuild the text processor from the JSON file saved earlier in this notebook.
loaded_textproc = textproc.TextProc.from_load_wcount_pair('./savedspace/torchfullnet/textproc.json')

## Load trained model

In [23]:
from src import torchnet
# Rebuild the network from its saved topology, then restore the saved weights.
# Note: this rebinds `fullnet`, replacing the model trained in memory above.
fullnet = torchnet.FullNet.from_modeltopology('./savedspace/torchfullnet/imdb_fullnet_topology.json')
fullnet.load_model_weights('./savedspace/torchfullnet/imdb_fullnet_weights.pt')
# Switch the model to evaluation mode before predicting.
fullnet = fullnet.eval()

## Make prediction

In [24]:
from src import textencoder
# Three sample reviews that differ only in the adjective used.
review_text = ['Movie is perfect, worth watching again.',
              'Movie is good, worth watching again.',
              'Movie is fabulous, worth watching again.']

# text processing and encoding
# NOTE(review): the training-section processor was switched with evalmode()
# before processing unseen text; confirm the loaded processor is already in
# eval mode, since evalmode() is not called here.
review_text_processed, selected_word = loaded_textproc.process(text_corpus=review_text)
# The encoder is rebuilt from the word list returned by the loaded processor.
word_encoder = textencoder.OneHotEncoder(selected_word)
review_text_encoded = word_encoder.encode(review_text_processed)

In [25]:
import torch
# Run inference with gradient tracking disabled.
with torch.no_grad():
    pred_vals = fullnet.prediction(review_text_encoded)
# One value per review.
# NOTE(review): the recorded values lie between 0 and 1, presumably positive-class probabilities — confirm in torchnet.FullNet.prediction.
print(pred_vals)

tensor([[0.7316],
        [0.6500],
        [0.6777]])
