# Summary:
- Based on the transfer learning code provided by mxnet.io
- No SMOTE oversampling is applied
- The networks below were tested, with the following results:


    - Res 50
    
                         precision    recall  f1-score   support
        0.0               0.80      0.73      0.76      2610
        1.0               0.70      0.77      0.74      2168

        avg / total       0.75      0.75      0.75      4778

            Confusion Matrix
            1905  705
            488 1680

 
    - Res 152
                         precision    recall  f1-score   support

        0.0               0.77      0.76      0.76      2595
        1.0               0.72      0.73      0.72      2183

        avg / total       0.75      0.74      0.75      4778

            Confusion Matrix
            1966  629
            590 1593     

In [1]:
import glob
import multiprocessing
import time
import csv
import pickle
import os
import pandas as pd
import shutil
import numpy as np
import warnings
import tqdm
import logging
import mxnet as mx
from bs4 import BeautifulSoup
from urllib.request import urlopen
from os.path import basename
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

%pylab inline
warnings.filterwarnings("ignore")

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


Populating the interactive namespace from numpy and matplotlib


In [2]:
import os, sys

if sys.version_info[0] >= 3:
    from urllib.request import urlretrieve
else:
    from urllib import urlretrieve

def download(url):
    """
    Download `url` into the current directory unless the file already exists.

    url: address of the file; the text after the last "/" is used as the
         local file name

    The data is first written to "<filename>.part" and only renamed to its
    final name once the transfer has finished, so an interrupted download
    never leaves a truncated file that later calls would mistake for a
    complete one.
    """
    filename = url.split("/")[-1]
    if os.path.exists(filename):
        return

    partial = filename + ".part"
    try:
        urlretrieve(url, partial)
        os.replace(partial, filename)
    finally:
        # Only present here when the transfer or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)

In [3]:
# 2d to 4d conversion
def to4d(img, channels=3, height=32, width=32):
    """
    Reshape a 2-D array of flattened samples into a 4-D float32 array.

    img: array whose first axis indexes samples; each row must hold
         channels * height * width values
    channels, height, width: target shape of one sample (default 3 x 32 x 32,
         i.e. 3072 values per row)

    Returns an array of shape (img.shape[0], channels, height, width).
    """
    return img.reshape(img.shape[0], channels, height, width).astype(np.float32)

In [4]:
def modelMetric(mod, info, nd=True, data_iter=None):
    """
    Log the accuracy of `mod` and print its classification report and
    confusion matrix.

    mod: trained model exposing predict(data_iter)
    info: message logged once prediction has finished
    nd: truthy when predict() returns an object that must be converted with
        .asnumpy(); falsy when it already returns a NumPy-compatible array
    data_iter: iterator to evaluate on; defaults to the module-level
        `val_iters`
    """
    if data_iter is None:
        data_iter = val_iters

    prob = mod.predict(data_iter)
    if nd:
        prob = prob.asnumpy()
    logging.info(info)

    # Rewind the iterator so the true labels can be collected from the start.
    data_iter.reset()

    y_batch = []
    for dbatch in data_iter:
        label = dbatch.label[0].asnumpy()
        # Drop the padding entries reported by the iterator for this batch.
        real_size = label.shape[0] - data_iter.getpad()
        y_batch.append(label[:real_size])
    y = np.concatenate(y_batch)

    # Predicted class = index of the highest score in each row.
    py = np.argmax(prob, axis=1)
    acc1 = np.mean(py == y)
    logging.info('Test accuracy = %f', acc1)
    cfn_matrix = confusion_matrix(y, py)
    print(classification_report(y, py))
    print(cfn_matrix)

# 1. Data Preparation

In [5]:
# Load the pickled feature vectors from the file 'RCT_Vectors'.
# NOTE(review): pickle.load can execute arbitrary code while loading;
# only use files from a trusted source.
p_file = 'RCT_Vectors'
with open(p_file, 'rb') as fin:
    vectors = pickle.load(fin)

In [6]:
# Load the pickled labels from the file 'RCT_labels' (reuses `p_file`).
# NOTE(review): pickle.load can execute arbitrary code while loading;
# only use files from a trusted source.
p_file = 'RCT_labels'
with open(p_file, 'rb') as fin:
    data_Y = pickle.load(fin)

In [7]:
# Hold out 25% of the samples for testing. The seed is fixed so that every
# run (and every network compared in this notebook) is evaluated on the same
# test set; without it each re-run produces a different split.
d2v_X_train, d2v_X_test, d2v_Y_train, d2v_Y_test = train_test_split(
    vectors, data_Y, test_size=0.25, random_state=42)

In [8]:
# Convert each part of the split into a NumPy array.
X_train_array, Y_train_array, X_test_array, Y_test_array = (
    np.array(part)
    for part in (d2v_X_train, d2v_Y_train, d2v_X_test, d2v_Y_test)
)

In [9]:
# Sanity check: shapes of the train/test arrays and the total number of
# samples (no oversampling is applied in this notebook).
X_train_array.shape, Y_train_array.shape, X_test_array.shape, Y_test_array.shape, len(vectors)

((14333, 3072), (14333,), (4778, 3072), (4778,), 19111)

In [10]:
# Wrap the 4-D train/test arrays in MXNet iterators; only the training
# iterator shuffles its samples.
batch_size = 64
train_iters = mx.io.NDArrayIter(
    data=to4d(X_train_array),
    label=Y_train_array,
    batch_size=batch_size,
    shuffle=True,
    label_name='softmax_label',
)
val_iters = mx.io.NDArrayIter(
    data=to4d(X_test_array),
    label=Y_test_array,
    batch_size=batch_size,
    label_name='softmax_label',
)

# 2. Load pretrained model

In [11]:
def get_model(prefix, epoch):
    """
    Download the two files that make up a pretrained checkpoint.

    prefix: URL prefix of the checkpoint (everything before "-symbol.json")
    epoch: checkpoint epoch number, zero-padded to four digits in the
           parameter file name
    """
    symbol_url = prefix + '-symbol.json'
    download(symbol_url)
    params_url = prefix + '-%04d.params' % (epoch,)
    download(params_url)

# Fetch the pretrained ResNet-50 files (skipped when they already exist
# locally) and load the epoch-0 checkpoint: symbol, argument parameters and
# auxiliary parameters.
get_model('http://data.mxnet.io/models/imagenet/resnet/50-layers/resnet-50', 0)
sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-50', 0)

In [12]:
def get_fine_tune_model(symbol, arg_params, num_classes, layer_name='flatten0'):
    """
    Build a fine-tuning network by replacing the head of a pretrained one.

    symbol: the pretrained network symbol
    arg_params: the argument parameters of the pretrained model
    num_classes: the number of classes for the fine-tune datasets
    layer_name: the layer name before the last fully-connected layer

    Returns (new symbol, pretrained arguments without the 'fc1' entries).
    """
    # Cut the pretrained graph at the output of `layer_name`.
    feature_output = symbol.get_internals()[layer_name + '_output']

    # Attach a fresh fully-connected layer and softmax output on top.
    fc_layer = mx.symbol.FullyConnected(data=feature_output, num_hidden=num_classes, name='fc1')
    softmax_head = mx.symbol.SoftmaxOutput(data=fc_layer, name='softmax')

    # Keep every pretrained argument except those whose name contains 'fc1',
    # the name given to the new layer.
    kept_params = {}
    for key in arg_params:
        if 'fc1' in key:
            continue
        kept_params[key] = arg_params[key]
    return softmax_head, kept_params

In [13]:
import logging
# Log format: timestamp followed by the message.
head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)

# Number of training epochs; read as a global by fit() and reused as the
# epoch number when checkpoints are saved.
epoch = 50

def fit(symbol, arg_params, aux_params, train, val, batch_size, num_gpus):
    """
    Fine-tune `symbol` on `train` and return the trained module.

    symbol: the network symbol to train
    arg_params: pretrained argument parameters used to initialise the network
    aux_params: pretrained auxiliary parameters
    train: training data iterator
    val: validation data iterator passed to Module.fit as evaluation data
    batch_size: batch size reported to the Speedometer callback; the same
                value is also used as its logging interval (in batches)
    num_gpus: number of GPUs to train on; 0 falls back to the CPU

    Training runs for the module-level `epoch` number of epochs with SGD at
    a learning rate of 0.01.
    """
    # An empty device list would leave the module with nothing to run on,
    # so fall back to the CPU when no GPU is requested.
    devs = [mx.gpu(i) for i in range(num_gpus)] or [mx.cpu()]
    mod = mx.mod.Module(symbol=symbol, context=devs)
    mod.fit(train, val,
        num_epoch=epoch,
        arg_params=arg_params,
        aux_params=aux_params,
        # The replaced head has no pretrained weights, so parameters missing
        # from arg_params must be tolerated.
        allow_missing=True,
        batch_end_callback = mx.callback.Speedometer(batch_size, batch_size),
        kvstore='device',
        optimizer='sgd',
        optimizer_params={'learning_rate':0.01},
        initializer=mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2),
        eval_metric='acc')
    return mod

In [14]:
# Fine-tune the loaded ResNet-50 for binary classification on one GPU.
num_classes = 2
batch_per_gpu = 64
num_gpus = 1

# Replace the pretrained head with a 2-class fully-connected + softmax head.
(new_sym, new_args) = get_fine_tune_model(sym, arg_params, num_classes)

# NOTE(review): train_iters/val_iters were created earlier with their own
# batch size of 64; this value only matches them while
# batch_per_gpu * num_gpus == 64.
batch_size = batch_per_gpu * num_gpus
mod = fit(new_sym, new_args, aux_params, train_iters, val_iters, batch_size, num_gpus)


2018-04-26 19:34:01,021 Epoch[0] Batch [64]	Speed: 1239.40 samples/sec	accuracy=0.588702
2018-04-26 19:34:04,376 Epoch[0] Batch [128]	Speed: 1221.87 samples/sec	accuracy=0.613525
2018-04-26 19:34:07,637 Epoch[0] Batch [192]	Speed: 1256.66 samples/sec	accuracy=0.631836
2018-04-26 19:34:09,210 Epoch[0] Train-accuracy=0.656754
2018-04-26 19:34:09,213 Epoch[0] Time cost=11.631
2018-04-26 19:34:10,584 Epoch[0] Validation-accuracy=0.655833
2018-04-26 19:34:13,893 Epoch[1] Batch [64]	Speed: 1244.04 samples/sec	accuracy=0.688702
2018-04-26 19:34:17,014 Epoch[1] Batch [128]	Speed: 1313.55 samples/sec	accuracy=0.702637
2018-04-26 19:34:20,149 Epoch[1] Batch [192]	Speed: 1306.60 samples/sec	accuracy=0.715088
2018-04-26 19:34:21,657 Epoch[1] Train-accuracy=0.736895
2018-04-26 19:34:21,659 Epoch[1] Time cost=11.074
2018-04-26 19:34:23,009 Epoch[1] Validation-accuracy=0.714167
2018-04-26 19:34:26,202 Epoch[2] Batch [64]	Speed: 1289.55 samples/sec	accuracy=0.766827
2018-04-26 19:34:29,381 Epoch[2] Ba

2018-04-26 19:37:55,870 Epoch[18] Train-accuracy=0.997984
2018-04-26 19:37:55,871 Epoch[18] Time cost=11.276
2018-04-26 19:37:57,331 Epoch[18] Validation-accuracy=0.743333
2018-04-26 19:38:00,619 Epoch[19] Batch [64]	Speed: 1251.82 samples/sec	accuracy=0.998317
2018-04-26 19:38:03,869 Epoch[19] Batch [128]	Speed: 1260.81 samples/sec	accuracy=0.997314
2018-04-26 19:38:07,150 Epoch[19] Batch [192]	Speed: 1249.03 samples/sec	accuracy=0.997559
2018-04-26 19:38:08,790 Epoch[19] Train-accuracy=0.995464
2018-04-26 19:38:08,791 Epoch[19] Time cost=11.460
2018-04-26 19:38:10,303 Epoch[19] Validation-accuracy=0.747500
2018-04-26 19:38:13,568 Epoch[20] Batch [64]	Speed: 1261.99 samples/sec	accuracy=0.997356
2018-04-26 19:38:16,845 Epoch[20] Batch [128]	Speed: 1250.90 samples/sec	accuracy=0.998047
2018-04-26 19:38:20,050 Epoch[20] Batch [192]	Speed: 1278.07 samples/sec	accuracy=0.997803
2018-04-26 19:38:21,621 Epoch[20] Train-accuracy=0.994960
2018-04-26 19:38:21,623 Epoch[20] Time cost=11.319
201

2018-04-26 19:41:52,972 Epoch[37] Batch [128]	Speed: 1274.49 samples/sec	accuracy=1.000000
2018-04-26 19:41:56,230 Epoch[37] Batch [192]	Speed: 1257.26 samples/sec	accuracy=1.000000
2018-04-26 19:41:57,869 Epoch[37] Train-accuracy=1.000000
2018-04-26 19:41:57,870 Epoch[37] Time cost=11.340
2018-04-26 19:41:59,380 Epoch[37] Validation-accuracy=0.749583
2018-04-26 19:42:02,702 Epoch[38] Batch [64]	Speed: 1239.07 samples/sec	accuracy=1.000000
2018-04-26 19:42:05,958 Epoch[38] Batch [128]	Speed: 1258.79 samples/sec	accuracy=1.000000
2018-04-26 19:42:09,199 Epoch[38] Batch [192]	Speed: 1264.31 samples/sec	accuracy=1.000000
2018-04-26 19:42:10,756 Epoch[38] Train-accuracy=1.000000
2018-04-26 19:42:10,758 Epoch[38] Time cost=11.377
2018-04-26 19:42:12,166 Epoch[38] Validation-accuracy=0.749583
2018-04-26 19:42:15,425 Epoch[39] Batch [64]	Speed: 1264.17 samples/sec	accuracy=1.000000
2018-04-26 19:42:18,616 Epoch[39] Batch [128]	Speed: 1284.43 samples/sec	accuracy=1.000000
2018-04-26 19:42:21,7

In [15]:
# Score the trained module on the validation iterator with an accuracy metric.
metric = mx.metric.Accuracy()
mod_score = mod.score(val_iters, metric)

In [16]:
# Display the (metric name, value) pairs returned by mod.score.
mod_score

[('accuracy', 0.7504166666666666)]

In [17]:
# Save the fine-tuned model as a checkpoint named after `prefix` and `epoch`;
# with epoch = 50 the parameters are written to "res-52-0050.params".
# NOTE(review): the prefix says 'res-52' although the network loaded above is
# resnet-50 - probably a typo; renaming it changes the output file names.
prefix = 'res-52'
mod.save_checkpoint(prefix, epoch)

2018-04-26 19:44:35,684 Saved checkpoint to "res-52-0050.params"


In [18]:
# Print the classification report and confusion matrix for the fine-tuned
# ResNet-50 on the validation data.
# NOTE(review): the logged message says "Res 52"; presumably "Res 50" is meant.
modelMetric(mod, 'Finish predict of Res 52...')

2018-04-26 19:44:36,829 Finish predict of Res 52...
2018-04-26 19:44:36,838 Test accuracy = 0.750314


             precision    recall  f1-score   support

        0.0       0.80      0.73      0.76      2610
        1.0       0.70      0.77      0.74      2168

avg / total       0.75      0.75      0.75      4778

[[1905  705]
 [ 488 1680]]


In [20]:
# Render the fine-tuned network graph to a PNG file ('Res50.png'), using the
# input shape (batch, 3, 32, 32).
shape = {"data" : (batch_size, 3, 32, 32)}
network_chart=mx.viz.plot_network(symbol=new_sym, shape=shape)
network_chart.format = 'png'
network_chart.render('Res50')

'Res50.png'

<img src='Res50.png'>

In [15]:
# Fetch the pretrained ResNet-152 files (skipped when they already exist
# locally) and load the epoch-0 checkpoint; this overwrites sym, arg_params
# and aux_params from the ResNet-50 run.
# NOTE(review): the URL contains a double slash ("152-layers//resnet-152");
# the recorded run succeeded with it, but a single slash is presumably intended.
get_model('http://data.mxnet.io/models/imagenet/resnet/152-layers//resnet-152', 0)
sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-152', 0)

In [16]:
# Fine-tune the loaded ResNet-152 with the same settings as the ResNet-50 run.
num_classes = 2
batch_per_gpu = 64
num_gpus = 1

# Replace the pretrained head with a 2-class fully-connected + softmax head.
(new_sym, new_args) = get_fine_tune_model(sym, arg_params, num_classes)

batch_size = batch_per_gpu * num_gpus
# `mod` now refers to the ResNet-152 module, replacing the ResNet-50 one.
mod = fit(new_sym, new_args, aux_params, train_iters, val_iters, batch_size, num_gpus)

2018-04-26 19:49:51,121 Epoch[0] Batch [64]	Speed: 517.93 samples/sec	accuracy=0.594471
2018-04-26 19:49:59,033 Epoch[0] Batch [128]	Speed: 517.89 samples/sec	accuracy=0.613770
2018-04-26 19:50:06,877 Epoch[0] Batch [192]	Speed: 522.35 samples/sec	accuracy=0.639648
2018-04-26 19:50:10,629 Epoch[0] Train-accuracy=0.685988
2018-04-26 19:50:10,631 Epoch[0] Time cost=27.712
2018-04-26 19:50:14,485 Epoch[0] Validation-accuracy=0.688125
2018-04-26 19:50:22,274 Epoch[1] Batch [64]	Speed: 528.84 samples/sec	accuracy=0.696154
2018-04-26 19:50:30,018 Epoch[1] Batch [128]	Speed: 529.07 samples/sec	accuracy=0.730957
2018-04-26 19:50:37,732 Epoch[1] Batch [192]	Speed: 531.11 samples/sec	accuracy=0.748779
2018-04-26 19:50:41,488 Epoch[1] Train-accuracy=0.779234
2018-04-26 19:50:41,489 Epoch[1] Time cost=27.003
2018-04-26 19:50:45,227 Epoch[1] Validation-accuracy=0.723958
2018-04-26 19:50:53,116 Epoch[2] Batch [64]	Speed: 522.28 samples/sec	accuracy=0.793750
2018-04-26 19:51:00,962 Epoch[2] Batch [12

2018-04-26 19:59:33,219 Epoch[18] Time cost=27.273
2018-04-26 19:59:37,150 Epoch[18] Validation-accuracy=0.744792
2018-04-26 19:59:44,973 Epoch[19] Batch [64]	Speed: 526.49 samples/sec	accuracy=0.998077
2018-04-26 19:59:52,813 Epoch[19] Batch [128]	Speed: 522.55 samples/sec	accuracy=1.000000
2018-04-26 20:00:00,814 Epoch[19] Batch [192]	Speed: 512.04 samples/sec	accuracy=0.998291
2018-04-26 20:00:04,768 Epoch[19] Train-accuracy=0.999496
2018-04-26 20:00:04,770 Epoch[19] Time cost=27.619
2018-04-26 20:00:08,807 Epoch[19] Validation-accuracy=0.743958
2018-04-26 20:00:16,747 Epoch[20] Batch [64]	Speed: 518.89 samples/sec	accuracy=0.999038
2018-04-26 20:00:24,492 Epoch[20] Batch [128]	Speed: 529.02 samples/sec	accuracy=0.999023
2018-04-26 20:00:32,288 Epoch[20] Batch [192]	Speed: 525.48 samples/sec	accuracy=0.999756
2018-04-26 20:00:36,000 Epoch[20] Train-accuracy=0.998488
2018-04-26 20:00:36,001 Epoch[20] Time cost=27.192
2018-04-26 20:00:39,925 Epoch[20] Validation-accuracy=0.746250
2018

2018-04-26 20:09:23,905 Epoch[37] Batch [192]	Speed: 526.47 samples/sec	accuracy=1.000000
2018-04-26 20:09:27,713 Epoch[37] Train-accuracy=1.000000
2018-04-26 20:09:27,714 Epoch[37] Time cost=27.027
2018-04-26 20:09:31,563 Epoch[37] Validation-accuracy=0.744583
2018-04-26 20:09:39,361 Epoch[38] Batch [64]	Speed: 528.16 samples/sec	accuracy=1.000000
2018-04-26 20:09:47,090 Epoch[38] Batch [128]	Speed: 530.03 samples/sec	accuracy=1.000000
2018-04-26 20:09:54,843 Epoch[38] Batch [192]	Speed: 528.37 samples/sec	accuracy=1.000000
2018-04-26 20:09:58,590 Epoch[38] Train-accuracy=1.000000
2018-04-26 20:09:58,591 Epoch[38] Time cost=27.027
2018-04-26 20:10:02,443 Epoch[38] Validation-accuracy=0.744583
2018-04-26 20:10:10,335 Epoch[39] Batch [64]	Speed: 521.74 samples/sec	accuracy=1.000000
2018-04-26 20:10:18,044 Epoch[39] Batch [128]	Speed: 531.42 samples/sec	accuracy=1.000000
2018-04-26 20:10:25,811 Epoch[39] Batch [192]	Speed: 527.45 samples/sec	accuracy=1.000000
2018-04-26 20:10:29,564 Epoc

In [17]:
# Score the trained ResNet-152 module on the validation iterator with an
# accuracy metric.
metric = mx.metric.Accuracy()
mod_score = mod.score(val_iters, metric)

In [18]:
# Save the fine-tuned model as a checkpoint named after `prefix` and `epoch`;
# with epoch = 50 the parameters are written to "res-152-0050.params".
prefix = 'res-152'
mod.save_checkpoint(prefix, epoch)

2018-04-26 20:15:48,038 Saved checkpoint to "res-152-0050.params"


In [19]:
# Print the classification report and confusion matrix for the fine-tuned
# ResNet-152 on the validation data.
modelMetric(mod, 'Finish predict of Res 152...')

2018-04-26 20:15:51,212 Finish predict of Res 52...
2018-04-26 20:15:51,221 Test accuracy = 0.744872


             precision    recall  f1-score   support

        0.0       0.77      0.76      0.76      2595
        1.0       0.72      0.73      0.72      2183

avg / total       0.75      0.74      0.75      4778

[[1966  629]
 [ 590 1593]]


In [23]:
# Render the fine-tuned network graph to a PNG file ('Res152.png'), using the
# input shape (batch, 3, 32, 32).
shape = {"data" : (batch_size, 3, 32, 32)}
network_chart=mx.viz.plot_network(symbol=new_sym, shape=shape)
network_chart.format = 'png'
network_chart.render('Res152')

'Res152.png'

<img src='Res152.png'>