# Character CNN Experiment

In [10]:
import sys

import tensorflow as tf
import numpy as np
from sklearn import metrics
import pandas as pd

In [11]:
# Short alias for the tf.contrib.learn namespace; the notebook reaches
# datasets, preprocessing.ByteProcessor and Estimator through it.
learn = tf.contrib.learn

In [12]:
# Number of byte ids kept per document: handed to ByteProcessor and used as
# the sequence axis when the one-hot input is reshaped in char_cnn_model.
MAX_DOCUMENT_LENGTH = 100
# Number of output filters requested from each convolution layer.
N_FILTERS = 10
# Kernel size of the first convolution; 256 matches the one-hot depth per byte.
FILTER_SHAPE1 = [20, 256]
# Kernel size of the second convolution; its second extent equals N_FILTERS
# because the first layer's output is transposed before being convolved again.
FILTER_SHAPE2 = [20, N_FILTERS]
# Window and stride of the max-pool applied along the sequence axis.
POOLING_WINDOW = 4
POOLING_STRIDE = 2

In [4]:
# Character-level CNN for the DBpedia task:
# 2 convolution layers + 1 fully connected (linear) layer.

tf.logging.set_verbosity(tf.logging.INFO)
def char_cnn_model(features, target):
    '''Character-level CNN model_fn that scores documents over 15 classes.

    Args:
        features: integer byte ids. They are one-hot encoded and reshaped to
            [n_samples, MAX_DOCUMENT_LENGTH, 256, 1], so every document must
            contribute exactly MAX_DOCUMENT_LENGTH ids.
        target: integer class ids; one-hot encoded to depth 15 inside.

    Returns:
        A tuple (predictions, loss, train_op). `predictions` is a dict with
        the arg-max class under 'class' and the softmax output under 'prob'.
    '''
    n_classes = 15       # depth of the one-hot target and width of the logits
    n_byte_values = 256  # one-hot depth used for each input byte id

    # tf.one_hot(indices, depth, on_value, off_value): 1 marks the true class
    target = tf.one_hot(target, n_classes, 1, 0)
    # one-hot every byte id, then add a trailing channel axis:
    # [n_samples, MAX_DOCUMENT_LENGTH, n_byte_values, 1]
    byte_list = tf.reshape(
        tf.one_hot(features, n_byte_values),
        [-1, MAX_DOCUMENT_LENGTH, n_byte_values, 1]
    )

    # NOTE(review): every op below is pinned to the CPU — confirm this is
    # intended if the notebook is meant to run on a GPU.
    with tf.device('/cpu:0'):
        with tf.variable_scope('CNN_layer_1'):
            # the kernel spans the full one-hot width, so with VALID padding
            # the byte axis collapses to width 1
            conv1 = tf.contrib.layers.convolution2d(
                byte_list,
                N_FILTERS,
                FILTER_SHAPE1,
                padding='VALID')
            # NOTE(review): contrib convolution2d appears to apply ReLU by
            # default already, which would make this call a no-op — confirm
            # before removing it.
            conv1 = tf.nn.relu(conv1)
            # max-pool along the sequence axis only
            pool1 = tf.nn.max_pool(
                conv1,
                ksize=[1, POOLING_WINDOW, 1, 1],
                strides=[1, POOLING_STRIDE, 1, 1],
                padding='SAME')
            # swap the last two axes so the filter axis becomes the width
            # that the second convolution slides over
            pool1 = tf.transpose(pool1, [0, 1, 3, 2])
        with tf.variable_scope('CNN_layer_2'):
            conv2 = tf.contrib.layers.convolution2d(
                pool1,
                N_FILTERS,
                FILTER_SHAPE2,
                padding='VALID')
            # take the max over the sequence axis (axis 1) for each filter,
            # then drop the size-1 axis left at index 1 so that pool2 is
            # [n_samples, N_FILTERS]; `axis` replaces the deprecated
            # `squeeze_dims` keyword
            pool2 = tf.squeeze(tf.reduce_max(conv2, 1), axis=[1])

        # fully connected linear classifier of WX+B (no activation)
        logits = tf.contrib.layers.fully_connected(
            pool2, n_classes, activation_fn=None)
        loss = tf.losses.softmax_cross_entropy(target, logits)

        # training op
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)

    return ({
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }, loss, train_op)

In [5]:
def dbpediaExperiment():
    '''Train char_cnn_model on the small DBpedia set and predict the test split.

    Returns:
        Whatever classifier.predict(..., as_iterable=True) returns for the
        test documents; each item is the model's prediction dict with keys
        'class' and 'prob'.
        NOTE(review): this looks like a one-shot generator, so callers that
        need more than one pass should wrap it in list(...) — confirm.
    '''
    dbpedia = learn.datasets.load_dataset(
        'dbpedia', test_with_fake_data=False, size='small')
    # column 1 of the raw rows is used as the document text
    x_train = pd.DataFrame(dbpedia.train.data)[1]
    y_train = pd.Series(dbpedia.train.target)
    x_test = pd.DataFrame(dbpedia.test.data)[1]

    # turn every document into a row of MAX_DOCUMENT_LENGTH byte ids
    char_processor = learn.preprocessing.ByteProcessor(MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(char_processor.fit_transform(x_train)))  # shape = [n_sample, n_doc_length]
    # the test split is only transformed, never fitted on
    x_test = np.array(list(char_processor.transform(x_test)))

    classifier = learn.Estimator(model_fn=char_cnn_model)
    # only a single optimisation step is run here
    classifier.fit(x_train, y_train, steps=1)

    return classifier.predict(x_test, as_iterable=True)

In [None]:
# Run the DBpedia experiment and score it.
# The predictions are materialised once: the iterable returned with
# as_iterable=True appears to be one-shot, so a second pass over it would
# come back empty.
preds = list(dbpediaExperiment())
class_preds = [p['class'] for p in preds]
prob_preds = [p['prob'] for p in preds]

# y_test only exists inside dbpediaExperiment(), so load the test labels again
# here with the same load_dataset arguments.
y_test = pd.Series(
    learn.datasets.load_dataset(
        'dbpedia', test_with_fake_data=False, size='small').test.target)
score = metrics.accuracy_score(y_test, class_preds)
print('Accuracy:{}'.format(score))

# Applying this to WikiTalk

In [16]:
import Load_Data
load_data = False

if(load_data == True):
    %time DATA = Load_Data.load_Onehot_train_test_split()
    training = DATA['Training']
    testing = DATA['Testing']
else:
    training = pd.read_csv('trainingWiki.csv')
    testing = pd.read_csv('testingWiki.csv')


# Rebuild the Model for Binary Classification

In [17]:
# Hyper-parameters for the WikiTalk model. These rebind the module-level names
# set earlier in the notebook, so anything run after this cell sees these values.
MAX_DOCUMENT_LENGTH = 200        # byte ids kept per document
N_FILTERS = 10                   # output filters per convolution layer
FILTER_SHAPE1 = [20, 256]        # first kernel; 256 matches the one-hot depth
FILTER_SHAPE2 = [20, N_FILTERS]  # second kernel, applied after the transpose
POOLING_WINDOW = 4               # max-pool window along the sequence axis
POOLING_STRIDE = 2               # max-pool stride along the sequence axis
N_CLASSES = 2                    # depth of the one-hot target / logits width
N_STEPS = 30                     # training steps used by wikiExperiment

tf.logging.set_verbosity(tf.logging.INFO)
def wiki_char_cnn_model(features, target):
    '''Character-level CNN model_fn that scores documents over N_CLASSES classes.

    Args:
        features: integer byte ids. They are one-hot encoded and reshaped to
            [n_samples, MAX_DOCUMENT_LENGTH, 256, 1], so every document must
            contribute exactly MAX_DOCUMENT_LENGTH ids.
        target: integer class ids; one-hot encoded to depth N_CLASSES inside.

    Returns:
        A tuple (predictions, loss, train_op). `predictions` is a dict with
        the arg-max class under 'class' and the softmax output under 'prob'.
    '''
    target = tf.one_hot(target, N_CLASSES, 1, 0)
    # one-hot every byte id, then add a trailing channel axis
    byte_list = tf.reshape(
        tf.one_hot(features, 256),
        [-1, MAX_DOCUMENT_LENGTH, 256, 1]
    )

    # NOTE(review): every op below is pinned to the CPU — confirm this is
    # intended if the notebook is meant to run on a GPU.
    with tf.device('/cpu:0'):
        with tf.variable_scope('CNN_layer_1'):
            # the kernel spans the full one-hot width, so with VALID padding
            # the byte axis collapses to width 1
            conv1 = tf.contrib.layers.convolution2d(
                byte_list,
                N_FILTERS,
                FILTER_SHAPE1,
                padding='VALID')
            # NOTE(review): contrib convolution2d appears to apply ReLU by
            # default already, which would make this call a no-op — confirm
            # before removing it.
            conv1 = tf.nn.relu(conv1)
            # max-pool along the sequence axis only
            pool1 = tf.nn.max_pool(
                conv1,
                ksize=[1, POOLING_WINDOW, 1, 1],
                strides=[1, POOLING_STRIDE, 1, 1],
                padding='SAME')
            # swap the last two axes so the filter axis becomes the width
            # that the second convolution slides over
            pool1 = tf.transpose(pool1, [0, 1, 3, 2])
        with tf.variable_scope('CNN_layer_2'):
            conv2 = tf.contrib.layers.convolution2d(
                pool1,
                N_FILTERS,
                FILTER_SHAPE2,
                padding='VALID')
            # max over the sequence axis (axis 1) for each filter, then drop
            # the size-1 axis left at index 1; `axis` replaces the deprecated
            # `squeeze_dims` keyword
            pool2 = tf.squeeze(tf.reduce_max(conv2, 1), axis=[1])

        # fully connected linear classifier of WX+B (no activation)
        logits = tf.contrib.layers.fully_connected(
            pool2, N_CLASSES, activation_fn=None)
        loss = tf.losses.softmax_cross_entropy(target, logits)

        # training op
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)

    return ({
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }, loss, train_op)


In [51]:
def wikiExperiment():
    '''Train wiki_char_cnn_model on the WikiTalk data and print test accuracy.

    Reads the module-level `training` and `testing` frames (columns `Text`
    and `Category`), labels 'Attack' rows 0 and every other row 1, fits the
    estimator for N_STEPS steps with batch_size=10 and prints the accuracy
    on the test frame.

    Returns:
        list of the per-document prediction dicts (keys 'class' and 'prob').
        A list is returned rather than the raw predict() iterable so the
        caller can still iterate over it after the accuracy was computed.
    '''
    def encode_labels(categories):
        # 'Attack' -> 0, anything else -> 1
        return categories.apply(lambda x: 0 if x == 'Attack' else 1)

    char_processor = learn.preprocessing.ByteProcessor(MAX_DOCUMENT_LENGTH)

    train_x = np.array(list(char_processor.fit_transform(training.Text)))
    train_y = encode_labels(training.Category)

    # the test split is only transformed, never fitted on
    test_x = np.array(list(char_processor.transform(testing.Text)))
    test_y = encode_labels(testing.Category)

    classifier = learn.Estimator(model_fn=wiki_char_cnn_model)
    classifier.fit(train_x, train_y, steps=N_STEPS, batch_size=10)

    # Materialise the predictions once: the iterable returned with
    # as_iterable=True appears to be one-shot, so reading it for the class
    # list would otherwise leave nothing to return.
    wikiPreds = list(classifier.predict(test_x, as_iterable=True))

    class_preds = [p['class'] for p in wikiPreds]
    # NOTE(review): the original carried a "# wrong class" remark on this
    # line — double-check that the Attack -> 0 mapping is the intended one.
    score = metrics.accuracy_score(test_y, class_preds)
    print('Accuracy:{}'.format(score))

    return wikiPreds

In [48]:
# Trains and evaluates the WikiTalk model via wikiExperiment().
# do not run this without GPU, 
# this is really expensive
# NOTE(review): wiki_char_cnn_model builds its layers under
# tf.device('/cpu:0'), so confirm a GPU is actually used before relying on
# the advice above.
wikiPreds = wikiExperiment();


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_tf_random_seed': None, '_evaluation_master': '', '_master': '', '_save_checkpoints_secs': 600, '_keep_checkpoint_every_n_hours': 10000, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_num_ps_replicas': 0, '_task_id': 0, '_keep_checkpoint_max': 5, '_environment': 'local', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbf266cf278>, '_task_type': None, '_is_chief': True, '_save_summary_steps': 100}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y 

  equality = a == b


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpmu4zlm7a/model.ckpt.
INFO:tensorflow:step = 1, loss = 0.704781
INFO:tensorflow:Saving checkpoints for 30 into /tmp/tmpmu4zlm7a/model.ckpt.
INFO:tensorflow:Loss for final step: 0.345497.
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
