In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import pickle
from itertools import chain
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

## Import cleaned data

In [3]:
with open("./clean_review77_83.pickle", "rb") as input_file:
        data2 = pickle.load(input_file)
with open("./word_freq2.pickle", "rb") as input_file:
        word_freq = pickle.load(input_file)
with open("./lable2.pickle", "rb") as input_file:
        label2 = pickle.load(input_file) # useful > 1, binary label
label2 = label2.astype(int)

## Classification Models

### Logistic regression

In [4]:
logistic = LogisticRegression(penalty='l1', dual=False, tol=0.0001, 
                              C=1.0, fit_intercept=True, intercept_scaling=1, 
                              class_weight=None, random_state=None, solver='liblinear', 
                              max_iter=1000, multi_class='ovr', verbose=0, 
                              warm_start=False, n_jobs=1)

In [13]:
#### split train and test
X_train, X_test, y_train, y_test = train_test_split(np.array(word_freq), np.array(label2), test_size=0.3, random_state=101)

param_dist1 = {'C': [0.001,0.01,0.1,1,10,100], 'penalty': ['l1', 'l2']}

logi = LogisticRegression()
logi_cv = GridSearchCV(logi, param_dist1, cv=5)
logi_cv.fit(X_train, y_train)

logi_model = LogisticRegression(C = logi_cv.best_params_['C'], penalty = logi_cv.best_params_['penalty'])
logi_model.fit(X_train, y_train)
train_acc = logi_model.score(X_train, y_train)
test_acc = logi_model.score(X_test, y_test)

print('Training Score: ', train_acc)
print('Test Score: ', test_acc)



Training Score:  0.7779488329524998
Test Score:  0.7757177202658747


In [11]:
logi_cv.best_params_['C']

0.01

### NN

In [14]:
# one hot encoding
onehotlabels = []
labels = [[1,0], [0,1]]
for i in label2:
    onehotlabels.append(labels[i])

X_train, X_test, y_train, y_test = train_test_split(np.array(word_freq), np.array(onehotlabels), test_size=0.3, random_state=101)

# network parameters
learning_rate = 0.001
batch_size = 128
n_batch = len(X_train) // batch_size
display_step = 200

words_length = 297
num_hidden = 300 # hidden layer num of features
num_classes = 2 
vocab_size = words_length
embed_sz = 100

### vanila multi-layer NN

In [20]:
X = tf.placeholder(tf.float32, [None, words_length])
Y = tf.placeholder(tf.int32, [None,num_classes])

h0 = tf.layers.dense(X, units=num_hidden, activation = tf.nn.relu) #x dtype = float32
h1 = tf.layers.dense(h0, num_hidden, activation = tf.nn.relu)
logits = tf.layers.dense(h1, num_classes, activation = tf.nn.sigmoid)

prediction = tf.nn.softmax(logits)

# Define loss and optimizer
loss_op = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=tf.argmax(Y, 1)))
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss_op)

# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [21]:
with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    total_loss, step, acc = 0, 0, 0
    for start, end in zip(range(0, n_batch*batch_size-batch_size, batch_size),
                              range(batch_size, n_batch*batch_size, batch_size)):


        loss_, _, acc_ = sess.run([loss_op,train_op,accuracy], feed_dict={X: X_train[start:end].astype(int), 
                                                                        Y: y_train[start:end]})
        step += 1
        total_loss += loss_
        acc += acc_
        if step % display_step == 0 or step == 1:
            print("Step " + str(step) + ", Minibatch Loss= " + \
                  "{:.4f}".format(total_loss / step) + ", Training Accuracy= " + \
                  "{:.3f}".format(acc/))

    print("Optimization Finished!")

    # Calculate testing accuracy
    test_data = X_test
    test_label = y_test
    print("Testing Accuracy:", \
        sess.run(accuracy, feed_dict={X: test_data, Y: test_label}))
    

Step 1, Minibatch Loss= 0.6772, Training Accuracy= 137.500
Step 200, Minibatch Loss= 0.5369, Training Accuracy= 155.797
Step 400, Minibatch Loss= 0.5374, Training Accuracy= 155.426
Step 600, Minibatch Loss= 0.5362, Training Accuracy= 155.505
Step 800, Minibatch Loss= 0.5353, Training Accuracy= 155.602
Step 1000, Minibatch Loss= 0.5349, Training Accuracy= 155.575
Step 1200, Minibatch Loss= 0.5345, Training Accuracy= 155.617
Optimization Finished!
Testing Accuracy: 0.77556217


### Bi-directional  RNN

In [None]:
X = tf.placeholder(tf.int32, [None, words_length])
Y = tf.placeholder(tf.int32, [None, num_classes])
weights = tf.Variable(tf.random_normal([2*num_hidden, num_classes]))
biases = tf.Variable(tf.random_normal([num_classes]))

In [None]:
def BiRNN(x, weights, biases):

    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, n_input)
    R = tf.Variable(tf.random_normal([vocab_size, embed_sz], stddev = .1))
    embs_R = tf.nn.embedding_lookup(R, x)
    # the input shape of tf.nn.static_bidirectional_rnn should be 3-dimension tensor
    embs_R = tf.unstack(tf.transpose(embs_R, perm=[2, 0, 1]))

    x = tf.reshape(x, [-1, words_length])
    x = tf.split(axis=0, num_or_size_splits=n_batch, value=x)
    
    # Forward direction cell
    lstm_fw_cell = tf.contrib.rnn.GRUCell(num_hidden)
    
    # Backward direction cell
    lstm_bw_cell = tf.contrib.rnn.GRUCell(num_hidden)

    # Get lstm cell output
    try:
        outputs, _, _ = tf.nn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, embs_R, dtype = tf.float32)
    except Exception: # Old TensorFlow version only returns outputs not states
        outputs = tf.nn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, embs_R, dtype = tf.float32)

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights) + biases

In [None]:
logits = BiRNN(X, weights, biases)
prediction = tf.nn.softmax(logits)

# Define loss and optimizer
loss_op = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=tf.argmax(Y, 1)))
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss_op)

# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [None]:
with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    total_loss, step, acc = 0, 0, 0
    for start, end in zip(range(0, n_batch*batch_size-batch_size, batch_size),
                              range(batch_size, n_batch*batch_size, batch_size)):


        loss_, _, acc = sess.run([loss_op,train_op,accuracy], feed_dict={X: X_train[start:end].astype(int), 
                                                                        Y: y_train[start:end]})
        step += 1
        total_loss += loss_
        if step % display_step == 0 or step == 1:
            print("Step " + str(step) + ", Minibatch Loss= " + \
                  "{:.4f}".format(total_loss / step) + ", Training Accuracy= " + \
                  "{:.3f}".format(acc))

    print("Optimization Finished!")

    # Calculate testing accuracy
    test_data = X_test
    test_label = y_test
    print("Testing Accuracy:", \
        sess.run(accuracy, feed_dict={X: test_data, Y: test_label}))
    