# Sentence sentiment prediction using three models


Some portions of the program adapted from coursera Sequence Modeling course emojify homework.
Change from previous version: use keras for both neural network models.

First run: train/test proportions were incorrectly set to yield small training sets (300) and large test sets (700). Increasing the training sets may benefit the neural networks.

In [1]:
!pwd

/Users/murataydogdu/Desktop/TextualAnalysis


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from random import shuffle
import csv
import time

%matplotlib inline

In [3]:
from numpy.random import seed
from tensorflow import set_random_seed
seed(1)
set_random_seed(1)

In [4]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

In [5]:
from nltk.tokenize import TweetTokenizer, sent_tokenize
tokenizer_words = TweetTokenizer()

In [6]:
from keras.models import Model, Sequential, load_model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

Using TensorFlow backend.


### GloVe: Global vectors for Word representation
https://nlp.stanford.edu/projects/glove/

Convert an input sentence into the word vector representation, which then get averaged together. We will use pretrained 50-dimensional GloVe embeddings. Run the following cell to load the `word_to_vec_map`, which contains all the vector representations.

This will loaded:
- `word_to_index`: dictionary mapping from words to their indices in the vocabulary (400,001 words, with the valid indices ranging from 0 to 400,000)
- `index_to_word`: dictionary mapping from indices to their corresponding words in the vocabulary
- `word_to_vec_map`: dictionary mapping words to their GloVe vector representation.

In [7]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [8]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [9]:
def parse_sent(input_sent):
    # This tokenization retains numbers and punctuation.
    # It makes sure the tokens retained from each sentence are in the dictionary.
    in0 = tokenizer_words.tokenize(input_sent)
    inp = [x.encode('UTF8').lower() for x in in0]
    # This happens with sentences that have commas in them.  
    # In that case, the first and last character of the line is a quotation.
    if inp[-1] == '"':
        inp = inp[:-1] 
    #print ("Input:", inp)

    valid_sent = []
    for word in inp:
        try:
            exists = word_to_index[word]
            valid_sent.append(word)
        except:
            # Ignore this word 
            x = 0
    valid_sent = ' '.join(valid_sent)        
    #print ("Output:", valid_sent)
    return valid_sent

In [10]:
# Read the labelled input data
# which will be used for training and testing.
# The sentences are already shuffled so sentences from one filing are not in sequence.
# Not all of the sentences are labelled:
# those that are labelled have the first field coded as 0/1/2
# the rest are labelled 5.
with open('new_trtest_10K_labelled.csv', 'rb') as f:
    reader = csv.reader(f)
    sentences = list(reader)
    
# Grab the labelled sentences and store them in arrays
y, trfile, sent_cnt, sent_ind, word_cnt, full_s = [],[],[],[],[],[]
for sent in sentences:
    # This is for when the sentence has commas in it. In that case the sentence will be split.
    full_sent = ",".join([str(i) for i in sent])
    
    # Split the sentence using the split characters: ' _*_ '
    inp = full_sent.split(' _*_ ')
    y_ = int(inp[0]) 
    #print y_
    if y_ >= 0 and y_ <= 2: # Not all sentences are labelled: grab the labelled ones only
        trfile_ = inp[1].replace('"', '')
        sent_cnt_ = int(inp[2])
        sent_ind_ = int(inp[3])
        word_cnt_ = int(inp[4])
        # Split the sentence into words, and keep the words that are in GloVe only
        full_sent_ = parse_sent(inp[5])
        #print y_, trfile_, sent_cnt_, sent_ind_, word_cnt_ , full_sent_

        y.append(y_)
        trfile.append(trfile_)
        sent_cnt.append(sent_cnt_)
        sent_ind.append(sent_ind_)
        word_cnt.append(word_cnt_)
        full_s.append([full_sent_])         

In [11]:
print np.unique(y, return_counts=True)

(array([0, 1, 2]), array([193, 479, 328]))


In [12]:
# Randomly select training and test observations
# I used this instead of sckit_learn so that I can use the
# indices on other data items if needed
# returns arrays 
def TrTestSet(X, y, trlen):
    all_ind = range(0, len(X))
    #print all_ind
    train_ind = random.sample(range(0, len(X)-1), trlen)
    test_ind = [i for i in all_ind if i not in train_ind]

    X_test= [X[i][0] for i in test_ind]
    y_test= np.asarray([y[i] for i in test_ind])
    X_train= [X[i][0] for i in train_ind]
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train= np.asarray([y[i] for i in train_ind])
    return X_test, X_train, y_test, y_train

In [13]:
def eval_metrics(actual, predicted, cnt , m , t):   
    res = {}
    res['acc'] = accuracy_score(actual, predicted)
    cls = classification_report(actual, predicted)
    con = confusion_matrix(actual, predicted)
    precision,recall,fscore,support = precision_recall_fscore_support(actual, predicted)
    for i in range(3): # Rows: predicted
        varname = 'prec_'+str(i)
        res[varname] = precision[i]
        varname = 'rec_'+str(i)
        res[varname] = recall[i]
        varname = 'f1_'+str(i)
        res[varname] = fscore[i]
        varname = 'sup_'+str(i)
        res[varname] = support[i]        
    precision,recall,fscore,support = precision_recall_fscore_support(actual, predicted, average='weighted')
    res['prec_all'] = precision
    res['rec_all'] = recall
    res['f1_all'] = fscore
    res['sup_all'] = res['sup_0']+res['sup_1']+res['sup_2']
    for i in range(3): # Rows: predicted
        for j in range(3): # Columns: actual
            varname = 'A'+str(i)+'P'+str(j)
            res[varname] = con[i,j]
    res['rnd_ct'] = cnt
    res['model'] = m
    res['trtest'] = t
    return pd.Series(res).to_frame().transpose()

In [14]:
# Sentiment prediction based on Loughran - McDonald dictionary of financial words.
# In this "model" there is no training.
# Sentiment: Positive / Neutral / Negative
positives = open('data/LoughranMcDonald_Positive.csv', "r").readlines()
positive = [pos.strip().lower().split(',')[0] for pos in positives]
negatives = open('data/LoughranMcDonald_Negative.csv', "r").readlines()
negative = [neg.strip().lower().split(',')[0] for neg in negatives]

In [15]:
def LM_predict(X):
    bins = np.array([-0.1, 0.10, 1.0])
    tot_, perc_ = [], []
    for s in X:
        words = s.split()
        #print (words)
        pos,neg = [], []
        for word in words:
            if word in (positive):
                pos.append(word)
            if word in (negative):
                neg.append(word)  
        pos_len = len(pos)
        neg_len = len(neg)
        tot =  pos_len + neg_len
        if tot == 0:
            perc = 0
        else:
            perc = 1.0 * (pos_len - neg_len) / tot
        tot_.append(tot)
        perc_.append(perc)
    pred_ = np.digitize(perc_, bins, right = True)
    return (tot_, perc_, pred_)

In [16]:
def sentence_to_avg(sentences, word_to_vec_map):
    avgs = []
    for sentence in sentences:
        words = sentence.split()
        # Initialize the average word vector, should have the same shape as your word vectors.
        avg = np.zeros((50,))
        # Step 2: average the word vectors. You can loop over the words in the list "words".
        for w in words:
            avg += word_to_vec_map[w]
        avg = avg / len(words)
        avgs.append(avg)
    np_avgs = np.array(avgs)    
    return np_avgs

In [17]:
#Note: when using the categorical_crossentropy loss, your targets should be in categorical format (e.g. if you have 10 classes, the target for each sample should be a 10-dimensional vector that is all-zeros except for a 1 at the index corresponding to the class of the sample). In order to convert integer targets into categorical targets, you can use the Keras utility to_categorical:
        
#from keras.utils.np_utils import to_categorical

#categorical_labels = to_categorical(int_labels, num_classes=None)

In [18]:
def GV_predict(X_train, X_test, y_train, y_test):
    # For the GV model, we need word vectors averaged into a single vector per sentence
    X_train_avg = sentence_to_avg(X_train, word_to_vec_map)
    X_test_avg = sentence_to_avg(X_test, word_to_vec_map)
    Y_train_oh = np.eye(3)[np.asarray(y_train).reshape(-1)]
    Y_test_oh = np.eye(3)[np.asarray(y_test).reshape(-1)]    

    GV_model = Sequential([
    Dense(50, input_shape=(50,)),
    Dense(3),
    Activation('softmax')
    ])
    #GV_model.summary()
    GV_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])    

    seed(1)
    set_random_seed(1)
    np.random.seed(1)
    GV_model.fit(X_train_avg, Y_train_oh, epochs = 200, batch_size = 32, shuffle=True, verbose=0)  
    
    # Simple GloVe Neural Network (GV) Model 
    # performance on the training and test sets
    #loss, acc = GV_model.evaluate(X_train_avg, Y_train_oh)
    pred_train = np.argmax(GV_model.predict(X_train_avg), axis=1)
    # Save the model so that it can be used on full Item 7s later
    GV_model.save('GV_model.h5')  # creates an HDF5 file '
    #print('GV model performance')
    #print 'Training set accuracy:','%0.2f' % acc
    #loss, acc = GV_model.evaluate(X_test_avg, Y_test_oh)
    pred_test = np.argmax(GV_model.predict(X_test_avg), axis=1)
    #print 'Test set accuracy:','%0.2f' % acc
    from IPython.display import SVG
    from keras.utils.vis_utils import model_to_dot

    SVG(model_to_dot(GV_model).create(prog='dot', format='svg'))    
    
    return pred_train, pred_test

In [19]:
def sentences_to_indices(X, word_to_index, max_len):
    m = X.shape[0]                                   # number of training examples
    # Initialize X_indices as a numpy matrix of zeros and the correct shape 
    X_indices = np.zeros((m,max_len))
    
    for i in range(m):                               # loop over training examples   
        # Split the ith training sentence into a list of words.
        sentence_words =X[i].split()
        #print len(sentence_words), sentence_words
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            X_indices[i, j] = word_to_index[w]
            # Increment j to j + 1
            j = j + 1    
    return X_indices

In [20]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. 
    # Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # Build the embedding layer, it is required before setting the weights of the embedding layer. 
    # Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [21]:
def model(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Emojify-v2 model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(input_shape, dtype = 'int32')
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)   
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a batch of sequences.
    X = LSTM(128, return_sequences = True)(embeddings)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(128, return_sequences = False)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X through a Dense layer with softmax activation to get back a batch of 3-dimensional vectors.
    X = Dense(3)(X)
    # Add a softmax activation
    X = Activation('softmax')(X)
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs = sentence_indices, outputs = X)
    
    return model

In [22]:
def NN_predict(X_train, X_test, y_train, y_test):
    seed(1)
    set_random_seed(1)
    np.random.seed(1)
    NN_model = model((maxLen,), word_to_vec_map, word_to_index)
    #NN_model.summary()
    NN_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
    Y_train_oh = np.eye(3)[np.asarray(y_train).reshape(-1)]
    Y_test_oh = np.eye(3)[np.asarray(y_test).reshape(-1)]    

    NN_model.fit(X_train_indices, Y_train_oh, epochs = 30, batch_size = 32, shuffle=True, verbose = 0)
    # Save the model so that it can be used on full Item 7s later
    NN_model.save('NN_model.h5')  # creates an HDF5 file '

    #del model  # deletes the existing model
    #model = load_model('model2.h5') # 
    # LSTM Neural Network (NN) Model
    X_train_indices = sentences_to_indices(X_train, word_to_index, max_len = maxLen)
    #loss, acc = NN_model.evaluate(X_train_indices, Y_train_oh)
    pred_train = np.argmax(NN_model.predict(X_train_indices), axis=1)

    X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
    #loss, acc = NN_model.evaluate(X_test_indices, Y_test_oh)
    pred_test = np.argmax(NN_model.predict(X_test_indices), axis=1)
    #print 'Test set accuracy:','%0.2f' % acc
    #del NN_model    
    return pred_train, pred_test    

In [23]:
# Main loop: Randomly generate train and test sets, run the three models
# save model performances (both train and test sets) in a dataframe.
# 70% of the observations will be in the test set
trlen = int(len(full_s)*0.7)
maxLen = 55

f_ = 1   # Very first run will create a dataframe
for i_ in range(1,101):
    print 'Trial:', i_, time.ctime()
    X_test, X_train, y_test, y_train = TrTestSet(full_s, y, trlen)
        
    tot_, perc_, pred_ = LM_predict(X_train)
    res = eval_metrics(y_train, pred_, i_, 'LM','TR')
    print 'Finished:', i_, 'LM','TR'
    if f_ == 1:
        res_df = res
        f_ = 0
    else:
        res_df = res_df.append(res, ignore_index=True)    
        
    tot_, perc_, pred_ = LM_predict(X_test)
    res = eval_metrics(y_test, pred_, i_, 'LM','TE')
    print 'Finished:', i_, 'LM','TE'
    if f_ == 1:
        res_df = res
        f_ = 0
    else:
        res_df = res_df.append(res, ignore_index=True)
    
    pred_train, pred_test = GV_predict(X_train, X_test, y_train, y_test)
    res = eval_metrics(y_train, pred_train, i_, 'GV','TR')
    print 'Finished:', i_, 'GV','TR'
    if f_ == 1:
        res_df = res
        f_ = 0
    else:
        res_df = res_df.append(res, ignore_index=True)    
    
    res = eval_metrics(y_test, pred_test, i_, 'GV','TE')   
    print 'Finished:', i_, 'GV','TE'
    if f_ == 1:
        res_df = res
        f_ = 0
    else:
        res_df = res_df.append(res, ignore_index=True)    
        
    pred_train, pred_test = NN_predict(X_train, X_test, y_train, y_test)
    res = eval_metrics(y_train, pred_train, i_, 'NN','TR')
    print 'Finished:', i_, 'NN','TR'
    if f_ == 1:
        res_df = res
        f_ = 0
    else:
        res_df = res_df.append(res, ignore_index=True)    
    
    res = eval_metrics(y_test, pred_test, i_, 'NN','TE') 
    print 'Finished:', i_, 'NN','TE'
    if f_ == 1:
        res_df = res
        f_ = 0
    else:
        res_df = res_df.append(res, ignore_index=True)     
    print 'Trial end:', time.ctime()     

Trial: 1 Tue May 29 19:55:13 2018
Finished: 1 LM TR
Finished: 1 LM TE
Finished: 1 GV TR
Finished: 1 GV TE
Finished: 1 NN TR
Finished: 1 NN TE
Trial end: Tue May 29 19:57:51 2018
Trial: 2 Tue May 29 19:57:51 2018
Finished: 2 LM TR
Finished: 2 LM TE
Finished: 2 GV TR
Finished: 2 GV TE
Finished: 2 NN TR
Finished: 2 NN TE
Trial end: Tue May 29 20:00:29 2018
Trial: 3 Tue May 29 20:00:29 2018
Finished: 3 LM TR
Finished: 3 LM TE
Finished: 3 GV TR
Finished: 3 GV TE
Finished: 3 NN TR
Finished: 3 NN TE
Trial end: Tue May 29 20:02:56 2018
Trial: 4 Tue May 29 20:02:56 2018
Finished: 4 LM TR
Finished: 4 LM TE
Finished: 4 GV TR
Finished: 4 GV TE
Finished: 4 NN TR
Finished: 4 NN TE
Trial end: Tue May 29 20:05:23 2018
Trial: 5 Tue May 29 20:05:23 2018
Finished: 5 LM TR
Finished: 5 LM TE
Finished: 5 GV TR
Finished: 5 GV TE
Finished: 5 NN TR
Finished: 5 NN TE
Trial end: Tue May 29 20:07:50 2018
Trial: 6 Tue May 29 20:07:50 2018
Finished: 6 LM TR
Finished: 6 LM TE
Finished: 6 GV TR
Finished: 6 GV TE
Fini

  'precision', 'predicted', average, warn_for)


Finished: 46 LM TR
Finished: 46 LM TE
Finished: 46 GV TR
Finished: 46 GV TE
Finished: 46 NN TR
Finished: 46 NN TE
Trial end: Tue May 29 22:00:53 2018
Trial: 47 Tue May 29 22:00:53 2018
Finished: 47 LM TR
Finished: 47 LM TE
Finished: 47 GV TR
Finished: 47 GV TE
Finished: 47 NN TR
Finished: 47 NN TE
Trial end: Tue May 29 22:04:02 2018
Trial: 48 Tue May 29 22:04:02 2018
Finished: 48 LM TR
Finished: 48 LM TE
Finished: 48 GV TR
Finished: 48 GV TE
Finished: 48 NN TR
Finished: 48 NN TE
Trial end: Tue May 29 22:07:05 2018
Trial: 49 Tue May 29 22:07:05 2018
Finished: 49 LM TR
Finished: 49 LM TE
Finished: 49 GV TR
Finished: 49 GV TE
Finished: 49 NN TR
Finished: 49 NN TE
Trial end: Tue May 29 22:10:09 2018
Trial: 50 Tue May 29 22:10:09 2018
Finished: 50 LM TR
Finished: 50 LM TE
Finished: 50 GV TR
Finished: 50 GV TE
Finished: 50 NN TR
Finished: 50 NN TE
Trial end: Tue May 29 22:13:16 2018
Trial: 51 Tue May 29 22:13:16 2018
Finished: 51 LM TR
Finished: 51 LM TE
Finished: 51 GV TR
Finished: 51 GV TE

Finished: 90 NN TR
Finished: 90 NN TE
Trial end: Wed May 30 00:37:56 2018
Trial: 91 Wed May 30 00:37:56 2018
Finished: 91 LM TR
Finished: 91 LM TE
Finished: 91 GV TR
Finished: 91 GV TE
Finished: 91 NN TR
Finished: 91 NN TE
Trial end: Wed May 30 00:42:00 2018
Trial: 92 Wed May 30 00:42:00 2018
Finished: 92 LM TR
Finished: 92 LM TE
Finished: 92 GV TR
Finished: 92 GV TE
Finished: 92 NN TR
Finished: 92 NN TE
Trial end: Wed May 30 00:46:02 2018
Trial: 93 Wed May 30 00:46:02 2018
Finished: 93 LM TR
Finished: 93 LM TE
Finished: 93 GV TR
Finished: 93 GV TE
Finished: 93 NN TR
Finished: 93 NN TE
Trial end: Wed May 30 00:50:15 2018
Trial: 94 Wed May 30 00:50:15 2018
Finished: 94 LM TR
Finished: 94 LM TE
Finished: 94 GV TR
Finished: 94 GV TE
Finished: 94 NN TR
Finished: 94 NN TE
Trial end: Wed May 30 00:54:21 2018
Trial: 95 Wed May 30 00:54:21 2018
Finished: 95 LM TR
Finished: 95 LM TE
Finished: 95 GV TR
Finished: 95 GV TE
Finished: 95 NN TR
Finished: 95 NN TE
Trial end: Wed May 30 00:58:26 2018
T

In [24]:
#display(res_df)
from IPython.display import display

pd.options.display.max_columns = None
display(res_df)
res_df.to_csv('res_df.csv',  float_format='%.3f', index=False)

Unnamed: 0,A0P0,A0P1,A0P2,A1P0,A1P1,A1P2,A2P0,A2P1,A2P2,acc,f1_0,f1_1,f1_2,f1_all,model,prec_0,prec_1,prec_2,prec_all,rec_0,rec_1,rec_2,rec_all,rnd_ct,sup_0,sup_1,sup_2,sup_all,trtest
0,59,70,6,66,242,22,18,169,48,0.498571,0.42446,0.596794,0.308682,0.466835,LM,0.412587,0.503119,0.631579,0.528785,0.437037,0.733333,0.204255,0.498571,1,135,330,235,700,TR
1,21,32,5,32,106,11,7,66,20,0.49,0.355932,0.600567,0.310078,0.463219,LM,0.35,0.519608,0.555556,0.497961,0.362069,0.711409,0.215054,0.49,1,58,149,93,300,TE
2,42,42,51,13,274,43,18,64,153,0.67,0.403846,0.771831,0.634855,0.654878,GV,0.575342,0.721053,0.619433,0.658836,0.311111,0.830303,0.651064,0.67,1,135,330,235,700,TR
3,13,11,34,7,118,24,17,35,41,0.573333,0.273684,0.753994,0.427083,0.559792,GV,0.351351,0.719512,0.414141,0.553669,0.224138,0.791946,0.44086,0.573333,1,58,149,93,300,TE
4,15,11,109,0,312,18,4,37,194,0.744286,0.194805,0.904348,0.697842,0.69818,NN,0.789474,0.866667,0.604361,0.76372,0.111111,0.945455,0.825532,0.744286,1,135,330,235,700,TR
5,4,10,44,1,122,26,3,33,57,0.61,0.121212,0.77707,0.518182,0.570016,NN,0.5,0.739394,0.448819,0.603033,0.0689655,0.818792,0.612903,0.61,1,58,149,93,300,TE
6,54,70,9,75,241,18,15,168,50,0.492857,0.389892,0.592866,0.322581,0.464334,LM,0.375,0.503132,0.649351,0.527457,0.406015,0.721557,0.214592,0.492857,2,133,334,233,700,TR
7,26,32,2,23,107,15,10,67,18,0.503333,0.436975,0.609687,0.276923,0.469769,LM,0.440678,0.519417,0.514286,0.502045,0.433333,0.737931,0.189474,0.503333,2,60,145,95,300,TE
8,56,31,46,14,266,54,32,62,139,0.658571,0.476596,0.767677,0.588983,0.652892,GV,0.54902,0.740947,0.58159,0.651438,0.421053,0.796407,0.596567,0.658571,2,133,334,233,700,TR
9,17,19,24,8,116,21,14,26,55,0.626667,0.343434,0.75817,0.564103,0.613768,GV,0.435897,0.720497,0.55,0.609586,0.283333,0.8,0.578947,0.626667,2,60,145,95,300,TE


# STOP HERE

In [25]:
from keras.utils import plot_model
plot_model(NN_model, to_file='NN_model.png')

NameError: name 'NN_model' is not defined

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(NN_model).create(prog='dot', format='svg'))

In [None]:
plots(Y_test, pred_nn_te)

In [None]:
plots(Y_test, pred_lm_te)

In [None]:
plots(Y_test, pred_gv_te)

In [None]:
def plots(y_act, y_pred, title='Confusion matrix', cmap=plt.cm.gray_r): 
    print(pd.crosstab(y_act, y_pred.reshape(len(y_pred),), rownames=['Actual'], colnames=['Predicted'], margins=True))
    df_confusion = pd.crosstab(y_act, y_pred.reshape(y_pred.shape[0],), rownames=['Actual'], colnames=['Predicted'], margins=True)
    df_conf_norm = df_confusion / df_confusion.sum(axis=1)
    plt.matshow(df_confusion, cmap=cmap) 
    plt.colorbar()
    tick_marks = np.arange(len(df_confusion.columns))
    plt.xticks(tick_marks, df_confusion.columns) 
    plt.yticks(tick_marks, df_confusion.index)
    plt.ylabel(df_confusion.index.name)
    plt.xlabel(df_confusion.columns.name)

In [None]:
show_results(Y_test, pred_lm_te)

In [None]:
show_results(Y_test, pred_nn_te)

In [None]:
show_results(Y_test, pred_gv_te)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix#, accuracy_score
# Inputs are 0/1/2, not the one-hot coded versions
def show_results (y_true, y_pred):

    correct = np.sum(y_true == y_pred)
    total = len(y_true)
    error = 100.0 - (100 * float(correct) / float(total))
    print ("Error rate: %.1f%%" % (error))

    print classification_report(y_true, y_pred)
    print confusion_matrix(y_true, y_pred)
    
    confusions = np.zeros([3, 3], np.float32)
    bundled = zip(y_true, y_pred)
    for actual, predicted in bundled:
        confusions[actual, predicted] += 1

    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.grid(False)
    plt.xticks(np.arange(3))
    plt.yticks(np.arange(3))
    plt.imshow(confusions, cmap=plt.cm.RdGy, interpolation='nearest')

    for i, cas in enumerate(confusions):
        for j, count in enumerate(cas):
            if count >= 0:
                xoff = .07 * len(str(count))
                plt.text(j-xoff, i+.2, int(count), fontsize=9, color='blue')
    plt.show() 

In [None]:
# This code allows you to see the mislabelled examples
C = 5
y_test_oh = np.eye(C)[Y_test.reshape(-1)]
X_test_indices = sentences_to_indices(X_test, word_to_index, maxLen)
pred = NN_model.predict(X_test_indices)
for i in range(len(X_test)):
    x = X_test_indices
    num = np.argmax(pred[i])
    if(num != Y_test[i]):
        print('Expected label:', Y_test[i], ' prediction: ', num, X_test[i])

In [None]:
# Consolidate predictions
# First concatenate training and test sets, then merge the predictions with 
# the input data
pred_tr = pd.DataFrame(np.column_stack([X_train, Y_train, pred_lm_tr, pred_gv_tr,pred_nn_tr, tot_lm_tr, perc_lm_tr]), 
                               columns=['sentence', 'actual', 'pred_lm', 'pred_gv', 'pred_nn', 'tot_lm', 'perc_lm'])
pred_tr['trte'] = 'TR'
#display(pred_tr)
pred_te = pd.DataFrame(np.column_stack([X_test, Y_test, pred_lm_te, pred_gv_te,pred_nn_te, tot_lm_te, perc_lm_te]), 
                               columns=['sentence', 'actual', 'pred_lm', 'pred_gv', 'pred_nn', 'tot_lm', 'perc_lm'])
pred_te['trte'] = 'TE'
#display(pred_te)
predictions = pd.concat([pred_tr, pred_te])
predictions.index = range(len(predictions.index))
display(predictions)

In [None]:
other_cols = pd.DataFrame(np.column_stack([full_s, y, trfile, sent_cnt, sent_ind, word_cnt]), 
                               columns=['full_s', 'y', 'trfile', 'sent_cnt', 'sent_ind', 'word_cnt'])
other_cols.info()
other_cols.reset_index(drop=True)
display(other_cols)

In [None]:
final_df = pd.concat([other_cols, predictions], axis=1)
final_df.info()
display(final_df)

In [None]:
final_df.to_csv('final_df.csv',  float_format='%.3f', index=False)