In [1]:
import re
def tokenize(sentence, remove_vowels=False, remove_repeats=False, min_len=2):
    tokens = []
    for token in re.findall("[a-zA-Z]+",sentence.lower()):

        if len(token) >= min_len:
            if remove_vowels:
                token = delete_vowels(token)
            if remove_repeats:
                token = delete_repeats(token)
            tokens.append(token)
    return tokens

VOWELS = ['a', 'e', 'i', 'o', 'u']

def delete_repeats(string):
    return re.sub(r'(.)\1+', r'\1\1', string)     

def delete_vowels(string):
    return ''.join([l for l in string.lower() if l not in VOWELS])

if __name__ == '__main__':
    pass

def normalize_matrix(matrix):
    pass


In [2]:
import numpy as np
import h5py
import pickle
from copy import deepcopy
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.preprocessing import sequence
from keras import backend as K
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM, GRU, Bidirectional
from keras.layers.convolutional import Convolution1D, MaxPooling1D

from keras.utils import np_utils
from keras.utils import pad_sequences

In [3]:
#Filenames
Masterdir = 'C:/Users/aman porwal/Desktop/Sentiment-Analysis-BidirectionalLSTM/'
Datadir = 'Data/'
inputdatasetfilename = 'tweetsData.txt'

In [4]:
#Data I/O formatting
SEPERATOR = '\t'
DATA_COLUMN = 1
LABEL_COLUMN = 3
LABELS = ['0','1','2'] # 0 -> Negative, 1-> Neutral, 2-> Positive
mapping_char2num = {}
mapping_num2char = {}
MAXLEN = 200

#LSTM Model Parameters
#Embedding
MAX_FEATURES = 0
embedding_size = 128

# Convolution
filter_length = 3
nb_filter = 128
pool_length = 3

# LSTM
lstm_output_size = 128

# Training
batch_size = 128
number_of_epochs = 1
numclasses = 3
test_size = 0.2

In [5]:
def parseData(Masterdir,filename,seperator,datacol,labelcol,labels):
    f=open(Masterdir+Datadir+filename,'r', encoding="utf-8")
    lines = f.read().lower()
    lines = lines.lower().split('\n')[:-1]

    X_train = []
    Y_train = []
    
    for line in lines:
        line = line.split(seperator)
        tokenized_lines = tokenize(line[datacol])
        
        char_list = []
        for words in tokenized_lines:
            for char in words:
                char_list.append(char)
            char_list.append(' ')
        X_train.append(char_list)
        
        if line[labelcol] == labels[0]:
            Y_train.append(0)
        if line[labelcol] == labels[1]:
            Y_train.append(1)
        if line[labelcol] == labels[2]:
            Y_train.append(2)

    Y_train = np.asarray(Y_train)
    assert(len(X_train) == Y_train.shape[0])

    return [X_train,Y_train]

In [6]:
def convert_chartonum(mapping_n2c,mapping_c2n,trainwords,maxlen):
    allchars = []
    errors = 0

    for line in trainwords:
        try:
            allchars = set(allchars+line)
            allchars = list(allchars)
        except:
            errors += 1

    charno = 0
    for char in allchars:
        mapping_char2num[char] = charno
        mapping_num2char[charno] = char
        charno += 1

    assert(len(allchars)==charno) #Checks

    X_train = []
    for line in trainwords:
        char_list=[]
        for letter in line:
            char_list.append(mapping_char2num[letter])
        X_train.append(char_list)
    print(mapping_char2num)
    print(mapping_num2char)

    X_train = pad_sequences(X_train[:], maxlen=maxlen)
    return [X_train,mapping_num2char,mapping_char2num,charno]

In [7]:
def BidirectionalLSTM(X_train,y_train,args):
    max_features = args[0]
    maxlen = args[1]
    embedding_size = args[2]
    
    # Convolution hyperparameters
    filter_length = args[3]
    nb_filter = args[4]
    pool_length = args[5]
    
    # LSTM hyperparameters
    lstm_output_size = args[6]
    
    # Training hyperparameters
    batch_size = args[7]
    nb_epoch = args[8]
    numclasses = args[9]
    test_size = args[10] 

    y_train = np_utils.to_categorical(y_train, numclasses) 
    
    #Train & Validation data splitting
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=test_size, random_state=42)
    
    #Build the sequential model
    # Model Architecture is:
    # Input -> Embedding -> Conv1D+Maxpool1D -> BidirectionalLSTM -> FC-1 -> Softmaxloss
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, embedding_size, input_length=maxlen))
    model.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='valid', activation='relu', strides=1))
    model.add(MaxPooling1D(pool_size=pool_length))
    model.add(Bidirectional(LSTM(lstm_output_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True), merge_mode='concat'))
    model.add(Bidirectional(LSTM(lstm_output_size, dropout=0.2, recurrent_dropout=0.2)))
    model.add(Dense(numclasses))
    model.add(Activation('softmax'))

    # Optimizer is Adamax along with categorical crossentropy loss
    model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])

    print('Train...')
    model.fit(X_train, y_train, batch_size=batch_size, shuffle=True, epochs=nb_epoch, validation_data=(X_valid, y_valid))
    return model

In [8]:
def evaluate_model(X_test,y_test,model,batch_size,numclasses):
    #Convert y_test to one-hot encoding
    y_test = np_utils.to_categorical(y_test, numclasses)
    #Evaluate the accuracies
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)

In [None]:
if __name__ == '__main__':
    out = parseData(Masterdir,inputdatasetfilename,SEPERATOR,DATA_COLUMN,LABEL_COLUMN,LABELS)
    X_train = out[0]
    y_train = out[1]

    #Creating character dictionaries and format conversion in progess...
    out = convert_chartonum(mapping_num2char,mapping_char2num,X_train,MAXLEN)
    mapping_num2char = out[1]
    mapping_char2num = out[2]
    MAX_FEATURES = out[3]
    X_train = np.asarray(out[0])
    y_train = np.asarray(y_train).flatten()
    
    #Splitting data into train and test...
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    
    #Creating LSTM Network...
    model = BidirectionalLSTM(deepcopy(X_train),deepcopy(y_train),[MAX_FEATURES, MAXLEN, embedding_size,\
                 filter_length, nb_filter, pool_length, lstm_output_size, batch_size, \
                 number_of_epochs, numclasses, test_size])

    #Evaluating model...
    evaluate_model(X_test,deepcopy(y_test),model,batch_size,numclasses)

{'g': 0, 'm': 1, 'b': 2, 'n': 3, 'k': 4, 'd': 5, 'r': 6, 'e': 7, 'z': 8, 'u': 9, 'l': 10, 'y': 11, ' ': 12, 'o': 13, 'f': 14, 'h': 15, 'q': 16, 'p': 17, 'a': 18, 'j': 19, 's': 20, 'w': 21, 't': 22, 'i': 23, 'x': 24, 'v': 25, 'c': 26}
{0: 'g', 1: 'm', 2: 'b', 3: 'n', 4: 'k', 5: 'd', 6: 'r', 7: 'e', 8: 'z', 9: 'u', 10: 'l', 11: 'y', 12: ' ', 13: 'o', 14: 'f', 15: 'h', 16: 'q', 17: 'p', 18: 'a', 19: 'j', 20: 's', 21: 'w', 22: 't', 23: 'i', 24: 'x', 25: 'v', 26: 'c'}
X_train shape: (4000, 200)
X_test shape: (1000, 200)
Build model...
Train...