In [80]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [81]:
!ls '/content/drive/Team Drives/ANLY-521 Final'

 business_reviews2017.tsv   FinalProjectDescription.pdf		    model
 data			    Glove_Global_Vectors		    script
'DL(Glove).ipynb'	   'Literature Review - Brainstorm .gdoc'
'Final Deliverable'	   'Literature Review.gdoc'


In [82]:
pip install paramiko



In [83]:
import argparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
import re
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.preprocessing import text
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Conv1D, MaxPooling1D, Bidirectional
from sklearn.metrics import accuracy_score, f1_score
# from keras.models import load_model


class LoadData:
    '''
    Load, classify and split data
    '''
    def __init__(self, data_file, out_path, verbose=True):
        self.data = pd.read_csv(data_file, sep = '\t', index_col = 0)
        # assign review samples to two classes using [0,4) and [4, 5] criteria
        self.data['class'] = (self.data['stars'] >= 4).astype(int)
        self.data = self.data[['text', 'class']]
        self.data['text'] = self.data['text'].apply(CleanText)
        np.random.seed(1)
        self.train, self.test = train_test_split(self.data, train_size=0.7)
        # optional file saving
        if verbose:
            self.data.to_csv(out_path + '.tsv', sep='\t', index=False)
            self.train.to_csv(out_path+'_train.tsv', sep='\t', index=False)
            self.test.to_csv(out_path+'_test.tsv', sep='\t', index=False)


def CleanText(string):
    '''
    String cleaning
    :param string:
    :return: Cleaned review text
    '''
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r" \'s", "\'s", string)
    string = re.sub(r" \'ve", "\'ve", string)
    string = re.sub(r" n\'t", "n\'t", string)
    string = re.sub(r" \'re", "\'re", string)
    string = re.sub(r" \'d", "\'d", string)
    string = re.sub(r" \'ll", "\'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r" \(", "", string)
    string = re.sub(r" \)", "", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\n", " ", string)
    return string


def Padding(data, max_len = 50):
    '''
    Padding vector to specified length
    :param data: Text input
    :param max_len: Padding length
    :return: Padded vector
    '''
    return pad_sequences(data, padding='post', truncating='post', maxlen = max_len)


def WordEmbedding(X, y, embed_path='model/GoogleNews-vectors-negative300.bin',
                  max_features = 3000, w2v_size = 300, max_len = 50):
    '''
    Create word embedding
    :param X: train data input
    :param y: train data label
    :param embed_path: Path to pre-trained word2vec model
    :param max_features: Maximum number of features
    :param w2v_size: Word2vec size
    :param max_len: text padding length
    :return: Processed training data input and label, tokenizer, word embedding
    '''
    # tokenization & vectorizization
    tk = text.Tokenizer(num_words=max_features, filters='"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n',
                        split=" ")
    tk.fit_on_texts(X)
    word_index = tk.word_index
    # padding
    X_train = Padding(tk.texts_to_sequences(X), max_len)
    y_train = y
    # load google news pre-trained model
    w2v_model = KeyedVectors.load_word2vec_format(embed_path, binary=True)
    # Create word embedding vector matrix using pre-trained model
    w2v_matrix = np.zeros((len(word_index) + 1, w2v_size))
    for word,i in word_index.items():
        if word in w2v_model.vocab:
            w2v_matrix[i] = w2v_model[word]
    w2v_emb = Embedding(len(word_index)+1, w2v_size, weights=[w2v_matrix],
                            input_length=max_len)
    return X_train, y_train, tk, w2v_emb


def WordEmbedding_1(X, y, max_features = 3000, w2v_size = 300, max_len = 50):
    '''
    This one is for GLOVE Embedding from Stanford
    '''
    # tokenization
    tk = text.Tokenizer(num_words=max_features, filters='"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n', 
                        split=" ")
    tk.fit_on_texts(X)
    word_index = tk.word_index
    # padding
    X_train = Padding(tk.texts_to_sequences(X), max_len)
    y_train = y                
    
    # prepare the embedding layer by using GLOVE
    GLOVE_dir = '/content/drive/Team Drives/ANLY-521 Final/Glove_Global_Vectors'

    embedding_ind = {}
    file = open(os.path.join(GLOVE_dir, 'glove.6B.50d.txt'))
    for line in file:      
      values = line.split()
      word = values[0]
      coefs = np.asarray(values[1:], dtype='float32')
      embedding_ind[word] = coefs
    file.close()

    # compute embedding matrix
    embedding_dim = 50 # size of each word vector

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embedding_ind.get(word)
        if embedding_vector is not None:
          # words not found treated as all zeros
          embedding_matrix[i] = embedding_vector
    
    # define embedding layer
    embedding_layer = Embedding(len(word_index) + 1, 
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_len,
                                trainable=False)
    
    return X_train, y_train, tk, embedding_layer  
  
  
def base_LSTM(X, y, w2v_emb, output_size=100, dropout=0.2,
              loss='binary_crossentropy', optimizer='adam',
              batch_size=128, nb_epoch=10, validation_split=0.2,
              shuffle=True):
    '''
    LSTM Model
    :param X: Train data input
    :param y: Train data label
    :param w2v_emb: Word embedding
    :param output_size: Output size
    :param dropout: Dropout ratio
    :param loss: Loss function
    :param optimizer: Optimizer
    :param batch_size: Batch size
    :param nb_epoch: Number of epoch
    :param validation_split: Training / Validation split
    :param shuffle: Shuffle training data before each epoch
    :return: LSTM model
    '''
    model = Sequential()
    model.add(w2v_emb)
    model.add(LSTM(output_size))
    model.add(Dropout(dropout))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.layers[1].trainable = False
    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(X, y, batch_size=batch_size,
              epochs=nb_epoch, validation_split=validation_split,
              shuffle=shuffle)

    return model


def CNNLSTM(X, y, w2v_emb, output_size=100, dropout=0.2,
            loss='binary_crossentropy', optimizer='adam',
            batch_size=128, nb_epoch=10, validation_split=0.2,
            shuffle=True):
    '''
    CNN + LSTM Model
    :param X: Train data input
    :param y: Train data label
    :param w2v_emb: Word embedding
    :param output_size: Output size
    :param dropout: Dropout ratio
    :param loss: Loss function
    :param optimizer: Optimizer
    :param batch_size: Batch size
    :param nb_epoch: Number of epoch
    :param validation_split: Training / Validation split
    :param shuffle: Shuffle training data before each epoch
    :return: CNN + LSTM model
    '''
    model = Sequential()
    model.add(w2v_emb)
    model.add(Conv1D(64, 5, activation='relu'))
    model.add(Dropout(dropout))
    model.add(MaxPooling1D(pool_size=4))
    model.add(LSTM(output_size))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.layers[1].trainable = False
    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(X, y, batch_size=batch_size,
              epochs=nb_epoch, validation_split=validation_split,
              shuffle=shuffle)

    return model


def BiLSTM(X, y, w2v_emb, output_size=100, dropout=0.2,
           loss='binary_crossentropy', optimizer='adam',
           batch_size=128, nb_epoch=10, validation_split=0.2,
           shuffle=True):
    '''
    CNN + Bidirectional LSTM Model
    :param X: Train data input
    :param y: Train data label
    :param w2v_emb: Word embedding
    :param output_size: Output size
    :param dropout: Dropout ratio
    :param loss: Loss function
    :param optimizer: Optimizer
    :param batch_size: Batch size
    :param nb_epoch: Number of epoch
    :param validation_split: Training / Validation split
    :param shuffle: Shuffle training data before each epoch
    :return: CNN + Bidirectional model
    '''
    model = Sequential()
    model.add(w2v_emb)
    model.add(Conv1D(64, 5, activation='relu'))
    model.add(Dropout(dropout))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Bidirectional(LSTM(output_size)))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.layers[1].trainable = False
    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(X, y, batch_size=batch_size,
              epochs=nb_epoch, validation_split=validation_split,
              shuffle=shuffle)

    return model


def BiLSTM_1(X, y, embedding_layer, max_len = 100, output_size = 50, dropout = 0.1, 
            loss = 'binary_crossentropy', optimizer = 'adam', 
            batch_size = 128, nb_epoch = 10, validation_split = 0.2, 
            shuffle = True):
    '''
    This one is for using GLOVE embedding from Stanford
    '''  
    inp = Input(shape=(max_len,))
    x = embedded_sequences = embedding_layer(inp)
    x = Bidirectional(LSTM(output_size, return_sequences=True, dropout=dropout, recurrent_dropout=0.1))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
   
    # fit model
    model.fit(X, y, batch_size=batch_size, epochs=nb_epoch, 
              validation_split=validation_split, 
              shuffle=shuffle)
  
    return model
      
  
def Eval(X, y, name, model, verbose = True,
         output_path='model/'):
    '''
    Model evaluation
    :param X: Test data input
    :param y: Test data label
    :param model: Model
    :return: Accuracy and F1 scores
    '''
    print(f'{name} Model')
    X_test = X
    y_test = y
    y_pred = model.predict_classes(X_test)
    print(f'Test Accuracy:{accuracy_score(y_test, y_pred)}')
    print(f'Test F1:{f1_score(y_test, y_pred)}')
    if verbose:
        model.save(f'{output_path}{name}.h5')
        

def Eval_1(X, y, name, model, verbose = True,
         output_path='model/'):
    '''
    Model evaluation for non-sequential
    :param X: Test data input
    :param y: Test data label
    :param model: Model
    :return: Accuracy and F1 scores
    '''
    print(f'{name} Model')
    X_test = X
    y_test = y
    test_acc = model.evaluate(X_test, y_test, verbose=0)
    print(f'Test Accuracy:{test_acc}')
    if verbose:
        model.save(f'{output_path}{name}.h5')
  

def main():

    # Load, classify and split data
    data_file = '/content/drive/Team Drives/ANLY-521 Final/data/business_reviews2017.tsv'
    out_path = '/content/drive/Team Drives/ANLY-521 Final/data/business_reviews'    
    DF = LoadData(data_file, out_path, verbose=False)

    # visualize the distribution of each class
    #ax = DF.data['class'].value_counts().plot(kind='bar',figsize=(14,8),
    #            title="Number for Each Class (1 = high star, 0 = low star)")
    #ax.set_xlabel("Class")
    #ax.set_ylabel("Count")
    #plt.show()

    # data pre-processing
    #X_train, y_train, tk, w2v_emb = WordEmbedding(DF.train['text'], DF.train['class'], max_len = 100)
    X_train_1, y_train_1, tk_1, embedding_layer = WordEmbedding_1(DF.train['text'], DF.train['class'], max_len = 100) 

    # model training
    LSTM_model = base_LSTM(X_train_1, y_train_1, embedding_layer, output_size = 64)
    CNNLSTM_model = CNNLSTM(X_train_1, y_train_1, embedding_layer, output_size = 64, nb_epoch = 3)
    biLSTM_model = BiLSTM(X_train_1, y_train_1, embedding_layer, output_size = 64, nb_epoch = 3)
    biLSTM_model_1 = BiLSTM_1(X_train_1, y_train_1, embedding_layer, max_len = 100, output_size = 64, nb_epoch = 3)
    
    # model performance on test data
    X_test = Padding(tk.texts_to_sequences(DF.test['text']), max_len = 100)
    Eval(X_test, DF.test['class'], 'LSTM', LSTM_model, verbose=False)
    Eval(X_test, DF.test['class'], 'CNN + LSTM', CNNLSTM_model, verbose=False)
    Eval(X_test, DF.test['class'], 'CNN + Bidirectional LSTM', biLSTM_model, verbose=False)
    Eval_1(X_test, DF.test['class'], 'CNN + Bidirectional LSTM with GLOVE', biLSTM_model_1, verbose=False)
    
 

 #if __name__ == "__main__":

#    parser = argparse.ArgumentParser()
#    parser.add_argument("--data_file", type=str,
#                        default="data/business_reviews2017.tsv",
#                        help="2017 Yelp Business Reviews tsv file")
#    parser.add_argument("--out_path", type=str,
#                        default="data/business_reviews",
#                        help="Dir to write train/test data")

#    args = parser.parse_args()

#    main(args.data_file, args.out_path)
main()



Train on 41341 samples, validate on 10336 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41341 samples, validate on 10336 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 41341 samples, validate on 10336 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 41341 samples, validate on 10336 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
LSTM Model
Test Accuracy:0.5430738667148275
Test F1:0.35069934556653404
CNN + LSTM Model
Test Accuracy:0.6010926494491602
Test F1:0.4679954236165472
CNN + Bidirectional LSTM Model
Test Accuracy:0.5894437420986094
Test F1:0.3414210183240386
CNN + Bidirectional LSTM with GLOVE Model
Test Accuracy:[0.6485450126557697, 0.6250225754018421]
