## CNN-LSTM approach

In [0]:
from sklearn import metrics
from random import shuffle
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC 
from keras.models import load_model
import matplotlib.pyplot as plt
import seaborn as sns
import  numpy as np
from matplotlib import pyplot as plt
import spacy
import os
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [0]:
# Location of the tweet corpus; the loading cell opens this file as UTF-8 text.
path_data = 'tweet_data.txt'

In [0]:
# Parse the corpus: every non-empty line is "<label> <token> <token> ...".
labeled_tweets = []
with open(path_data, 'r', encoding='utf-8') as f:
    for line in f:
        # split() without an argument strips the trailing newline and collapses
        # runs of whitespace, so the last token is no longer stored as "word\n"
        # and repeated spaces do not yield empty-string tokens.
        fields = line.split()
        if not fields:
            # A blank line would otherwise become a record with a bogus label.
            continue
        labeled_tweets.append((fields[0], fields[1:]))
# Shuffle once so the contiguous slices taken when building folds are mixed.
# NOTE(review): no seed is set in the visible cells, so the order (and therefore
# fold membership) changes from run to run.
shuffle(labeled_tweets)
tweets = [tokens for _, tokens in labeled_tweets]
labels = [label for label, _ in labeled_tweets]

In [0]:
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import LabeledSentence
# Dimensionality of the Doc2Vec document vectors.
vector_size = 200

def train_model(data):
    """Build and train a Doc2Vec model on ``data`` and return it.

    ``data`` is handed to both ``build_vocab`` and ``train``, so it must be
    iterable more than once (the notebook passes a list of tagged documents).
    The model uses ``min_count=3``, ``window=2``, ``vector_size``-dimensional
    vectors and is trained for 15 epochs.
    """
    print('Training Doc2Vec model...')
    doc2vec = Doc2Vec(min_count=3, window=2, size=vector_size)
    doc2vec.build_vocab(data)
    doc2vec.train(data, total_examples=doc2vec.corpus_count, epochs=15)
    return doc2vec

In [0]:
print('Generating cross-folds...')
split_ratio = (0.8,0.1,0.1) #Train,Dev,Test
# Number of folds = 1 / held-out fraction. Round away float noise before
# truncating: e.g. 1/(1-0.95) evaluates to slightly below 20, which a bare
# int() would truncate to 19.
cross_folds = int(round((1-split_ratio[0])**(-1), 6))
print(f'Using {cross_folds} crossfolds...')
# Compare with a tolerance; exact equality on a sum of floats is fragile.
assert abs(sum(split_ratio) - 1) < 1e-9
cross_fold_items = []
print('Training classifiers...')

# Loop-invariant sizes. Note that the test slice is sized by split_ratio[1] and
# the dev slice by split_ratio[2], as in the original slicing code.
size = len(tweets)
test_len = int(size*split_ratio[1])
dev_len = int(size*split_ratio[2])

# NOTE(review): consecutive folds advance by test_len only, while each fold
# holds out test_len + dev_len items. When both lengths are equal, the dev
# slice of fold i is the same index range as the test slice of fold i+1, and
# the tail of the corpus is never held out. Confirm this is intended.
for i in range(cross_folds):
    k1 = i*test_len
    k2 = k1+test_len
    k3 = k2+dev_len

    # Training set is everything outside the [k1, k3) held-out window.
    trainD = list(enumerate(tweets[k3:] + tweets[:k1]))
    testD = list(enumerate(tweets[k1:k2]))
    devD = list(enumerate(tweets[k2:k3]))

    print('Generating training data...')
    model_train = train_model([LabeledSentence(tweet,[str(id_)]) for id_,tweet in trainD])
    # Read the document vectors through .docvecs: indexing the model itself with
    # a string returns the *word* vector when that string is also a vocabulary
    # token, and the numeric tags used here ("1", "2", ...) can collide with
    # number tokens in the tweets.
    # np.vstack needs a real sequence; passing a generator is deprecated.
    training_data = np.vstack([model_train.docvecs[str(id_)] for id_,_ in trainD])
    training_labels = np.array(labels[k3:] + labels[:k1])

    print('Generating test data...')
    # Unseen tweets are embedded by inference with the fold's trained model.
    model_test = [model_train.infer_vector(tweet) for id_,tweet in testD]
    test_data = np.vstack(model_test)
    test_labels = np.array(labels[k1:k2])

    print('Generating dev data...')
    model_dev = [model_train.infer_vector(tweet) for id_,tweet in devD]
    dev_data = np.vstack(model_dev)
    dev_labels = np.array(labels[k2:k3])

    # The three partitions must cover the corpus exactly once.
    assert len(test_data) + len(training_data) + len(dev_data) == size
    cross_fold_items.append((training_data,training_labels,test_data,test_labels,dev_data,dev_labels,model_train,model_test,model_dev))

Generating cross-folds...
Using 5 crossfolds...
Training classifiers...
Generating training data...
Training Doc2Vec model...
Generating test data...
Generating dev data...
Generating training data...
Training Doc2Vec model...
Generating test data...
Generating dev data...
Generating training data...
Training Doc2Vec model...
Generating test data...
Generating dev data...
Generating training data...
Training Doc2Vec model...
Generating test data...
Generating dev data...
Generating training data...
Training Doc2Vec model...
Generating test data...
Generating dev data...


In [0]:
# inspired by https://github.com/mihirahlawat/Sentiment-Analysis
# BB_twtr at SemEval-2017 Task 4: Twitter Sentiment Analysis with CNNs and LSTMs

from keras.layers import Embedding
from keras.models import Sequential, Model
from keras.layers import Dense, Activation
from keras.layers import Flatten, Conv1D, SpatialDropout1D, MaxPooling1D, AveragePooling1D, Bidirectional, merge, concatenate, Input, Dropout, LSTM

def model(x_dim,y_dim, num_filters=200, filter_sizes=(3,4,5), pool_padding='valid', dropout=0.1):
    """Build the (uncompiled) multi-branch CNN + BiLSTM classifier.

    Parameters
    ----------
    x_dim : int
        Length of each input vector; inputs have shape ``(x_dim, 1)``.
    y_dim : int
        Number of output classes (size of the final softmax layer).
    num_filters : int
        Number of filters in every convolution branch.
    filter_sizes : sequence of int
        Kernel size of each parallel ``Conv1D`` branch.
    pool_padding : str
        Despite the name, this is forwarded as the ``padding`` argument of the
        ``Conv1D`` layers.
    dropout : float
        Rate used by the ``Dropout`` layers and by the LSTM's input and
        recurrent dropout.

    Returns
    -------
    keras.models.Model
        Model mapping the ``(x_dim, 1)`` input to ``y_dim`` softmax scores.
    """
    # Use the x_dim argument for the input shape rather than the notebook-level
    # vector_size global, so the function honours its own parameter.
    embed_input = Input(shape=(x_dim,1))

    pooled_outputs = []
    for kernel_size in filter_sizes:
        conv = Conv1D(num_filters, kernel_size=kernel_size, padding=pool_padding, activation='relu')(embed_input)
        # With 'valid' padding the convolution output has x_dim-kernel_size+1
        # steps, so this pool window spans the whole sequence.
        conv = AveragePooling1D(pool_size=x_dim-kernel_size+1)(conv)
        pooled_outputs.append(conv)
    # Named `merged` to avoid shadowing the imported keras `merge`.
    merged = concatenate(pooled_outputs)

    x = Dense(30, activation='relu')(merged)
    x = Dropout(dropout)(x)
    x = Bidirectional(LSTM(100, return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(x)
    x = Dense(30, activation='relu')(x)
    x = Dropout(dropout)(x)
    x = Flatten()(x)
    x = Dense(y_dim, activation='softmax')(x)

    return Model(inputs=embed_input,outputs=x)

In [0]:
from keras import backend as K
import tensorflow as tf

def recall_m(y_true, y_pred):
    """Recall over the whole batch, computed with Keras backend ops.

    Counts the entries where the rounded, clipped product ``y_true * y_pred``
    is 1 and divides by the count of rounded, clipped ``y_true`` entries;
    ``K.epsilon()`` in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the whole batch, computed with Keras backend ops.

    Counts the entries where the rounded, clipped product ``y_true * y_pred``
    is 1 and divides by the count of rounded, clipped ``y_pred`` entries;
    ``K.epsilon()`` in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator so the result is defined when
    both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    numerator = p * r
    denominator = p + r + K.epsilon()
    return 2*(numerator/denominator)

def f1_loss(y_true, y_pred):
    """Soft F1 loss: ``1 - mean(F1)``.

    True positives, false positives and false negatives are accumulated along
    axis 0 from the raw (un-rounded) predictions, which keeps the expression
    differentiable. For the ``(batch, classes)`` one-hot targets used in this
    notebook that yields one F1 value per class; the loss is one minus their
    mean.
    """
    # The true-negative term computed by the original was never used in the
    # F1 formula, so it is not computed here.
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    # K.epsilon() guards each division against a zero denominator.
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    # Replace any NaN entries with 0 so they cannot propagate into the loss.
    f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)
  

In [0]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.preprocessing.text import one_hot
import numpy as np

from tensorflow.python.client import device_lib
# Ask the Keras TensorFlow backend for the available GPUs; the return value is discarded.
# NOTE(review): `_get_available_gpus` is a private helper — presumably called only to
# confirm/initialise GPU access; verify that it is still needed.
K.tensorflow_backend._get_available_gpus()
def _one_hot_sentiment(raw_labels):
  """Encode label strings as one-hot rows.

  Labels containing 'p' map to [1,0,0], labels containing 'neut' map to
  [0,1,0], and every other label maps to [0,0,1].
  """
  return np.array([[1,0,0] if 'p' in label else [0,1,0] if 'neut' in label else [0,0,1] for label in raw_labels])


def CNN_Model(training_data,training_labels,test_data,test_labels,dev_data,dev_labels,model_train,model_test,model_dev):
  """Train the CNN-LSTM classifier on one fold and evaluate it on the test split.

  Parameters
  ----------
  training_data, test_data, dev_data : np.ndarray
      Document vectors; each is reshaped to ``(-1, vector_size, 1)``.
  training_labels, test_labels, dev_labels : iterable of str
      Label strings, one-hot encoded by ``_one_hot_sentiment``.
  model_train, model_test, model_dev
      Accepted so a whole fold tuple can be passed through; not used here.

  Returns
  -------
  tuple
      ``(loss, acc, f1, prec, rec)`` as returned by ``mdl.evaluate`` on the
      test split.
  """
  training_labels = _one_hot_sentiment(training_labels)
  test_labels = _one_hot_sentiment(test_labels)
  dev_labels = _one_hot_sentiment(dev_labels)

  mdl = model(vector_size, y_dim=3,filter_sizes = [3,4,5], dropout=0.1)
  mdl.compile(loss=f1_loss,
              optimizer='adam',
              metrics=['acc',f1_m,precision_m, recall_m])

  batch_size = 32
  num_epochs = 50

  # Only checkpointing is active during training. The original also built
  # EarlyStopping and ReduceLROnPlateau objects but never passed them to fit();
  # those dead objects (one using the deprecated `epsilon` keyword) were
  # removed. Add them to `callbacks` below if that behaviour is wanted.
  # NOTE(review): the checkpoint file is overwritten on every call, and the best
  # checkpoint is never reloaded — evaluation below uses the last-epoch weights.
  mcp_save = ModelCheckpoint('saved_model.h5', verbose=0, monitor='val_loss',save_best_only=True, mode='min')

  # Conv1D expects a trailing channel axis.
  training_data = training_data.reshape(-1,vector_size,1)
  test_data = test_data.reshape(-1,vector_size,1)
  dev_data = dev_data.reshape(-1,vector_size,1)
  mdl.fit(training_data, training_labels, validation_data=(dev_data, dev_labels), batch_size=batch_size, epochs=num_epochs,
          callbacks=[mcp_save],verbose=1)
  loss, acc, f1, prec, rec = mdl.evaluate(test_data, test_labels)
  print("loss: {}, accuracy: {}, f1-score: {}, precision: {}, recall: {}".format(round(loss,4), round(acc,4), round(f1,4), round(prec,4), round(rec,4)))
  return loss, acc, f1, prec, rec

In [0]:
print('Classifiers...')
# Running sums of (loss, accuracy, f1, precision, recall) over the folds.
totals = [0, 0, 0, 0, 0]
for i in range(cross_folds):
    print('#'*35)
    # Each stored fold tuple matches CNN_Model's positional parameters.
    loss, acc, f1, prec, rec = CNN_Model(*cross_fold_items[i])
    totals = [running + score for running, score in zip(totals, (loss, acc, f1, prec, rec))]
print('='*10)
# Average every metric over the folds.
lossT, accT, f1T, precT, recT = [total/cross_folds for total in totals]
print("loss: {}, accuracy: {}, f1-score: {}, precision: {}, recall: {}".format(round(lossT,4), round(accT,4), round(f1T,4), round(precT,4), round(recT,4)))

Classifiers...
###################################





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 40267 samples, validate on 5033 samples
Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50

In [0]:
# Re-print the cross-fold averages, this time rounded to six decimals.
print('='*10)
summary_template = "loss: {}, accuracy: {}, f1-score: {}, precision: {}, recall: {}"
rounded_scores = [round(value,6) for value in (lossT, accT, f1T, precT, recT)]
print(summary_template.format(*rounded_scores))