In [7]:
import string
import re
import os
import nltk
import pandas as pd
import numpy as np
import json
import tensorflow as tf
from tensorflow import keras
SEED = 1013
np.random.seed(SEED)
#nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples 
from stance_utils import *
#from utils import *
#from parameters import *
from nltk.stem import PorterStemmer
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dropout,Concatenate,Dense, Embedding, LSTM, SpatialDropout1D, Flatten, GRU, Bidirectional, Conv1D, Input,MaxPooling1D
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model
from sklearn.model_selection import StratifiedKFold
stemmer = PorterStemmer()
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
stopwords_english = stopwords.words('english')
from sklearn.preprocessing import LabelEncoder

In [8]:
classes = {'FAVOR': np.array([1, 0, 0]), 'NONE': np.array([0, 1, 0]), 'AGAINST': np.array([0, 0, 1])}
classes_ = np.array(['FAVOR', 'NONE', 'AGAINST'])

In [9]:
train_data_file = '/data/parush/stance_mohammed/train.txt'
test_data_file = '/data/parush/stance_mohammed/test.txt'
TARGETS = [ 'Atheism','Climate Change is a Real Concern', 'Feminist Movement','Hillary Clinton', 'Legalization of Abortion' ]


# train_data_file = '/data/parush/SomasundaranWiebe-politicalDebates/train.txt'
# test_data_file = '/data/parush/SomasundaranWiebe-politicalDebates/test.txt'
# TARGETS = ['god','healthcare','guns','gayRights','abortion', 'creation']


# train_data_file = '/data/parush/Data_MPCHI/train.txt'
# test_data_file = '/data/parush/Data_MPCHI/test.txt'
# TARGETS = ['Are E-Cigarettes safe?','Does MMR Vaccine lead to autism in children?',
#       'Does Sunlight exposure lead to skin cancer?','Does Vitamin C prevent common cold?',
#       'Should women take HRT post-menopause?']


In [10]:
def train_and_test():
    
    sentence_maxlen = 0
    x_train = []
    y_train = []

    
    with open(train_data_file, 'r') as trainfile:
        for line in trainfile:
            
            line = line.replace('#SemST', '').strip()
            line = line.split('\t')
            
            #if line[0].strip() != 'ID' and line[1].strip() == t:
            if line[0].strip() != 'ID':
                tweet = line[2]
                tweet = process_tweet(tweet)
                if len(tweet) > sentence_maxlen:
                    sentence_maxlen = len(tweet)
                x_train.append(tweet)
                y_train.append(classes[line[3].strip()])
                               
    
    x_test = []
    y_test = []
    with open(test_data_file, 'r') as testfile:
        for line in testfile:
            line = line.replace('#SemST', '').strip()
            line = line.split('\t')
        

            #if line[0] != 'ID' and line[1] == t:
            if line[0] != 'ID':
                tweet = line[2]
                tweet = process_tweet(tweet)
                if len(tweet) > sentence_maxlen:
                    sentence_maxlen = len(tweet)
                x_test.append(tweet)
                y_test.append(classes[line[3].strip()])


    
    return x_train, y_train, x_test, y_test, sentence_maxlen

In [11]:
x_train, y_train, x_test, y_test, sentence_maxlen = train_and_test()

In [12]:
vocabulary = build_vocab(x_train)
vocab_size = len(vocabulary)
print("Total words in vocab are",vocab_size)

Total words in vocab are 6497


In [13]:

embeddings_weights = get_embeddings('wikipedia',100,vocabulary)



In [14]:
for i in range(len(x_train)):
    tweet_tensor = tweet_to_tensor(x_train[i], vocabulary)
    if len(tweet_tensor) < sentence_maxlen:
        diff = sentence_maxlen - len(tweet_tensor)
        n_pad = [0]*diff
        tweet_tensor = tweet_tensor + n_pad
    x_train[i] = tweet_tensor
for i in range(len(x_test)):
    tweet_tensor = tweet_to_tensor(x_test[i], vocabulary)
    if len(tweet_tensor) < sentence_maxlen:
        diff = sentence_maxlen - len(tweet_tensor)
        n_pad = [0]*diff
        tweet_tensor = tweet_tensor + n_pad
    x_test[i] = tweet_tensor

In [15]:
x_train = np.array(x_train)

x_test = np.array(x_test)

y_train = np.asarray(y_train).astype('float32')
y_test = np.asarray(y_test).astype('float32')

2914

In [18]:
num_classes = 3
def biLSTM():
    model = Sequential()
    model.add(Embedding(embeddings_weights.shape[0], embeddings_weights.shape[1], weights=[embeddings_weights]))
    model.add(Dropout(0.2))
    model.add(LSTM(64,return_sequences=True,dropout=0.3))
    model.add(Bidirectional(LSTM(64,dropout=0.3)))
    #model.add(Flatten())
    #add a dropout here
    model.add(Dropout(0.5))
    model.add(Dense(num_classes,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model

def biLSTMCNN():
    inputs = Input(shape=(sentence_maxlen,))
    embedded_inputs = Embedding(embeddings_weights.shape[0], embeddings_weights.shape[1], weights=[embeddings_weights])(inputs)
    embedded_inputs = Dropout(0.2)(embedded_inputs)
    lstm = Bidirectional(LSTM(64,return_sequences=True,dropout=0.3))(embedded_inputs)
    convs = []
    for each_filter_size in [3,4,5]:
        #print(rnn.shape)
        each_conv = Conv1D(100, each_filter_size, activation='relu')(lstm)
        each_conv = MaxPooling1D(sentence_maxlen-each_filter_size+1)(each_conv)
        each_conv = Flatten()(each_conv)
        #print(each_conv.shape)
        convs.append(each_conv)
        
    output = Concatenate()(convs)
    output = Dropout(0.5)(output)
    output = (Dense(3,activation='softmax'))(output)
    model = Model(inputs=inputs, outputs=output)
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) 
    return model

def biGRU():
    model = Sequential()
    model.add(Embedding(embeddings_weights.shape[0], embeddings_weights.shape[1], weights=[embeddings_weights]))
    model.add(Dropout(0.2))
    model.add(Bidirectional(GRU(64,return_sequences=True,dropout=0.3)))
    model.add(Bidirectional(GRU(64,dropout=0.3)))
    model.add(Dropout(0.5))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model

def biGRUCNN():
    inputs = Input(shape=(sentence_maxlen,))
    embedded_inputs = Embedding(embeddings_weights.shape[0], embeddings_weights.shape[1], weights=[embeddings_weights])(inputs)
    embedded_inputs = Dropout(0.2)(embedded_inputs)
    rnn = Bidirectional(GRU(64,return_sequences=True,dropout=0.3))(embedded_inputs)
    convs = []
    for each_filter_size in [3,4,5]:
        #print(rnn.shape)
        each_conv = Conv1D(100, each_filter_size, activation='relu')(rnn)
        each_conv = MaxPooling1D(sentence_maxlen-each_filter_size+1)(each_conv)
        each_conv = Flatten()(each_conv)
        #print(each_conv.shape)
        convs.append(each_conv)
        
    output = Concatenate()(convs)
    output = Dropout(0.5)(output)
    output = (Dense(3,activation='softmax'))(output)
    model = Model(inputs=inputs, outputs=output)
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])    
    
    return model

In [19]:
model = biLSTMCNN()
print(model.summary())


2021-08-30 18:13:49.349371: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /apps/cuda/cuda-11.2/lib64:/opt/rh/devtoolset-8/root/usr/lib64:/opt/rh/devtoolset-8/root/usr/lib:/opt/rh/devtoolset-8/root/usr/lib64/dyninst:/opt/rh/devtoolset-8/root/usr/lib/dyninst:/opt/rh/devtoolset-8/root/usr/lib64:/opt/rh/devtoolset-8/root/usr/lib:/usr/include/slurm/
2021-08-30 18:13:49.349421: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-08-30 18:13:49.349459: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (gaivi.cse.usf.edu): /proc/driver/nvidia/version does not exist
2021-08-30 18:13:49.349760: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 100)      649700      input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 20, 100)      0           embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 20, 128)      84480       dropout[0][0]                    
______________________________________________________________________________________________

In [20]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cvscores = []

for train, val in kfold.split(x_train, classes_[y_train.argmax(1)]):
    model.fit(x_train[train], y_train[train], epochs = 50, batch_size = 16, verbose=1)
    scores = model.evaluate(x_train[val], y_train[val], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

y_pred = np.round(model.predict(x_test))

print("For: ",classification_report(y_test, y_pred, digits=4))


2021-08-30 18:13:55.797218: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-08-30 18:13:55.797852: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2399910000 Hz


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
accuracy: 58.49%
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch

Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
accuracy: 99.66%
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50


Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
accuracy: 100.00%
90.84% (+/- 16.23%)
For:                precision    recall  f1-score   support

           0     0.4573    0.5461    0.4978       304
           1     0.3653    0.4304    0.3952       230
           2     0.7378    0.6336    0.6817       715

   micro avg     0.5753    0.5749    0.5751      1249
   macro avg     0.5201    0.5367    0.5249      1249
weighted avg     0.6009    0.5749    0.5842      1249
 samples avg     0.5749    0.5749    0.5749      1249



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
For:                precision    recall  f1-score   support

           0     0.4413    0.5197    0.4773       304
           1     0.3815    0.4130    0.3967       230
           2     0.7314    0.6476    0.6869       715

   micro avg     0.5774    0.5733    0.5753      1249
   macro avg     0.5181    0.5268    0.5203      1249
weighted avg     0.5964    0.5733    0.5825      1249
 samples avg     0.5733    0.5733    0.5733      1249