In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!cp /content/drive/My\ Drive/Colab\ Notebooks/3-class.csv 3-class.csv

In [0]:
!cp /content/drive/My\ Drive/Colab\ Notebooks/list.txt list.txt

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('3-class.csv')
df.head()

Unnamed: 0,Tweet,Polarity
0,ابراهيم_عيسى الوسخ ابن الوسخه كلما حصل حادث ا...,neg
1,اخطر حروب الارض حرب العقيده حسيبك الله ي اول ...,neg
2,اصبحت تقدم برامج عبر الجمعيات الخيريه لايصال ...,neg
3,اعلامنا متمثل في داوودالشريان و روتانا وطقتهم...,neg
4,الاصرار مرتزقه_برنامج_الاصرار بضاعه هالمترديه...,neg


In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56674 entries, 0 to 56673
Data columns (total 2 columns):
Tweet       56674 non-null object
Polarity    56674 non-null object
dtypes: object(2)
memory usage: 885.7+ KB


In [0]:
df.Polarity.value_counts()

neg     20731
neut    18726
pos     17217
Name: Polarity, dtype: int64

In [0]:
df['Polarity'] = df['Polarity'].map({'neg': 0, 'pos': 1, 'neut': 2})

In [6]:
df[df.Polarity == 0].head(10)
# df[df.Polarity == 1].head(10)
# df[df.Polarity == 2].head(10)

Unnamed: 0,Tweet,Polarity
0,ابراهيم_عيسى الوسخ ابن الوسخه كلما حصل حادث ا...,0
1,اخطر حروب الارض حرب العقيده حسيبك الله ي اول ...,0
2,اصبحت تقدم برامج عبر الجمعيات الخيريه لايصال ...,0
3,اعلامنا متمثل في داوودالشريان و روتانا وطقتهم...,0
4,الاصرار مرتزقه_برنامج_الاصرار بضاعه هالمترديه...,0
5,الاعلام اللبناني يهاجم السعوديه منذ مده بكل ق...,0
6,البرنامج استاجر بعض المشاهير و الهوامير في تو...,0
7,الحمد لله ما احتاج اتعلم من واحد فاشل اخلاقيا...,0
8,الخرج بيض الله وجه محافظ الخرج فهذه القناه تص...,0
9,الرياض السعوديه رسالتي لوزير العمل في حينه عن...,0


###Data Preparartion

In [0]:
df['pre_clean_len'] = [len(t) for t in df.Tweet]

In [8]:
from pprint import pprint
data_dict = {
    'polarity': {
        'type': df.Polarity.dtype,
        'description': 'sentiment class - 0: negative, 1: positive, 2: neutral'
    },
    'tweet': {
        'type': df.Tweet.dtype,
        'description': 'tweet text'
    },
    'pre_clean_len': {
        'type': df.pre_clean_len.dtype,
        'description': 'length of the tweet before cleaning'
    },
    'dataset.shape': df.shape
}

pprint(data_dict)

{'dataset.shape': (56674, 3),
 'polarity': {'description': 'sentiment class - 0: negative, 1: positive, 2: '
                             'neutral',
              'type': dtype('int64')},
 'pre_clean_len': {'description': 'length of the tweet before cleaning',
                   'type': dtype('int64')},
 'tweet': {'description': 'tweet text', 'type': dtype('O')}}


* In some tweets the HTML encodings are not converted to 
text, so we'll convert the HTML decoding to general text. We can use beautiful soup for decoding the HTML encodings.

* We also have to remove the '@' charcter mentions as it is not relevant to us.

* We'll also remove the URL links as they contain little significance for our task.

* We have to remove hashtag and numbers from the tweets too.

In [0]:
# Data cleaning function definition

from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

In [0]:
import re
from bs4 import BeautifulSoup

pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
www_pat = r'www.[^ ]+'
arabic_num_pat = '[٠١٢٣٤٥٦٧٨٩]'
eng_num_pat = '[0123456789]'
sharta_pat = '[_]'
eng_pat = '[A-Za-z]'

def tweet_cleaner(text):
  soup = BeautifulSoup(text, 'lxml')
  souped = soup.get_text()
  try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
  except:
        bom_removed = souped
  stripped = re.sub(combined_pat, '', bom_removed)
  stripped = re.sub(www_pat, '', stripped)
  stripped = re.sub(arabic_num_pat, '', stripped)
  stripped = re.sub(eng_num_pat, '', stripped)
  stripped = re.sub(sharta_pat, ' ', stripped)
  stripped = re.sub(eng_pat, ' ', stripped)
  words = [x for x in tok.tokenize(stripped) if len(x) > 1]
  return (" ".join(words)).strip()

In [11]:
df_copy = df
clean_tweet_texts = []
for i in range(0, len(df)):
  if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed"%(i+1,len(df)))                                                                    
  clean_tweet_texts.append(tweet_cleaner(df_copy['Tweet'][i]))

Tweets 10000 of 56674 has been processed
Tweets 20000 of 56674 has been processed
Tweets 30000 of 56674 has been processed
Tweets 40000 of 56674 has been processed
Tweets 50000 of 56674 has been processed


In [0]:
clean_tweet_texts[:10]

['ابراهيم عيسى الوسخ ابن الوسخه كلما حصل حادث اتهم السعوديه بالارهاب الكلب كان براتب مليون جنيه من سنوي مصر',
 'اخطر حروب الارض حرب العقيده حسيبك الله اول ال راه يم رب عل نا وق ده الع يده وات زا',
 'اصبحت تقدم برامج عبر الجمعيات الخيريه لايصال خبثها اين مسؤولي الجمعيه من هذا نطالب خادم الحرمين بايقاف',
 'اعلامنا متمثل في داوودالشريان روتانا وطقتهم كيف ترجي من هالاشكال خير همهم الوحيد في الحياه قياده المراه للسياره',
 'الاصرار مرتزقه برنامج الاصرار بضاعه هالمترديه قناه العهر مزجاه في جميع المجالات',
 'الاعلام اللبناني يهاجم السعوديه منذ مده بكل قبيح ومجموعه تدعم الاعلام في لبنان باقامه برامجها الضخمه فيها',
 'البرنامج استاجر بعض المشاهير الهوامير في تويتر عشان يبررو لهم ويرقعو لقناه برنامج اصرار',
 'الحمد لله ما احتاج اتعلم من واحد فاشل اخلاقيا همه الشحاذه من على حساب مواطن يفضح برنامج الاصرار برعايه',
 'الخرج بيض الله وجه محافظ الخرج فهذه القناه تصب على المسلمين سيل من المخالفات الشرعيه والمحرمات تدعو الى الرذيله وتشوه صوره الاسلام',
 'الرياض السعوديه رسالتي لوزير العمل في حينه عن برنا

In [0]:
len(clean_tweet_texts)

56674

In [0]:
#normalization
def normalizeArabic(text):
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "و", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    return(text)
for i in range(len(clean_tweet_texts)):
  clean_tweet_texts[i] = normalizeArabic(clean_tweet_texts[i])
#repeated letters
import re
for i in range(len(clean_tweet_texts)):
  clean_tweet_texts[i] = re.sub(r'(.)\1+', r'\1\1', clean_tweet_texts[i])

In [0]:
clean_tweet_texts[:10]

['ابراهيم عيسي الوسخ ابن الوسخه كلما حصل حادث اتهم السعوديه بالارهاب الكلب كان براتب مليون جنيه من سنوي مصر',
 'اخطر حروب الارض حرب العقيده حسيبك الله اول ال راه يم رب عل نا وق ده الع يده وات زا',
 'اصبحت تقدم برامج عبر الجمعيات الخيريه لايصال خبثها اين مسوولي الجمعيه من هذا نطالب خادم الحرمين بايقاف',
 'اعلامنا متمثل في داوودالشريان روتانا وطقتهم كيف ترجي من هالاشكال خير همهم الوحيد في الحياه قياده المراه للسياره',
 'الاصرار مرتزقه برنامج الاصرار بضاعه هالمترديه قناه العهر مزجاه في جميع المجالات',
 'الاعلام اللبناني يهاجم السعوديه منذ مده بكل قبيح ومجموعه تدعم الاعلام في لبنان باقامه برامجها الضخمه فيها',
 'البرنامج استاجر بعض المشاهير الهوامير في تويتر عشان يبررو لهم ويرقعو لقناه برنامج اصرار',
 'الحمد لله ما احتاج اتعلم من واحد فاشل اخلاقيا همه الشحاذه من علي حساب مواطن يفضح برنامج الاصرار برعايه',
 'الخرج بيض الله وجه محافظ الخرج فهذه القناه تصب علي المسلمين سيل من المخالفات الشرعيه والمحرمات تدعو الي الرذيله وتشوه صوره الاسلام',
 'الرياض السعوديه رسالتي لوزير العمل في حينه عن برنا

In [0]:
clean_df = pd.DataFrame(clean_tweet_texts, columns=['text'])
clean_df['target'] = df.Polarity
clean_df.to_csv('clean_tweet.csv', encoding='utf-8')

In [14]:
!cp /content/drive/My\ Drive/Colab\ Notebooks/clean_tweet.csv

cp: missing destination file operand after '/content/drive/My Drive/Colab Notebooks/clean_tweet.csv'
Try 'cp --help' for more information.


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlinBackend.figure_format = 'retina'

In [16]:
csv = 'clean_tweet.csv'
my_df = pd.read_csv(csv, index_col=0)
my_df.head()

Unnamed: 0,text,target
0,ابراهيم عيسي الوسخ ابن الوسخه كلما حصل حادث ات...,0
1,اخطر حروب الارض حرب العقيده حسيبك الله اول ال ...,0
2,اصبحت تقدم برامج عبر الجمعيات الخيريه لايصال خ...,0
3,اعلامنا متمثل في داوودالشريان روتانا وطقتهم كي...,0
4,الاصرار مرتزقه برنامج الاصرار بضاعه هالمترديه ...,0


In [17]:
my_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56674 entries, 0 to 56673
Data columns (total 2 columns):
text      56673 non-null object
target    56674 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [18]:
my_df.dropna(inplace=True)
my_df.reset_index(drop=True, inplace=True)
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56673 entries, 0 to 56672
Data columns (total 2 columns):
text      56673 non-null object
target    56673 non-null int64
dtypes: int64(1), object(1)
memory usage: 885.6+ KB


In [0]:
x = my_df.text
y = my_df.target

In [0]:
from sklearn.model_selection import train_test_split
SEED = 666
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.2, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [21]:
print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive, {3:.2f}% neutral".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 2]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive, {3:.2f}% neutral".format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 2]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive, {3:.2f}% neutral".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 2]) / (len(x_test)*1.))*100))


Train set has total 45338 entries with 36.59% negative, 30.52% positive, 32.90% neutral
Validation set has total 5667 entries with 36.07% negative, 29.93% positive, 34.00% neutral
Test set has total 5668 entries with 37.01% negative, 29.73% positive, 33.26% neutral


In [22]:
#converting the labels to categorical data

from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_train.shape

Using TensorFlow backend.


(45338, 3)

In [0]:
y_validation = to_categorical(y_validation)

In [24]:
y_validation.shape

(5667, 3)

In [2]:
!pip install --upgrade tqdm

Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/cd/80/5bb262050dd2f30f8819626b7c92339708fe2ed7bd5554c8193b4487b367/tqdm-4.42.1-py2.py3-none-any.whl (59kB)
[K     |█████▌                          | 10kB 25.8MB/s eta 0:00:01[K     |███████████                     | 20kB 5.7MB/s eta 0:00:01[K     |████████████████▋               | 30kB 8.1MB/s eta 0:00:01[K     |██████████████████████▏         | 40kB 5.4MB/s eta 0:00:01[K     |███████████████████████████▊    | 51kB 6.6MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 4.8MB/s 
[?25hInstalling collected packages: tqdm
  Found existing installation: tqdm 4.28.1
    Uninstalling tqdm-4.28.1:
      Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.42.1


In [0]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

In [0]:
!cp /content/drive/My\ Drive/Colab\ Notebooks/full_uni_cbow_100_twitter.zip /content/

In [0]:
!cp /content/drive/My\ Drive/Colab\ Notebooks/full_uni_sg_100_twitter.zip /content/

In [29]:
!unzip full_uni_cbow_100_twitter.zip


Archive:  full_uni_cbow_100_twitter.zip
  inflating: full_uni_cbow_100_twitter.mdl  
  inflating: full_uni_cbow_100_twitter.mdl.trainables.syn1neg.npy  
  inflating: full_uni_cbow_100_twitter.mdl.wv.vectors.npy  


In [30]:
!unzip full_uni_sg_100_twitter.zip

Archive:  full_uni_sg_100_twitter.zip
  inflating: full_uni_sg_100_twitter.mdl  
  inflating: full_uni_sg_100_twitter.mdl.trainables.syn1neg.npy  
  inflating: full_uni_sg_100_twitter.mdl.wv.vectors.npy  


In [31]:
from gensim.models import KeyedVectors
model_ug_cbow = KeyedVectors.load('full_uni_cbow_100_twitter.mdl')
model_ug_sg = KeyedVectors.load('full_uni_cbow_100_twitter.mdl')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [32]:
len(model_ug_cbow.wv.vocab.keys())

1259756

In [33]:
embeddings_index = {}
for w in model_ug_cbow.wv.vocab.keys():
  embeddings_index[w] = np.append(model_ug_cbow.wv[w], 
                                  model_ug_sg.wv[w])
print(f'Found {len(embeddings_index)} word vectors.')

Found 1259756 word vectors.


In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

In [35]:
len(tokenizer.word_index)

74239

In [36]:
for x in x_train[:5]:
  print(x)

لماذا يا خوله العنزي انا ماعلي منها هذي حياتها وبلعنتها بس ليش يطلعونها بالتلفزيون انها واجهه مشرفه وكل الشعب يدري انها عار السعوديه
لماذا يا خوله العنزي فالرجاا كل واحد يبلع تبن وينطم والله بيحاسب كل واححد بطلو لقاافه
الف مبروك زعماا البطوله للتوضيح بطولات الهلال في سنه واحده بجده والثانيه بلندن والثالثه الرياض مدلع جمهوره بكل مكان الهل
كلمه للتاريخ فارس عوض لماذا الالغاا الهلال التعاون التعاون الهلال
تعليق الدراسه في القصيم اتخذ القرار قبل منتصف الليل الامطار تسقط غزاره علي القصيم اي حدث يحدث لا سمح


In [0]:
length = []
for x in x_train:
  length.append(len(x.split()))

In [38]:
max(length)

29

In [39]:
x_train_seq = pad_sequences(sequences, maxlen=35)
print('Shape of data tensor: ', x_train_seq.shape)

Shape of data tensor:  (45338, 35)


In [40]:
x_train_seq[:5]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,    69,    16,    92,   100,    84, 13248,
          373,   212,  6944, 32195,    58,   224, 32196, 16290,   295,
        21407, 32197,   345,   211,  8619,   295,  1845,     6],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,    69,    16,    92,   100, 16291,    21,   161, 32198,
         1798, 21408,    51, 32199,    21, 32200, 11246, 32201],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,    29,    30,   500,
          238,  7697,   583,     1,     2,   277,   805,  3513,  8620,
        21409, 16292,    42,  8621,  2081,   296,   493,  1799],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,   

In [0]:
sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=35)

In [0]:
num_words = 100000
embedding_matrix = np.zeros((num_words, 200))

for word, i in tokenizer.word_index.items():
  if i >= num_words:
    continue
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

In [0]:
seed = 3

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.embeddings import Embedding

In [0]:
sequences_test = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(sequences_test, maxlen=35)
y_test = to_categorical(y_test)

In [45]:
!pip install keras_metrics

Collecting keras_metrics
  Downloading https://files.pythonhosted.org/packages/32/c9/a87420da8e73de944e63a8e9cdcfb1f03ca31a7c4cdcdbd45d2cdf13275a/keras_metrics-1.1.0-py2.py3-none-any.whl
Installing collected packages: keras-metrics
Successfully installed keras-metrics-1.1.0


In [0]:
import keras
import keras_metrics
from keras import layers

#**GRU**

In [0]:
%%time


model_gru = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)
model_gru.add(e)
model_gru.add(keras.layers.GRU(256, dropout=0.2, recurrent_dropout=0.5))
model_gru.add(Dense(3, activation='softmax'))

model_gru.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy', keras_metrics.precision(), 
                           keras_metrics.recall(), keras_metrics.f1_score()])
model_gru.summary()
model_gru.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)











Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 35, 200)           20000000  
_________________________________________________________________
gru_1 (GRU)                  (None, 256)               350976    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 771       
Total params: 20,351,747
Trainable params: 20,351,747
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 45338 samples, validate on 5667 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 10min 20s, sys: 1min 13s, total: 1

In [0]:
model_gru.evaluate(x_test_seq, y_test)



[0.5628368094637096,
 0.8034580098800282,
 0.8396907216062015,
 0.7764537654539346,
 0.8068350169013767]

#**LSTM**

In [0]:
%%time

model_lstm = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)
model_lstm.add(e)
# model_lstm.add(Flatten())
# model.add(Dense(256, activation='relu'))
model_lstm.add(keras.layers.LSTM(256, dropout=0.2, recurrent_dropout=0.5))
model_lstm.add(Dense(3, activation='softmax'))

model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', keras_metrics.precision(), keras_metrics.recall(), keras_metrics.f1_score()])
model_lstm.summary()
model_lstm.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 35, 200)           20000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               467968    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 771       
Total params: 20,468,739
Trainable params: 20,468,739
Non-trainable params: 0
_________________________________________________________________
Train on 45338 samples, validate on 5667 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 12min 2s, sys: 1min 17s, total: 13min 20s
Wall time: 9min 39s


In [0]:
model_lstm.evaluate(x_test_seq, y_test)



[0.5733786303137253,
 0.8016937191249118,
 0.8412039439106795,
 0.7726406100680343,
 0.8054657885595433]

#**Bi-GRU**

In [0]:
%%time

model_bi_gru = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)
model_bi_gru.add(e)
# model_gru.add(Flatten())
# model.add(Dense(256, activation='relu'))
model_bi_gru.add(keras.layers.Bidirectional(keras.layers.GRU(256, dropout=0.2, recurrent_dropout=0.5)))
model_bi_gru.add(Dense(3, activation='softmax'))

model_bi_gru.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', keras_metrics.precision(), keras_metrics.recall(), keras_metrics.f1_score()])
model_bi_gru.summary()
model_bi_gru.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 35, 200)           20000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               701952    
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 1539      
Total params: 20,703,491
Trainable params: 20,703,491
Non-trainable params: 0
_________________________________________________________________
Train on 45338 samples, validate on 5667 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 22min 16s, sys: 3min 7s, total: 25min 23s
Wall time: 15min 59s


In [0]:
model_bi_gru.evaluate(x_test_seq, y_test)



[0.5635861348920753,
 0.7972829922371206,
 0.8499730166837575,
 0.750714966599108,
 0.7972664649582831]

#**Bi-LSTM**

In [0]:
%%time

model_bi_lstm = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)
model_bi_lstm.add(e)
# model_lstm.add(Flatten())
# model.add(Dense(256, activation='relu'))
model_bi_lstm.add(keras.layers.Bidirectional(keras.layers.LSTM(256, dropout=0.2, recurrent_dropout=0.5)))
model_bi_lstm.add(Dense(3, activation='softmax'))

model_bi_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', keras_metrics.precision(), keras_metrics.recall(), keras_metrics.f1_score()])
model_bi_lstm.summary()
model_bi_lstm.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 35, 200)           20000000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               935936    
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 1539      
Total params: 20,937,475
Trainable params: 20,937,475
Non-trainable params: 0
_________________________________________________________________
Train on 45338 samples, validate on 5667 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 27min 11s, sys: 3min 51s, total: 31min 2s
Wall time: 19min 26s


In [0]:
model_bi_lstm.evaluate(x_test_seq, y_test)



[0.5793126914268997,
 0.7997529992942837,
 0.8698358799734105,
 0.732602478516082,
 0.7953427705066731]

#**Stacked GRU**

In [0]:
%%time

model_st_gru = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)
model_st_gru.add(e)
# model_st_gru.add(Flatten())
# model.add(Dense(256, activation='relu'))
model_st_gru.add(keras.layers.GRU(64, dropout=0.2, recurrent_dropout=0.5, return_sequences=True))
model_st_gru.add(keras.layers.GRU(128, dropout=0.2))
model_st_gru.add(Dense(3, activation='softmax'))

model_st_gru.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', keras_metrics.precision(), keras_metrics.recall(), keras_metrics.f1_score()])
model_st_gru.summary()
model_st_gru.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 35, 200)           20000000  
_________________________________________________________________
gru_3 (GRU)                  (None, 35, 64)            50880     
_________________________________________________________________
gru_4 (GRU)                  (None, 128)               74112     
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 387       
Total params: 20,125,379
Trainable params: 20,125,379
Non-trainable params: 0
_________________________________________________________________
Train on 45338 samples, validate on 5667 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 17min 59s, sys: 1min 33s, total: 19min 33s
Wall time: 13min 46s


In [0]:
model_st_gru.evaluate(x_test_seq, y_test)



[0.5480669759156086,
 0.805045871559633,
 0.8445487740821831,
 0.7716873212215593,
 0.8064756661262017]

#**Stacked LSTM**

In [0]:
%%time

model_st_lstm = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)
model_st_lstm.add(e)
# model_st_gru.add(Flatten())
# model.add(Dense(256, activation='relu'))
model_st_lstm.add(keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.5, return_sequences=True))
model_st_lstm.add(keras.layers.LSTM(128, dropout=0.2))
model_st_lstm.add(Dense(3, activation='softmax'))

model_st_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', keras_metrics.precision(), keras_metrics.recall(), keras_metrics.f1_score()])
model_st_lstm.summary()
model_st_lstm.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 35, 200)           20000000  
_________________________________________________________________
lstm_3 (LSTM)                (None, 35, 64)            67840     
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 387       
Total params: 20,167,043
Trainable params: 20,167,043
Non-trainable params: 0
_________________________________________________________________
Train on 45338 samples, validate on 5667 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 21min 8s, sys: 1min 47s, total: 22min 56s
Wall time: 16min 11s


In [0]:
model_st_lstm.evaluate(x_test_seq, y_test)



[0.5411306851837276,
 0.8041637261820748,
 0.844027125673238,
 0.7712106767983218,
 0.8059775341212453]

#**Stacked Bi-GRU**

In [0]:
%%time
model_StBiGRU_input = layers.Input(shape=(x_train_seq.shape[1],))
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)(model_StBiGRU_input)

forw = layers.GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.5)(e)
backw = layers.GRU(64, return_sequences=True, go_backwards=True, dropout=0.2, recurrent_dropout=0.5)(e)


forw2 = layers.GRU(128, dropout=0.2)(forw)
backw2 = layers.GRU(128, go_backwards=True, dropout=0.2)(backw)
link= layers.Concatenate()([forw2,backw2])

x = layers.Dense(3, activation='softmax')(link)

model_StBiGRU = keras.models.Model(inputs=model_StBiGRU_input, outputs=x)

model_StBiGRU.compile(loss='categorical_crossentropy', 
                        optimizer='adam', 
                        metrics=['accuracy', 
                                 keras_metrics.precision(), 
                                 keras_metrics.recall(), 
                                 keras_metrics.f1_score()])
model_StBiGRU.summary()
model_StBiGRU.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 35)           0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 35, 200)      20000000    input_1[0][0]                    
__________________________________________________________________________________________________
gru_5 (GRU)                     (None, 35, 64)       50880       embedding_7[0][0]                
__________________________________________________________________________________________________
gru_6 (GRU)                     (None, 35, 64)       50880       embedding_7[0][0]                
____________________________________________________________________________________________

In [0]:
model_StBiGRU.evaluate(x_test_seq, y_test)



[0.5879813143293371,
 0.7985179957657021,
 0.8304123710912158,
 0.767874165835659,
 0.7979197122955777]

#**Stacked Bi-LSTM**

In [0]:
%%time
model_StBiLSTM_input = layers.Input(shape=(x_train_seq.shape[1],))
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)(model_StBiLSTM_input)

forw = layers.LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.5)(e)
backw = layers.LSTM(64, return_sequences=True, go_backwards=True, dropout=0.2, recurrent_dropout=0.5)(e)


forw2 = layers.LSTM(128, dropout=0.2)(forw)
backw2 = layers.LSTM(128, go_backwards=True, dropout=0.2)(backw)
link= layers.Concatenate()([forw2,backw2])

x = layers.Dense(3, activation='softmax')(link)

model_StBiLSTM = keras.models.Model(inputs=model_StBiLSTM_input, outputs=x)

model_StBiLSTM.compile(loss='categorical_crossentropy', 
                        optimizer='adam', 
                        metrics=['accuracy', 
                                 keras_metrics.precision(), 
                                 keras_metrics.recall(), 
                                 keras_metrics.f1_score()])
model_StBiLSTM.summary()
model_StBiLSTM.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 35)           0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 35, 200)      20000000    input_2[0][0]                    
__________________________________________________________________________________________________
lstm_5 (LSTM)                   (None, 35, 64)       67840       embedding_8[0][0]                
__________________________________________________________________________________________________
lstm_6 (LSTM)                   (None, 35, 64)       67840       embedding_8[0][0]                
____________________________________________________________________________________________

In [0]:
model_StBiLSTM.evaluate(x_test_seq, y_test)



[0.5795366378342003,
 0.8022230063514467,
 0.8661814109257456,
 0.7373689227484572,
 0.7966013920941897]

#**Concatenated BI-GRU**

In [0]:
%%time

model_st_bi_gru = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)
model_st_bi_gru.add(e)
model_st_bi_gru.add(keras.layers.Bidirectional(
    keras.layers.GRU(64, dropout=0.2, recurrent_dropout=0.5, 
                      return_sequences=True)))
model_st_bi_gru.add(keras.layers.Bidirectional(
    keras.layers.GRU(128, dropout=0.2)))
model_st_bi_gru.add(Dense(3, activation='softmax'))

model_st_bi_gru.compile(loss='categorical_crossentropy', 
                        optimizer='adam', 
                        metrics=['accuracy', 
                                 keras_metrics.precision(), 
                                 keras_metrics.recall(), 
                                 keras_metrics.f1_score()])
model_st_bi_gru.summary()
model_st_bi_gru.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 35, 200)           20000000  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 35, 128)           101760    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               197376    
_________________________________________________________________
dense_9 (Dense)              (None, 3)                 771       
Total params: 20,299,907
Trainable params: 20,299,907
Non-trainable params: 0
_________________________________________________________________
Train on 45338 samples, validate on 5667 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 36min 55s, sys: 4min 7s, total: 41min 3s
Wall time: 26min 26s


In [0]:
model_st_bi_gru.evaluate(x_test_seq, y_test)



[0.5910256639196575,
 0.7979887085391673,
 0.8586533110262297,
 0.7354623450555071,
 0.7922977679729384]

#**Concatenated Bi-LSTM**

In [0]:
%%time

model_st_bi_lstm = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)
model_st_bi_lstm.add(e)
model_st_bi_lstm.add(keras.layers.Bidirectional(
    keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.5, 
                      return_sequences=True)))
model_st_bi_lstm.add(keras.layers.Bidirectional(
    keras.layers.LSTM(128, dropout=0.2)))
model_st_bi_lstm.add(Dense(3, activation='softmax'))

model_st_bi_lstm.compile(loss='categorical_crossentropy', 
                        optimizer='adam', 
                        metrics=['accuracy', 
                                 keras_metrics.precision(), 
                                 keras_metrics.recall(), 
                                 keras_metrics.f1_score()])
model_st_bi_lstm.summary()
model_st_bi_lstm.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 35, 200)           20000000  
_________________________________________________________________
bidirectional_5 (Bidirection (None, 35, 128)           135680    
_________________________________________________________________
bidirectional_6 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_10 (Dense)             (None, 3)                 771       
Total params: 20,399,619
Trainable params: 20,399,619
Non-trainable params: 0
_________________________________________________________________
Train on 45338 samples, validate on 5667 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 44min 35s, sys: 5min 10s, total: 49min 46s
Wall time: 32min 7s


In [0]:
model_st_bi_lstm.evaluate(x_test_seq, y_test)



[0.5684865785469534,
 0.8032815808045166,
 0.8315467074613809,
 0.7764537654539346,
 0.8030563956686027]

#**Concatenated Bi-GRU (2)**

In [0]:
%%time
model_ConGRU_input = layers.Input(shape=(x_train_seq.shape[1],))
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)(model_ConGRU_input)

forw = layers.GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.5)(e)
backw = layers.GRU(64, return_sequences=True, go_backwards=True, dropout=0.2, recurrent_dropout=0.5)(e)

link= layers.Concatenate()([forw,backw])

forw2 = layers.GRU(64, dropout=0.2)(link)
backw2 = layers.GRU(64, go_backwards=True, dropout=0.2)(link)
link2= layers.Concatenate()([forw2,backw2])

x = layers.Dense(3, activation='softmax')(link2)

model_ConGRU = keras.models.Model(inputs=model_ConGRU_input, outputs=x)

model_ConGRU.compile(loss='categorical_crossentropy', 
                        optimizer='adam', 
                        metrics=['accuracy', 
                                 keras_metrics.precision(), 
                                 keras_metrics.recall(), 
                                 keras_metrics.f1_score()])
model_ConGRU.summary()
model_ConGRU.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 35)           0                                            
__________________________________________________________________________________________________
embedding_15 (Embedding)        (None, 35, 200)      20000000    input_7[0][0]                    
__________________________________________________________________________________________________
gru_19 (GRU)                    (None, 35, 64)       50880       embedding_15[0][0]               
__________________________________________________________________________________________________
gru_20 (GRU)                    (None, 35, 64)       50880       embedding_15[0][0]               
____________________________________________________________________________________________

In [0]:
model_ConGRU.evaluate(x_test_seq, y_test)



[0.575116395066974,
 0.7974594213126324,
 0.8275686673027686,
 0.7755004766074595,
 0.8006889263913004]

#**Concatenated Bi-LSTM (2)**

In [0]:
%%time
model_ConLSTM_input = layers.Input(shape=(x_train_seq.shape[1],))
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=35
              , trainable=True)(model_ConLSTM_input)

forw = layers.LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.5)(e)
backw = layers.LSTM(64, return_sequences=True, go_backwards=True, dropout=0.2, recurrent_dropout=0.5)(e)

link= layers.Concatenate()([forw,backw])

forw2 = layers.LSTM(64, dropout=0.2)(link)
backw2 = layers.LSTM(64, go_backwards=True, dropout=0.2)(link)
link2= layers.Concatenate()([forw2,backw2])

x = layers.Dense(3, activation='softmax')(link2)

model_ConLSTM = keras.models.Model(inputs=model_ConLSTM_input, outputs=x)

model_ConLSTM.compile(loss='categorical_crossentropy', 
                        optimizer='adam', 
                        metrics=['accuracy', 
                                 keras_metrics.precision(), 
                                 keras_metrics.recall(), 
                                 keras_metrics.f1_score()])
model_ConLSTM.summary()
model_ConLSTM.fit(x_train_seq, y_train, 
          validation_data=(x_val_seq, y_validation),
          epochs=5, batch_size=32)

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 35)           0                                            
__________________________________________________________________________________________________
embedding_16 (Embedding)        (None, 35, 200)      20000000    input_8[0][0]                    
__________________________________________________________________________________________________
lstm_19 (LSTM)                  (None, 35, 64)       67840       embedding_16[0][0]               
__________________________________________________________________________________________________
lstm_20 (LSTM)                  (None, 35, 64)       67840       embedding_16[0][0]               
____________________________________________________________________________________________

In [0]:
model_ConLSTM.evaluate(x_test_seq, y_test)



[0.5654549431279249,
 0.80257586450247,
 0.8646239553835864,
 0.7397521448646448,
 0.7973284886641925]