<a href="https://colab.research.google.com/github/Allarwa/AMI-Dataset/blob/main/AMI_benchmark_evaluation_experiments_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AMI benchmark experiments



* Traditional
 * TF/IDF
 *  BOW
* Word Embeddings
 * Word2Vec - CBoW

* Language Models
 * AraBERT
* DNN



## Required Packages

In [None]:
!pip install -q tqdm
!pip install -q joblib==1.1.0
!pip install -q pyarabic
!pip install transformers

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import random

## Mount your Google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Read the Dataset and Preprocessing

In [None]:
df_AMITraining = pd.read_excel("/content/drive/MyDrive/AMI TrainingSet.xlsx", header=0)
df_AMITesting = pd.read_excel("/content/drive/MyDrive/AMI TestSet.xlsx", header=0)

In [None]:
df_AMITraining

In [None]:
df_AMITesting

In [None]:
# Labels Encoding (LabelEncoder) TRAINING  choose manual or automatic annotation
from sklearn.preprocessing import LabelEncoder
lbl_encoder = LabelEncoder()

df_AMITraining['label'] = lbl_encoder.fit_transform(df_AMITraining['Asentiment'])

df_AMITraining.sample(10)

In [None]:
# Labels Encoding (LabelEncoder) TESTING
# Reuse the encoder that was fitted on the training labels (previous cell) so
# the test set gets exactly the same class -> integer mapping. A label that
# never appeared in training now raises instead of silently shifting the codes.
from sklearn.preprocessing import LabelEncoder

df_AMITesting['label'] = lbl_encoder.transform(df_AMITesting['Asentiment'])

df_AMITesting.sample(10)


Unnamed: 0,ID,Tweet,Msentiment,Asentiment,label
327,328,علي فكره الاكتئاب ماله اي علاقه بالدين بنت خا...,POS,POS,1
630,631,ما اروع الارق بتاع الامتحانات,POS,POS,1
501,502,الارق ملعون,NEG,NEG,0
346,347,كلهم يعانون من ثنائي القطب,NEG,NEG,0
200,201,اللهم امانا لمن اعتاد القلق ونسي الطمانينه,POS,POS,1
560,561,بعد مرور خمس سنوات من الارق اعلن ع هذا المنبر ...,POS,POS,1
519,520,الله ياخذ الارق والتفكير الزايد,NEG,NEG,0
587,588,ثاني اسوا شي بالحياه بعد انسداد الشهيه الارق ل...,NEG,NEG,0
57,58,اطمئن سيمضي القلق وستاتي الراحه بعد هذا الكم ...,POS,POS,1
160,161,الضغط النفسي يولد الانفجار علي شيء اسخف من ال...,NEG,NEG,0


In [None]:
df_AMITraining['label'].value_counts()

In [None]:
# Clean the dataset from any unnecessary texts
# Load the Arabic stop-word list (one word per line) into a set for fast lookup.
STOPWORDS_PATH = '/content/drive/MyDrive/arabic_stopwords.txt'
# Decode explicitly so the Arabic words do not depend on the platform's default
# encoding. NOTE(review): assumes the list is stored as UTF-8 — confirm.
with open(STOPWORDS_PATH, 'r', encoding='utf-8') as f:
  ar_stopwords = {word.strip() for word in f}

# Star import kept as in the original: clean_text relies on names from this module.
from pyarabic.araby import *

def clean_text(text):
  """Return `text` as a single space-joined string of the tokens kept by `tokenize`.

  A token is kept only when `is_arabicrange` accepts it and it is not in
  `ar_stopwords`; `strip_tashkeel`, `strip_harakat` and `strip_tatweel` are
  passed to `tokenize` as the morphs to apply.
  """
  def is_not_stopword(token):
    # True for every token that is absent from the stop-word set.
    return not (token in ar_stopwords)

  token_filters = [is_arabicrange, is_not_stopword]
  token_morphs = [strip_tashkeel, strip_harakat, strip_tatweel]
  kept_tokens = tokenize(text, conditions = token_filters, morphs = token_morphs)
  return ' '.join(kept_tokens)



In [None]:
df_AMITraining['processed_Tweet'] = df_AMITraining['Tweet'].apply(clean_text)
df_AMITraining.head()

In [None]:
df_AMITesting['processed_Tweet'] = df_AMITesting['Tweet'].apply(clean_text)
df_AMITesting.head()

### Functions and data splitting

In [None]:
# Build the vocab
base_dir = '/content/drive/MyDrive/ Benchmark Evaluation '
# NOTE(review): base_dir has no trailing '/', so the file written below is named
# '<base_dir>vocab.txt' (a sibling of the folder, not a file inside it) — confirm
# that this is the intended location.

words_set = set()

# Collect every distinct whitespace-separated token of the cleaned tweets.
clean_data = df_AMITraining['processed_Tweet'].tolist()
clean_data2 = df_AMITesting['processed_Tweet'].tolist()
for cleaned_tweets in (clean_data, clean_data2):
    for s in cleaned_tweets:
        words_set.update(s.split())

# Written as UTF-8 (the tokens are Arabic) and in sorted order, so the file is
# byte-identical between runs instead of following the set's arbitrary order.
with open(base_dir+'vocab.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(words_set)))

print("Vocb size is ", len(words_set))

Vocb size is  12625


In [None]:
# confusion_matrix code
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import itertools
import math
import numpy as np
def confusion_matrix_scorer(y, pred):
    """Print statistics for, and plot, the binary confusion matrix of `pred` against `y`.

    Parameters
    ----------
    y : true labels, encoded as 0 (negative) / 1 (positive).
    pred : predicted labels, same encoding and length as `y`.

    Returns
    -------
    dict with the four cell counts under the keys 'tn', 'fp', 'fn', 'tp'.
    """
    # Fix the label order so the matrix is always 2x2, laid out
    # [[TN, FP], [FN, TP]], even when one class is missing from both `y` and
    # `pred`; otherwise the indexing below and the unpacking in
    # print_statistics fail on a smaller matrix.
    cm = confusion_matrix(y, pred, labels=[0, 1])
    print_statistics(cm)
    plot_confusion_matrix(cm, classes=['Negative','Positive'],
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues)
    #to add rule matrix
    return {'tn': cm[0, 0], 'fp': cm[0, 1],
            'fn': cm[1, 0], 'tp': cm[1, 1]}


def print_statistics(cm):
    """Print the usual binary-classification statistics of a 2x2 confusion matrix.

    Parameters
    ----------
    cm : 2x2 array whose flattened order is TN, FP, FN, TP
        (rows = true class, columns = predicted class).

    Returns
    -------
    (fall_out, recall) : the false-positive rate and the true-positive rate.

    A ratio whose denominator is zero is reported as ``nan``.
    """
    # Work with plain Python ints: they have arbitrary precision, so the
    # four-factor product in the MCC denominator cannot wrap around the way
    # fixed-width NumPy integers do for large counts.
    tn, fp, fn, tp = (int(v) for v in cm.ravel())

    def ratio(numerator, denominator):
        # Undefined ratios (zero denominator) are reported as nan, not raised.
        return numerator / denominator if denominator else float('nan')

    # TP
    print("TP: " + str(tp))
    # TN
    print("TN: " + str(tn))
    # FP
    print("FP: " + str(fp))
    # FN
    print("FN: " + str(fn))
    # TPR
    recall = ratio(tp, tp+fn)
    print("TPR/recall: " + str(recall))
    # TNR
    specificity = ratio(tn, tn+fp)
    print("TNR/specificity: " + str(specificity))
    # PPV
    precision = ratio(tp, tp+fp)
    print("PPV/precision: " + str(precision))
    # NPV
    npv = ratio(tn, tn+fn)
    print("NPV/negative predictive value: " + str(npv))
    # FNR
    miss_rate = 1-recall
    print("FNR/false negative rate: " + str(miss_rate))
    # FPR
    fall_out = 1-specificity
    print("FPR/false positive rate: " + str(fall_out))
    # FDR
    fdr = 1-precision
    print("FDR/false discovery rate: " + str(fdr))
    # FOR
    fomr = 1-npv
    print("FOR/false ommission rate: " + str(fomr))
    # F1
    f1 = 2*ratio(precision*recall, precision+recall)
    print("F1 score: " + str(f1))
    # accuracy
    acc = ratio(tp+tn, tp+tn+fp+fn)
    print("Accuracy: " + str(acc))
    # Matthews correlation coefficient (MCC)
    mcc = ratio(tp*tn-fp*fn, math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))
    print("MCC/Matthews correlation coefficient: " + str(mcc))
    # Informedness or Bookmaker Informedness (BM)
    bm = recall+specificity-1
    print("BM/Bookmaker Informedness: " + str(bm))
    # Markedness (MK)
    mk = precision+npv-1
    print("MK/Markedness: " + str(mk))
    return fall_out, recall


# credit (for plot_confusion_matrix below): https://github.com/scikit-learn/scikit-learn/blob/master/examples/model_selection/plot_confusion_matrix.py
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a 2x2 confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2x2 NumPy array laid out [[TN, FP], [FN, TP]].
    classes : tick labels, in the same order as the rows/columns of `cm`.
    normalize : when True, every row is divided by its sum before plotting, so
        each cell shows the fraction of its true class (printed with two
        decimals); when False the raw counts are shown.
    title : figure title.
    cmap : matplotlib colour map used for the cells.
    """
    if normalize:
        # Row-normalise, as in the scikit-learn example this function is based
        # on; previously the flag only switched the number format.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)

    plt.title(title)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    s = [['TN','FP'], ['FN', 'TP']]

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so it stays readable.
    thresh = cm.max() / 2.
    for i in range(2):
        for j in range(2):
            plt.text(j,i, (str(s[i][j])+" = "+str(format(cm[i][j],fmt))),horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Split the dataset
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(df_dataset['processed_Tweet'], df_dataset['label'], test_size=0.2, stratify=df_dataset['label'])
#len(X_train), len(X_test)

#sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.20, random_state=1000)
X_train= df_AMITraining['processed_Tweet'].values # Manual or Automatic  Sentiment
y_train = df_AMITraining['label'].values

X_test = df_AMITesting['processed_Tweet'].values
y_test = df_AMITesting['label'].values

len(X_train), len(X_test)
print('Training: ', X_train.shape)
print('Testing: ', X_test.shape)

## Traditional Word Representations

### Bow

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

BOW =  CountVectorizer()
X_train_BOW  = BOW .fit_transform(X_train)
X_test_BOW  = BOW .transform(X_test)

X_train_BOW.shape, X_test_BOW.shape

### Bow+ SVM

In [None]:
# training using Support Vector Machines
from sklearn import svm
svm_clf = svm.SVC()
svm_clf.fit(X_train_BOW, y_train)

# Performance on training data
from sklearn.metrics import accuracy_score, classification_report

y_pred_train = svm_clf.predict(X_train_BOW)
print(classification_report(y_train, y_pred_train))

In [None]:
# Performance on testing data
y_pred_test = svm_clf.predict(X_test_BOW)
print(classification_report(y_test, y_pred_test))

In [None]:
confusion_matrix_scorer(y_test, y_pred_test)

### Bow+ LR

In [None]:
# training using Logistic Regression
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
LR_clf = LogisticRegression(max_iter=1000).fit(X_train_BOW, y_train)

# Performance on training data
from sklearn.metrics import accuracy_score, classification_report

y_pred_train = LR_clf.predict(X_train_BOW)
train_report = classification_report(y_train, y_pred_train)
print(train_report)

In [None]:
# Performance on testing data
y_pred_test = LR_clf.predict(X_test_BOW)
print(classification_report(y_test, y_pred_test))

In [None]:
confusion_matrix_scorer(y_test, y_pred_test)

### Bow+ DT

In [None]:
# training using DT
from sklearn.tree import DecisionTreeClassifier
DT_clf = DecisionTreeClassifier()
DT_clf.fit(X_train_BOW, y_train)

# Performance on training data
from sklearn.metrics import accuracy_score, classification_report

y_pred_train = DT_clf.predict(X_train_BOW)
print(classification_report(y_train, y_pred_train))

In [None]:
# Performance on testing data
y_pred_test = DT_clf.predict(X_test_BOW)
print(classification_report(y_test, y_pred_test))

In [None]:
confusion_matrix_scorer(y_test, y_pred_test)

### TF/IDF

In [None]:
# Split the dataset
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(df_dataset['processed_Tweet'], df_dataset['label'], test_size=0.2, stratify=df_dataset['label'])
#len(X_train), len(X_test)

#sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.20, random_state=1000) # مهم
X_train= df_AMITraining['processed_Tweet'].values
y_train = df_AMITraining['label'].values

X_test = df_AMITesting['processed_Tweet'].values
y_test = df_AMITesting['label'].values

len(X_train), len(X_test)
print('Training: ', X_train.shape)
print('Testing: ', X_test.shape)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape

#### TF/IDF with the SVM Classifier

In [None]:
# training using Support Vector Machines
from sklearn import svm

svm_clf = svm.SVC()

svm_clf.fit(X_train_tfidf, y_train)

In [None]:
# Performance on training data
from sklearn.metrics import accuracy_score, classification_report

y_pred_train = svm_clf.predict(X_train_tfidf)
print(classification_report(y_train, y_pred_train))

In [None]:
# Performance on testing data

y_pred_test = svm_clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_test))

In [None]:
confusion_matrix_scorer(y_test, y_pred_test)

In [None]:
# save trained SVM model
import os
import joblib
base_dir = '/content/drive/MyDrive/Colab/NLP/'
# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(base_dir, exist_ok=True)
joblib.dump(svm_clf, base_dir+'tfidf_svm_clf.model')

#### TF/IDF with the LR Classifier

In [None]:
# training using LR
from sklearn.linear_model import LogisticRegression
LR_clf = LogisticRegression(max_iter=1000)
LR_clf.fit(X_train_tfidf, y_train)

In [None]:
# Performance on training data
from sklearn.metrics import accuracy_score, classification_report

y_pred_train = LR_clf.predict(X_train_tfidf)
print(classification_report(y_train, y_pred_train))

In [None]:
# Performance on testing data
y_pred_test = LR_clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_test))

In [None]:
confusion_matrix_scorer(y_test, y_pred_test)

#### TF/IDF with the DT Classifier

In [None]:
# training using DT
from sklearn.tree import DecisionTreeClassifier
DT_clf = DecisionTreeClassifier()
DT_clf.fit(X_train_tfidf, y_train)

In [None]:
# Performance on training data
from sklearn.metrics import accuracy_score, classification_report

y_pred_train = DT_clf.predict(X_train_tfidf)
print(classification_report(y_train, y_pred_train))

In [None]:
# Performance on testing data
y_pred_test = DT_clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_test))

In [None]:
confusion_matrix_scorer(y_test, y_pred_test)

## Word Embeddings: Word2Vec

In [None]:
# Download the Word2Vec models


# using the following on the paper
# for cbow
!wget https://archive.org/download/aravec2.0/tweet_cbow_300.zip -O w2v_cbow_300.zip #3 use this in the paper
!mkdir -p w2v_cbow_300
!unzip -qq w2v_cbow_300.zip -d w2v_cbow_300
# for SG
#!wget https://archive.org/download/aravec2.0/tweets_sg_300.zip -O w2v_sg_300.zip #2 use this in the paper
#!mkdir -p w2v_sg_300
#!unzip -qq w2v_sg_300.zip -d w2v_sg_300

#### CBoW + SG

In [None]:
from gensim.models import KeyedVectors
def load_w2v(filepath,binary):
    """Load word vectors stored in word2vec format from `filepath`.

    `binary` is forwarded unchanged to `KeyedVectors.load_word2vec_format`.
    """
    vectors = KeyedVectors.load_word2vec_format(filepath, binary=binary)
    return vectors

In [None]:
import gensim
import re
import numpy as np
from nltk import ngrams


# for cbow
w2v_model = gensim.models.Word2Vec.load('./w2v_cbow_300/tweets_cbow_300') # 3*
#for SG
#w2v_model = gensim.models.Word2Vec.load('./w2v_sg_300/tweets_sg_300') # 2*

len(w2v_model.wv.index_to_key)

In [None]:
import numpy as np
# Replace the words in each text message with the learned word vector.
# Every entry of X_train / X_test is one cleaned tweet, i.e. a single string of
# space-separated words, so it has to be split first: iterating the string
# directly would look up individual characters instead of words.
words = set(w2v_model.wv.index_to_key)
# Plain lists are used for the outer container because tweets contain different
# numbers of in-vocabulary words; a ragged list cannot be packed into one
# regular np.array.
X_train_w2v = [np.array([w2v_model.wv[w] for w in l.split() if w in words]) for l in X_train]
X_test_w2v = [np.array([w2v_model.wv[w] for w in l.split() if w in words]) for l in X_test]

In [None]:
# Average the word vectors for each sentence in the training and testing data.
# A tweet without any in-vocabulary word gets an all-zero vector; it must have
# the same width as the model's vectors (300 for the AraVec model loaded above,
# not 100), otherwise the rows cannot form one rectangular feature matrix.
w2v_dim = w2v_model.wv.vector_size


def average_word_vectors(sentence_vectors, dim):
    """Return one mean vector per sentence.

    sentence_vectors : iterable of 2-D arrays (words x dim), possibly empty.
    dim : width of the zero vector used for sentences with no word vectors.
    """
    return [v.mean(axis=0) if v.size else np.zeros(dim, dtype=float)
            for v in sentence_vectors]


X_train_w2v_avg = average_word_vectors(X_train_w2v, w2v_dim)
X_test_w2v_avg = average_word_vectors(X_test_w2v, w2v_dim)

#### Using AraVec CBoW model with the SVM Classifier

In [None]:
# training using Support Vector Machines
from sklearn import svm

svm_clf = svm.SVC()

svm_clf.fit(X_train_w2v_avg, y_train)

In [None]:
# Performance on training data
from sklearn.metrics import accuracy_score, classification_report
y_pred_train = svm_clf.predict(X_train_w2v_avg)
print(classification_report(y_train, y_pred_train))

In [None]:
# Performance on testing data
y_pred_test = svm_clf.predict(X_test_w2v_avg)
print(classification_report(y_test, y_pred_test))

In [None]:
confusion_matrix_scorer(y_test, y_pred_test)

In [None]:
# save trained SVM model
import joblib
joblib.dump(svm_clf, base_dir+'w2v_svm_clf.model')

## Language Model: BERT

In [None]:
df_train =  pd.read_excel("/content/drive/MyDrive/AMI TrainingSet.xlsx", header=0)
df_train

In [None]:
# Labels Encoding (LabelEncoder) TRAINING
from sklearn.preprocessing import LabelEncoder
lbl_encoder = LabelEncoder()
# The automatic-sentiment column of this sheet is named 'Asentiment' (the same
# file is encoded with that name earlier in the notebook); 'ASentiment ' with a
# capital S and a trailing space raises a KeyError.
df_train['label'] = lbl_encoder.fit_transform(df_train['Asentiment'])
df_train.sample(10)

In [None]:
# Labels Encoding (OneHotEncoder)
from sklearn.preprocessing import OneHotEncoder
import numpy as np

encoder = OneHotEncoder()

# Column name fixed to 'Asentiment', matching the header of the sheet as it is
# used earlier in the notebook ('ASentiment ' does not exist).
enc_df = pd.DataFrame(encoder.fit_transform(df_train[['Asentiment']]).toarray())
classes_names = np.array(encoder.categories_).tolist()[0] # categories are list of arrays
# Name the one-hot columns after the classes and attach them to the tweets.
enc_df.columns = classes_names
df_train = df_train.join(enc_df)

df_train.head()

In [None]:
# Preview only (the result is not assigned). Column names fixed to the sheet's
# actual headers 'Msentiment' / 'Asentiment'; the previous spellings raise a KeyError.
df_AMITesting.drop(columns=["ID","Msentiment", 'Asentiment'])

In [None]:
df_test = pd.read_excel("/content/drive/MyDrive/AMI TestSet.xlsx", header=0)
df_test

In [None]:
# Labels Encoding (LabelEncoder) TESTING
from sklearn.preprocessing import LabelEncoder
lbl_encoder = LabelEncoder()
# Column name fixed to 'Asentiment' (the header used for this sheet earlier in
# the notebook); 'ASentiment ' raises a KeyError.
df_test['label'] = lbl_encoder.fit_transform(df_test['Asentiment'])
df_test.sample(10)

In [None]:
# Preview only (the result is not assigned). Column names fixed to the sheet's
# actual headers 'Msentiment' / 'Asentiment'; the previous spellings raise a KeyError.
df_AMITesting.drop(columns=["ID","Msentiment", 'Asentiment'])

In [None]:
len(df_train), len(df_test)


In [None]:
# Labels Encoding (OneHotEncoder)
from sklearn.preprocessing import OneHotEncoder
import numpy as np

encoder = OneHotEncoder()

# Column name fixed to 'Asentiment', matching the header of the sheet as it is
# used earlier in the notebook ('ASentiment ' does not exist).
enc_df = pd.DataFrame(encoder.fit_transform(df_test[['Asentiment']]).toarray())
classes_names = np.array(encoder.categories_).tolist()[0] # categories are list of arrays
# Name the one-hot columns after the classes and attach them to the tweets.
enc_df.columns = classes_names
df_test = df_test.join(enc_df)

df_test.head()

In [None]:
# Convert text into BERT word embeddings
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'running on {device} device')

model_name = "aubmindlab/bert-large-arabertv02-twitter" #"aubmindlab/bert-base-arabertv2"
bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name).to(device)

In [None]:
def getBERTWordEmbeddings(df_tweets, bert_tokenizer, bert_model,
                          max_length=128, batch_size=32,
                          device='cuda', return_tensors='pt'):
  """Encode the `Tweet` column of `df_tweets` with BERT, one batch at a time.

  Parameters
  ----------
  df_tweets : DataFrame with a `Tweet` column and the columns listed in the
      module-level `classes_names`.
  bert_tokenizer, bert_model : tokenizer and model used to encode the tweets.
  max_length : tweets are truncated to this many tokens.
  batch_size : number of tweets encoded per forward pass.
  device : fallback device for the inputs, used only when the model's own
      device cannot be determined.
  return_tensors : forwarded to the tokenizer.

  Returns
  -------
  (batches_embeddings, batches_labels) : two lists with one entry per batch —
  the [CLS] last-hidden-state vectors (moved to CPU) and the matching rows of
  the `classes_names` columns.
  """
  # The inputs have to sit on the same device as the model's weights. Take that
  # device from the model itself, so the default device='cuda' no longer breaks
  # CPU-only runs; fall back to `device` if the model exposes no parameters.
  try:
    target_device = next(bert_model.parameters()).device
  except (AttributeError, StopIteration):
    target_device = device

  batches_embeddings = []
  batches_labels = []

  # split all tweets into batches
  tweets = df_tweets.Tweet.values.tolist()
  for i in tqdm(range(0, len(tweets), batch_size)):

    # tokenize all the tweets in the batch (padded to the longest one)
    tweets_tokens = bert_tokenizer(tweets[i:i+batch_size], padding = True,
                                    max_length=max_length, truncation = True,
                                    return_tensors=return_tensors)

    # move tokens to the model's device
    tweets_tokens = {k:v.to(target_device) for k,v in tweets_tokens.items()}

    # extract BERT embeddings (inference only, so no gradients are tracked)
    with torch.no_grad():
      hidden_samples = bert_model(**tweets_tokens)

    # extract [CLS] last hidden states as embeddings and move them to CPU memory
    embeddings = hidden_samples.last_hidden_state[:,0,:]
    batches_embeddings.append(embeddings.to('cpu'))

    # positional row slice that lines up with the tweets of this batch
    batches_labels.append(df_tweets[i:i+batch_size][classes_names])

  return batches_embeddings, batches_labels

In [None]:
train_embeddings, train_labels = getBERTWordEmbeddings(df_train, bert_tokenizer, bert_model)
test_embeddings, test_labels = getBERTWordEmbeddings(df_test, bert_tokenizer, bert_model)

#### Using AraBERT with the SVM Classifier

In [None]:
# training using Support Vector Machines
from sklearn import svm

svm_clf = svm.SVC()

X_train = torch.cat(train_embeddings)
y_train = df_train.label

svm_clf.fit(X_train, y_train)
print('training completed..')

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Performance on training data
y_pred_train = svm_clf.predict(X_train)
accuracy = accuracy_score(y_train, y_pred_train)
print('Training Accuracy: ', "%.2f" % (accuracy*100))
print(classification_report(y_train, y_pred_train))

In [None]:
# Performance on testing data
X_test = torch.cat(test_embeddings)
y_test = df_test.label

y_pred = svm_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Testing Accuracy: ', "%.2f" % (accuracy*100))
print(classification_report(y_test, y_pred))

In [None]:
confusion_matrix_scorer(y_test, y_pred)

In [None]:
# Get predictions on sample article

sample_article = df_test.sample(n=1)

embeddings,_ = getBERTWordEmbeddings(sample_article, bert_tokenizer, bert_model)
y_pred = svm_clf.predict(torch.cat(embeddings))

print()
print(sample_article.Tweet.item())
print('Actual Category:', sample_article.Msentiment.item())
print('Predicted Category:', lbl_encoder.inverse_transform(y_pred)[0])


In [None]:
# save trained SVM model
import os
import joblib
base_dir = '/content/drive/MyDrive/Colab/NLP/'
# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(base_dir, exist_ok=True)
joblib.dump(svm_clf, base_dir+'arabert_svm_clf.model')

## Using LSTM - Bi-LSTM - CNN with fastText

In [None]:
pip install gensim

In [None]:
pip install --upgrade gensim

In [None]:
!pip install chakin # fast text embedding
!pip install -q pyarabic

In [None]:
import pandas as pd
import numpy as np
import nltk
import collections
nltk.download('punkt')
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras import layers
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.backend import clear_session
from keras.models import load_model
from keras.preprocessing import sequence
from keras.layers import Dense, Dropout, Activation,LSTM,SpatialDropout1D
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D,MaxPooling1D

In [None]:
import chakin
chakin.search(lang='Arabic')

In [None]:
 chakin.download(number=0, save_dir='./') # select fastText(ar)

In [None]:
#for embedding Layer
from gensim.models import KeyedVectors
def load_w2v(filepath,binary):
    """Read a word2vec-format embedding file and return the loaded vectors.

    filepath : location of the embedding file.
    binary : passed through as the `binary` flag of the loader.
    """
    loaded = KeyedVectors.load_word2vec_format(filepath, binary=binary)
    return loaded


In [None]:
#for fast text embedding 300  dim
w2v = load_w2v("./cc.ar.300.vec.gz", binary=False) # takes ~10 mins to load
print(len(w2v.key_to_index))#2000000

In [None]:
def plot_history(history):
    """Plot training vs. validation accuracy (figure 1) and loss (figure 2).

    `history.history` must map 'accuracy', 'val_accuracy', 'loss' and
    'val_loss' to one value per epoch.
    """
    logs = history.history
    # Read all four series up front so a missing key fails before any drawing.
    train_acc, val_acc = logs['accuracy'], logs['val_accuracy']
    train_loss, val_loss = logs['loss'], logs['val_loss']

    epochs = range(1, 1 + len(train_acc))

    # (figure number, training series, validation series,
    #  training label, validation label, title, y-axis label)
    curves = (
        (1, train_acc, val_acc, 'Training accuracy', 'Validation accuracy',
         'Training and validation accuracy', 'Accuracy'),
        (2, train_loss, val_loss, 'Training loss', 'Validation loss',
         'Training and validation loss', 'Loss'),
    )
    for fig_id, train_vals, val_vals, train_label, val_label, title, y_label in curves:
        plt.figure(fig_id)
        plt.plot(epochs, train_vals, 'b', label=train_label)
        plt.plot(epochs, val_vals, 'g', label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
    plt.show()

In [None]:
#exploratory analysis on the data
# Longest / shortest cleaned training tweet (in tokens) and per-token counts.
maxlen = 0
minlen=10000000
word_freqs = collections.Counter()
num_recs = 0

numberOfRow=int(len(df_AMITraining.index))
for ind in range(numberOfRow):
    sentence = df_AMITraining['processed_Tweet'][ind]
    words = nltk.word_tokenize(sentence)

    token_count = len(words)
    maxlen = max(maxlen, token_count)
    minlen = min(minlen, token_count)
    # Counter.update adds one occurrence per token in the list.
    word_freqs.update(words)
    num_recs += 1

print("maxlen", maxlen)
print("minlen", minlen)
print("len(word_freqs)",len(word_freqs))
print()

In [None]:
# data set for DNN model
X_train_DNN= df_AMITraining['processed_Tweet'].values # Manual or Automatic  Sentiment
y_train_DNN = df_AMITraining['label'].values

X_test_DNN = df_AMITesting['processed_Tweet'].values
y_test_DNN= df_AMITesting['label'].values

# Refers to X_test_DNN (this section's test set); it previously read X_test,
# which is left over from the BERT section.
len(X_train_DNN), len(X_test_DNN)
print('Training: ', X_train_DNN.shape)
print('Testing: ', X_test_DNN.shape)


# Tokenize and preprocess the data
num_words = 20000

# The tokenizer is fitted on the training tweets only; words it has not seen
# are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_DNN)
word_index = tokenizer.word_index
vocab_size = len(word_index)+2 # pad  + OOV
max_len = 50
# Every tweet becomes a sequence of exactly max_len word indices.
X_train_DNN= pad_sequences(tokenizer.texts_to_sequences(X_train_DNN), maxlen=max_len)
X_test_DNN= pad_sequences(tokenizer.texts_to_sequences(X_test_DNN), maxlen=max_len)

# Hold out 10% of the training tweets as a validation set (fixed seed).
X_train_DNN, X_valid_DNN, y_train_DNN, y_valid_DNN = train_test_split(X_train_DNN, y_train_DNN, test_size=0.1, random_state=42)

print("Training:", len(X_train_DNN))
print("validation: ", len(X_valid_DNN))

In [None]:
MAX_NB_WORDS = len(w2v.key_to_index)
EMBEDDING_DIM = 300 #  FastText

In [None]:
#Create embedding layer -  ِfastText embedding 300  dim
def create_embedding_matrix(word_index):
    """Build the initial weight matrix for the embedding layer.

    Parameters
    ----------
    word_index : dict mapping each word to its integer index.

    Returns
    -------
    Array of shape (min(MAX_NB_WORDS, len(word_index)) + 2, EMBEDDING_DIM);
    row i holds the `w2v` vector of the word with index i, and rows whose word
    is not in `w2v` stay all-zero.
    """
    nb_words = min(MAX_NB_WORDS, len(word_index))+2
    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, ii in word_index.items():
        if ii >= nb_words:
            # Index beyond the matrix (only possible when word_index is larger
            # than MAX_NB_WORDS): skip it instead of raising an IndexError.
            continue
        if word in w2v.key_to_index:
            # get_vector replaces the deprecated word_vec in gensim 4, the API
            # the rest of the notebook already uses (key_to_index, index_to_key).
            embedding_matrix[ii] = w2v.get_vector(word)
    return embedding_matrix

embedding_matrix = create_embedding_matrix(word_index)
print(embedding_matrix.shape)
print(embedding_matrix)

#### LSTM

In [None]:
# Define the LSTM model
#LSTM

EMBEDDING_SIZE = 300
HIDDEN_LAYER_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 5
clear_session()
model = Sequential()
#model.add(Embedding(num_words, EMBEDDING_SIZE,input_length=max_len))
model.add(layers.Embedding(vocab_size, EMBEDDING_SIZE,
                          weights=[embedding_matrix],
                          input_length=max_len, trainable=True))
model.add(layers.Dropout(0.2))
model.add(LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()


In [None]:
history = model.fit(X_train_DNN, y_train_DNN,
                    epochs=NUM_EPOCHS,
                    verbose=True,
                    validation_data=(X_valid_DNN, y_valid_DNN),
                    batch_size=BATCH_SIZE)

loss, accuracy = model.evaluate(X_train_DNN, y_train_DNN, verbose=True ,batch_size=BATCH_SIZE)
print("Training Accuracy: {:.4f}".format(accuracy))

loss_val, accuracy_val= model.evaluate(X_valid_DNN, y_valid_DNN, verbose=True,batch_size=BATCH_SIZE)
print("validation  Accuracy:  {:.4f}".format(accuracy_val))

In [None]:
plot_history(history)

In [None]:
score, acc = model.evaluate(X_test_DNN, y_test_DNN , batch_size=BATCH_SIZE)
print("Test score: %.3f, accuracy: %.3f" % (score, acc))
# Predict the whole test set with one batched call instead of one predict()
# call per tweet.
test_probs = model.predict(X_test_DNN, batch_size=BATCH_SIZE)
# The single sigmoid output is the probability of class 1; anything above 0.5
# is labelled 1, everything else 0 (plain Python ints for the confusion matrix).
y_pred_LSTM = (test_probs.ravel() > 0.5).astype(int).tolist()

In [None]:
#print(y_pred_LSTM)
#print( y_test_DNN)
confusion_matrix_scorer( y_test_DNN,y_pred_LSTM)

#### Using bi-LSTM with fastText

In [None]:
#create Bidirectional with fast text embedding

EMBEDDING_SIZE = 300
HIDDEN_LAYER_SIZE = 32
BATCH_SIZE = 32
NUM_EPOCHS = 5


clear_session()
model = Sequential()
model.add(layers.Embedding(vocab_size, EMBEDDING_SIZE,
                          weights=[embedding_matrix],
                          input_length=max_len,
                          trainable=True))
model.add(layers.Bidirectional(layers.LSTM(HIDDEN_LAYER_SIZE, dropout=0.2,
                                           recurrent_dropout=0.2,
                                           return_sequences=True)))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
history = model.fit(X_train_DNN, y_train_DNN,
                    epochs=NUM_EPOCHS,
                    verbose=True,
                    validation_data=(X_valid_DNN, y_valid_DNN),
                    batch_size=BATCH_SIZE)

loss, accuracy = model.evaluate(X_train_DNN, y_train_DNN, verbose=True ,batch_size=BATCH_SIZE)
print("Training Accuracy: {:.4f}".format(accuracy))

loss_val, accuracy_val= model.evaluate(X_valid_DNN, y_valid_DNN, verbose=True,batch_size=BATCH_SIZE)
print("validation  Accuracy:  {:.4f}".format(accuracy_val))

In [None]:
score, acc = model.evaluate(X_test_DNN, y_test_DNN , batch_size=BATCH_SIZE)
print("Test score: %.3f, accuracy: %.3f" % (score, acc))
# Predict the whole test set with one batched call instead of one predict()
# call per tweet.
test_probs = model.predict(X_test_DNN, batch_size=BATCH_SIZE)
# The single sigmoid output is the probability of class 1; anything above 0.5
# is labelled 1, everything else 0 (plain Python ints for the confusion matrix).
y_pred_BiLSTM = (test_probs.ravel() > 0.5).astype(int).tolist()

In [None]:
#print(y_pred_BiLSTM)
#print( y_test_DNN)
confusion_matrix_scorer( y_test_DNN,y_pred_BiLSTM)

#### Using CNN with fastText

In [None]:
# set parameters:
BATCH_SIZE= 32
embedding_dims = 300 # 50 without fastText
filters = 250
kernel_size = 3
hidden_dims = 250
NUM_EPOCHS = 5

clear_session()
print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
'''model.add(Embedding(vocab_size,embedding_dims, input_length=MAX_SENTENCE_LENGTH))'''

model.add(layers.Embedding(vocab_size, embedding_dims,
                          weights=[embedding_matrix],
                          input_length= max_len,
                          trainable=True))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
history = model.fit(X_train_DNN, y_train_DNN,
                    epochs=NUM_EPOCHS,
                    verbose=True,
                    validation_data=(X_valid_DNN, y_valid_DNN),
                    batch_size=BATCH_SIZE)

loss, accuracy = model.evaluate(X_train_DNN, y_train_DNN, verbose=True ,batch_size=BATCH_SIZE)
print("Training Accuracy: {:.4f}".format(accuracy))

loss_val, accuracy_val= model.evaluate(X_valid_DNN, y_valid_DNN, verbose=True,batch_size=BATCH_SIZE)
print("validation  Accuracy:  {:.4f}".format(accuracy_val))

In [None]:
score, acc = model.evaluate(X_test_DNN, y_test_DNN , batch_size=BATCH_SIZE)
print("Test score: %.3f, accuracy: %.3f" % (score, acc))
# Predict the whole test set with one batched call instead of one predict()
# call per tweet.
test_probs = model.predict(X_test_DNN, batch_size=BATCH_SIZE)
# The single sigmoid output is the probability of class 1; anything above 0.5
# is labelled 1, everything else 0 (plain Python ints for the confusion matrix).
y_pred_CNN = (test_probs.ravel() > 0.5).astype(int).tolist()

In [None]:
#print( y_pred_CNN) # predected class
#print( y_test_DNN) # actual label
confusion_matrix_scorer( y_test_DNN, y_pred_CNN)