<a href="https://colab.research.google.com/github/AhmedAbdou07/Coursera-Capstone/blob/master/Thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade pip
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install gensim 
!pip install tabulate
!pip install pprintpp

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import numpy as np
from scipy.special import softmax

In [None]:
# read tweets
df=pd.read_csv('/content/stc_care_tweets.csv')

In [None]:
pd.set_option('display.max_colwidth', None)
df.head(10)

In [None]:
# check data no of columns and rows
df.shape

In [None]:
# check data types
df.dtypes

In [None]:
# check last 5 tweets
df.tail()

In [None]:
pd.set_option('display.max_colwidth', None)
df.head()

In [None]:
#remove tagged accounts
import re
def removetags(x):
    """Replace tagged accounts (@mentions) in ``x`` with a single space.

    A mention is matched as one or more ``@`` characters followed by at
    least two word characters; each match becomes ``' '``.
    """
    mention_pattern = r'\@+[\w]+[\w]+'
    without_mentions = re.sub(mention_pattern, ' ', x)
    return without_mentions

df['processed_text']=df['Text'].apply(lambda x : removetags(x))

In [None]:
#remove http links
import re
def removelinks(x):
    """Delete every link from ``x``.

    A link is the literal ``http`` plus all following non-whitespace
    characters; it is removed without leaving a replacement character.
    """
    link_pattern = r'http\S+'
    without_links = re.sub(link_pattern, '', x)
    return without_links

df['processed_text']=df['processed_text'].apply(lambda x : removelinks(x))

In [None]:
# check if we have English words 
def checkNonArabicChar(x):
    """Flag whether ``x`` contains any ASCII (English) letters.

    Parameters
    ----------
    x : str
        Text to inspect.

    Returns
    -------
    int
        1 if at least one character in ``a-z``/``A-Z`` occurs anywhere in
        ``x``, otherwise 0.
    """
    # re.search scans the whole string; re.match only tests the very start,
    # which missed English words appearing after Arabic text or whitespace.
    return 1 if re.search(r'[a-zA-Z]', x) else 0
  
df['English_content']=df['processed_text'].apply(lambda x : checkNonArabicChar(x))


In [None]:
df['English_content'].value_counts()

In [None]:
import nltk
nltk.download("stopwords")
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def removestopwords(x):
  """Drop NLTK Arabic stop words from ``x``.

  The text is tokenized with ``nltk.word_tokenize`` and every token that is
  not in ``stopwords.words('arabic')`` is kept.

  Returns
  -------
  str
      The kept tokens, each prefixed with a single space (so a non-empty
      result starts with ``' '``); ``''`` when no token survives.
  """
  # A set gives constant-time membership tests instead of scanning the list
  # once per token.
  sw = set(stopwords.words('arabic'))
  tokens = nltk.word_tokenize(x)
  # Prefix (rather than separate) with spaces to keep the original output format.
  return ''.join(' ' + tok for tok in tokens if tok not in sw)

df['processed_text']=df['processed_text'].apply(lambda x : removestopwords(x))

In [None]:
df['tweet_length']=df['processed_text'].apply(lambda x : len(x))

In [None]:
df.head()

In [None]:

import matplotlib.pyplot as plt
plt.figure(figsize=(36,12))
df['tweet_length'].value_counts().plot(kind = 'bar', ylabel = 'frequency')
plt.show()

In [None]:
plt.figure(figsize=(36,12))
df['tweet_length'].value_counts().hist()
plt.show()

In [None]:
df['tweet_length'].describe()

In [None]:
# tokenize tweets
# NOTE(review): `preprocess` is only defined in a later cell, so this line
# raises NameError on a fresh kernel (Restart & Run All).
# NOTE(review): `preprocess` loops over its argument, so passing one tweet
# string makes it iterate character by character and return a list of
# characters rather than a cleaned string.
# NOTE(review): this reads from 'Text' and therefore overwrites the
# 'processed_text' column built by the cleaning cells above — confirm whether
# this cell is still needed.
df['processed_text']=df['Text'].apply(lambda x : preprocess(x))

In [None]:
df.head()

In [None]:
df.tail()

Data Annotation  xlm-Roberta

In [None]:
# initialize our XLM-RoBERTa model
CUDA = True # set to true if using GPU (Runtime -> Change runtime Type -> GPU)
BATCH_SIZE = 32
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
config = AutoConfig.from_pretrained(MODEL) # used for id to label name
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
if CUDA:
  model = model.to('cuda')
_ = model.eval()

In [None]:
# function to mask user mentions and links in each tweet (placeholders expected by the model)
def preprocess(corpus):
  """Mask mentions and links in every text of ``corpus``.

  Each text is split on single spaces. A token starting with ``@`` (and
  longer than one character) becomes ``'@user'``; a token starting with
  ``http`` becomes ``'http'``. Tokens are re-joined with single spaces.

  Returns a new list with one masked string per input text.
  """
  def _mask(token):
    # Same order as before: mention check first, then link check.
    if token.startswith('@') and len(token) > 1:
      token = '@user'
    if token.startswith('http'):
      token = 'http'
    return token

  return [" ".join(_mask(piece) for piece in text.split(" ")) for text in corpus]

In [None]:
# function to tokenize tweets and run the XLM-RoBERTa model on them
def forward(text, cuda=True):
  """Score a batch of tweets with the sentiment model.

  Parameters
  ----------
  text : iterable of str
      Raw tweets; they are passed through ``preprocess`` before tokenization.
  cuda : bool, default True
      When True the encoded batch is moved to ``'cuda'`` (the model is
      expected to already live there).

  Returns
  -------
  numpy.ndarray
      Softmax of the model's first output over the last axis
      (presumably one row of class probabilities per tweet — TODO confirm).
  """
  # Local import: the notebook's import cell only pulls in torch.utils.data.
  import torch

  text = preprocess(text)
  encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
  if cuda:
    encoded_input = encoded_input.to('cuda')
  # Inference only: skip autograd bookkeeping so activations are not kept
  # alive for a backward pass that never happens.
  with torch.no_grad():
    output = model(**encoded_input)
  # .cpu() is a no-op for tensors that are already on the CPU.
  scores = output[0].detach().cpu().numpy()
  return softmax(scores, axis=-1)

In [None]:
# create dataset from tweets only
dataset=df['processed_text']

In [None]:
# run the model and predict sentiment for each tweet
dl = DataLoader(dataset, batch_size=BATCH_SIZE)
all_preds = []
for idx,batch in enumerate(dl):
  print('Batch ',idx+1,' of ',len(dl))
  text = preprocess(batch)
  scores = forward(text, cuda=CUDA)
  preds = np.argmax(scores, axis=-1)
  all_preds.extend(preds)

In [None]:
# sample of tweets along with their annotated sentiment
for example in [0,1, 2, 3, 4, 22, 870,1740,2610,3480,4350,5220,6090, 10000]:
  pred = all_preds[example]
  print(dataset[example], '--->', config.id2label[pred])

In [None]:
# create dataset include original tweets and sentiment
# Size the frame from the predictions actually produced instead of a
# hard-coded 10001, so the cell works for any number of tweets.
n_predicted = len(all_preds)
# Positional access keeps tweets aligned with all_preds even if the Series
# index is not 0..n-1.
tweets = dataset.iloc[:n_predicted].tolist()
sentiment = [config.id2label[pred] for pred in all_preds]
tweets_dict={'tweets':tweets, 'sentiment':sentiment}
tweets_df=pd.DataFrame(tweets_dict)
tweets_df.head()

In [None]:
tweets_df['sentiment'].value_counts()

In [None]:
tweets=pd.concat([df, tweets_df], axis=1)
tweets.head()

In [None]:
tweets.tail()

Reprocessing Part 2

In [None]:
# remove unnecessary spaces
def removeUnnecessarySpaces(text):
    """Collapse each run of newlines, tabs and spaces in ``text`` to one space."""
    whitespace_run = r'[\n\t\ ]+'
    collapsed = re.sub(whitespace_run, ' ', text)
    return collapsed

tweets['processed_text']=tweets['processed_text'].apply(lambda x : removeUnnecessarySpaces(x))

In [None]:
# remove emojis

def remove_emoji(string):
    """Strip emoji and related symbol characters from ``string``.

    Every run of characters falling in the code-point ranges listed below is
    deleted (replaced by the empty string). Note that some ranges are very
    broad (e.g. U+24C2..U+1F251 and U+10000..U+10FFFF) and overlap the
    narrower ones.
    """
    pattern_parts = (
        "[",
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # (listed twice in the original pattern)
        "\U000024C2-\U0001F251",  # very broad range, U+24C2 up to U+1F251
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # everything outside the Basic Multilingual Plane
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
        "]+",
    )
    emoji_pattern = re.compile("".join(pattern_parts), flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
tweets['processed_text']=tweets['processed_text'].apply(lambda x : remove_emoji(x))

In [None]:
tweets[tweets['tweet_length']==2]

In [None]:
# stemming
from nltk.stem.isri import ISRIStemmer
st = ISRIStemmer()

# ISRIStemmer.suf32 strips a 3- or 2-letter suffix from the end of the single
# word it is given. Calling it on a whole tweet only touched the tail of the
# string, so apply it to every whitespace-separated word instead.
tweets['stemmed_text']=tweets['processed_text'].apply(lambda x : ' '.join(st.suf32(word) for word in x.split()))

In [None]:
# normalization
!pip install PyArabic
import pyarabic.araby as araby
def normalizeArabic(text):
    """Normalize Arabic orthography in ``text``.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. unify letter variants: alef forms -> bare alef, alef maqsura -> yeh,
         waw/yeh with hamza -> hamza, teh marbuta -> heh;
      3. delete shadda, short vowels, tanwin, sukun and tatweel (kashida);
      4. squeeze any character repeated three or more times down to two;
      5. pass the result through ``araby.strip_tashkeel``.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The normalized text.
    """
    text = text.strip()
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("[ؤئ]", "ء", text)
    text = re.sub("ة", "ه", text)
    # Diacritics are written as escapes so the pattern does not depend on
    # invisible combining characters surviving copy/paste or editor handling:
    # shadda, fatha, fathatan, damma, dammatan, kasra, kasratan, sukun, tatweel.
    noise = "[\u0651\u064E\u064B\u064F\u064C\u0650\u064D\u0652\u0640]"
    text = re.sub(noise, '', text)
    text = re.sub(r'(.)\1+', r"\1\1", text) # Remove elongation (3+ repeats -> 2)
    return araby.strip_tashkeel(text)
  
tweets['normalized_text']=tweets['stemmed_text'].apply(lambda x : normalizeArabic(x))

In [None]:
# remove non arabic characters
def removeNonArabicChar(text):
    """Replace each run of disallowed characters in ``text`` with one space.

    Allowed characters are ASCII digits, ``.``, and the Arabic code-point
    ranges listed in the pattern; every maximal run of anything else becomes
    a single ``' '``.
    """
    disallowed_run = r'[^0-9\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD.0-9]+'
    arabic_only = re.sub(disallowed_run, ' ', text)
    return arabic_only

tweets['Arabic_processed_tweets']=tweets['normalized_text'].apply(lambda x : removeNonArabicChar(x))

# remove unnecessary spaces
def removeUnnecessarySpaces(text):
    """Collapse each run of newlines, tabs and spaces in ``text`` to one space."""
    whitespace_run = r'[\n\t\ ]+'
    collapsed = re.sub(whitespace_run, ' ', text)
    return collapsed

tweets['Arabic_processed_tweets']=tweets['Arabic_processed_tweets'].apply(lambda x : removeUnnecessarySpaces(x))

In [None]:
# remove extra white spaces
# Collapse to a single space (replacing with '' would glue neighbouring words
# together); raw string avoids the invalid '\s' escape warning.
tweets['Arabic_processed_tweets']=tweets['Arabic_processed_tweets'].apply(lambda x : re.sub(r'\s\s+', ' ', x))

In [None]:
 ## Remove punctuations
tweets['Arabic_processed_tweets']=tweets['Arabic_processed_tweets'].apply(lambda x :re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', x))

In [None]:
tweets['tweet_length']=tweets['Arabic_processed_tweets'].apply(lambda x : len(x))

In [None]:
 ## Remove cleaned arabic tweets with one or two letters
tweets=tweets[tweets['tweet_length']>2]

In [None]:
tweets.iloc[10:20]

In [None]:
plt.figure(figsize=(36,12))
tweets['tweet_length'].value_counts().hist(bins=10)
plt.show()

In [None]:
tweets.to_csv('Arabic_cleaned_tweets.csv')

**word cloud**

In [None]:
tweets=pd.read_csv('/content/Arabic_cleaned_tweets.csv')
tweets.head()

In [None]:
# install libraries for Arabic word cloud
!pip install python-bidi
!pip install arabic-reshaper
!pip install ar_wordcloud

In [None]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from wordcloud import WordCloud, STOPWORDS
from bidi.algorithm import get_display
import arabic_reshaper

In [None]:
#tweets array and prepare list for arabic cloud
documents = tweets['Arabic_processed_tweets'].values
# Join tweets with a space: joining with ',' and then deleting the commas
# fused the last word of each tweet with the first word of the next one.
# Non-string entries (e.g. NaN read back from empty CSV cells) are skipped.
long_string = ' '.join(doc for doc in documents if isinstance(doc, str))

In [None]:
import nltk
nltk.download("stopwords")
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# arabic cloud
from ar_wordcloud import ArabicWordCloud
awc = ArabicWordCloud(background_color="white",  width=2000, height=1000)
wc = awc.from_text(long_string.replace(',', ''))
plt.figure(figsize=(15,8))
plt.title("wordcloud")
plt.imshow(wc)
plt.axis("off")
plt.show()

In [None]:
# extend stop words

def removestopwords(x):
  """Drop Arabic stop words (NLTK list plus project-specific extras) from ``x``.

  The text is tokenized with ``nltk.word_tokenize`` and every token found in
  the extended stop-word collection is removed.

  Returns
  -------
  str
      The kept tokens, each prefixed with a single space (so a non-empty
      result starts with ``' '``); ``''`` when no token survives.
  """
  # A set gives constant-time membership tests and makes the duplicated
  # extra entry harmless.
  sw = set(stopwords.words('arabic'))
  sw.update(['السلام','عليكم','الي','اللي','اذا','انا','الله','وانا','والله','او', 'ان', 'وانا'])
  tokens = nltk.word_tokenize(x)
  # Prefix (rather than separate) with spaces to keep the original output format.
  return ''.join(' ' + tok for tok in tokens if tok not in sw)

tweets['Arabic_processed_tweets']=tweets['Arabic_processed_tweets'].apply(lambda x : removestopwords(x))

In [None]:
#tweets array and prepare list for arabic cloud
documents = tweets['Arabic_processed_tweets'].values
# Join tweets with a space: joining with ',' and then deleting the commas
# fused the last word of each tweet with the first word of the next one.
# Non-string entries (e.g. NaN read back from empty CSV cells) are skipped.
long_string = ' '.join(doc for doc in documents if isinstance(doc, str))

# arabic cloud version 2
from ar_wordcloud import ArabicWordCloud
awc = ArabicWordCloud(background_color="white",  width=2000, height=1000)
wc = awc.from_text(long_string.replace(',', ''))
plt.figure(figsize=(15,8))
plt.title("wordcloud")
plt.imshow(wc)
plt.axis("off")
plt.show()

**LDA**

In [None]:
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.models import LdaMulticore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [None]:
texts = [[word for word in str(document).split()] for document in documents]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

**LDA hyperparameters tuning Alpha and Beta and no of topics**

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, alpha=0.01, eta=0.01):
    """
    Compute c_v coherence over a grid of topic counts, alpha and eta (beta).

    For every number of topics in ``range(start, limit, step)`` and every
    (alpha, eta) pair drawn from ``np.arange(0.01, 1, 0.3)``, an
    ``LdaMulticore`` model is trained and scored with ``CoherenceModel``.

    Parameters:
    ----------
    dictionary : Gensim dictionary (used both for training and for scoring)
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between topic counts
    alpha, eta : Accepted for backward compatibility only; they are not used,
        the function always sweeps the fixed grid described above.

    Returns:
    -------
    model_list : List of LDA topic models, one per grid point
    coherence_values : Coherence value of each model
    alpha_values : alpha used for each model
    beta_values : eta (beta) used for each model
    topic_counts : Number of topics used for each model
    """
    coherence_values = []
    model_list = []
    # Separate names so the result lists do not shadow the alpha/eta parameters.
    alpha_values = []
    beta_values = []
    topic_counts = []
    prior_grid = list(np.arange(0.01, 1, 0.3))
    for num_topics in range(start, limit, step):
        for alpha_value in prior_grid:
            for eta_value in prior_grid:
                # Train with the dictionary passed in, not a global id2word.
                model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                                     alpha=alpha_value, eta=eta_value)
                model_list.append(model)
                coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
                coherence_values.append(coherencemodel.get_coherence())
                alpha_values.append(alpha_value)
                beta_values.append(eta_value)
                topic_counts.append(num_topics)

    return model_list, coherence_values, alpha_values, beta_values, topic_counts

In [None]:
# Can take a long time to run.
model_list, coherence_values, alpha, beta, topics = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, start=2, limit=10, step=1)

In [None]:
# Print the coherence scores
for m, cv, a, b in zip(topics, coherence_values, alpha, beta):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4), " has alpha=", a, " and beta=", b)

**Check best no of topics with default hyper parameters**

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics (default LDA priors).

    One ``LdaMulticore`` model is trained for each number of topics in
    ``range(start, limit, step)`` and scored with ``CoherenceModel``.

    Parameters:
    ----------
    dictionary : Gensim dictionary (used both for training and for scoring)
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between topic counts

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []

    for num_topics in range(start, limit, step):
        # Train with the dictionary passed in, not a global id2word.
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, start=2, limit=10, step=1)

In [None]:
# Show graph
limit=10; start=2; step=1;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The trailing comma makes this a 1-tuple; a bare string in parentheses is
# still a string and would be split into one-character labels.
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[5]
model_topics = optimal_model.show_topics(formatted=False)
print(optimal_model.print_topics(num_words=10))

In [None]:
def format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=texts):
    """Build a table with the dominant topic of every document.

    Parameters
    ----------
    ldamodel : trained LDA model; ``ldamodel[corpus]`` must yield, per
        document, a list of ``(topic_id, probability)`` pairs.
    corpus : bag-of-words corpus the model is applied to.
    texts : one entry per document, appended as the last column.

    Note: the default values are bound to the global ``optimal_model``,
    ``corpus`` and ``texts`` as they exist when this cell is executed.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4
        decimals), ``Topic_Keywords`` and an unnamed column (label ``0``)
        holding ``texts``.
    """
    # Collect plain rows and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called in a loop.
    rows = []
    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Keep a placeholder so rows stay aligned with `texts`.
            rows.append((None, None, ''))
            continue
        # max() returns the first highest-probability pair, matching a stable
        # descending sort followed by taking element 0.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        rows.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=tweets['Arabic_processed_tweets'].values.tolist())

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
pd.set_option('display.max_colwidth', None)
df_dominant_topic.head(10)

In [None]:
df_dominant_topic['Dominant_Topic'].value_counts()

In [None]:
df_dominant_topic[df_dominant_topic['Dominant_Topic']==6].head()

In [None]:
df_dominant_topic[df_dominant_topic['Dominant_Topic']==5].head()

In [None]:
df_dominant_topic[df_dominant_topic['Dominant_Topic']==4].head()

In [None]:
df_dominant_topic[df_dominant_topic['Dominant_Topic']==3].head()

In [None]:
df_dominant_topic[df_dominant_topic['Dominant_Topic']==2].head()

In [None]:
df_dominant_topic[df_dominant_topic['Dominant_Topic']==1].head()

In [None]:
df_dominant_topic[df_dominant_topic['Dominant_Topic']==0].head()

**Features Extraction using BERT**


In [None]:
tweets.head()

In [None]:
tweets.shape

In [None]:
tweets['sentiment'].value_counts()

In [None]:
# read tweets
tweets=pd.read_csv('/content/df_model_input.csv')

In [None]:
tweets.head()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

In [None]:
df=tweets
df.head()

In [None]:
df['sentiment'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [None]:
df_train.shape, df_val.shape

Now we have to tokenize the text. Enable padding so that every tokenized sentence has the same length, and enable truncation so that a sentence that is too long is cut off. The last argument makes the tokenizer return PyTorch tensors.

The result of tokenizing a text will be a dictionary that contains the input_ids , that are the tokens expressed in numbers, and the attention_mask that tells us if the token is or is not a [PAD].

In [None]:

tokenized_train = tokenizer(df_train["Arabic_processed_tweets"].values.tolist(), padding = True, truncation = True, return_tensors="pt")
tokenized_val = tokenizer(df_val["Arabic_processed_tweets"].values.tolist() , padding = True, truncation = True,  return_tensors="pt")

print(tokenized_train.keys())

#move on device (GPU)
tokenized_train = {k:torch.tensor(v).to(device) for k,v in tokenized_train.items()}
tokenized_val = {k:torch.tensor(v).to(device) for k,v in tokenized_val.items()}

Get the texts ([CLS]) hidden states by running the model.

In [None]:
def _cls_hidden_states(encoder, encoded, batch_size=64):
    """Return the first-token ([CLS]) hidden state for every row of ``encoded``.

    ``encoded`` is a dict of tensors (as produced by the tokenizer cell above);
    rows are pushed through ``encoder`` ``batch_size`` at a time so the whole
    dataset is never held in a single forward pass.
    """
    n_rows = encoded['input_ids'].shape[0]
    chunks = []
    with torch.no_grad():
        for lo in range(0, n_rows, batch_size):
            batch = {k: v[lo:lo + batch_size] for k, v in encoded.items()}
            # last_hidden_state dim: [batch_size, tokens, emb_dim]; keep token 0
            chunks.append(encoder(**batch).last_hidden_state[:, 0, :])
    return torch.cat(chunks, dim=0)

#get only the [CLS] hidden states
cls_train = _cls_hidden_states(model, tokenized_train)
cls_val = _cls_hidden_states(model, tokenized_val)

In [None]:
x_train = cls_train.to("cpu")
y_train = df_train["sentiment"]

x_val = cls_val.to("cpu")
y_val = df_val["sentiment"]

print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

In [None]:
df.to_csv('df_model_input.csv')

In [None]:
df=pd.read_csv('/content/df_model_input.csv')
df=df[['Arabic_processed_tweets','sentiment']]
df.head()

In [None]:
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)

In [None]:
from sklearn import svm
# Chooses the support vector machine algorithm for our classifier
clf = svm.SVC(kernel = "linear")

In [None]:
clf_trained=clf.fit(x_train, y_train)
# Scoring the classifier
clf_trained.score(x_train, y_train)

In [None]:
#import the necessary functions
from sklearn import metrics
#extract the predictions of the model
test_pred_svm = clf_trained.predict(x_val)
#print the classification report
print (metrics.classification_report(y_val, test_pred_svm))

**Using MARBERT Features Extraction with SVM**

In [None]:
from transformers import pipeline
model = pipeline('text-classification', model='Ammar-alhaj-ali/arabic-MARBERT-sentiment')
sentences = ['لقد استمتعت بالحفلة', 'خدمة المطعم كانت محبطة']
model(sentences)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained("Ammar-alhaj-ali/arabic-MARBERT-sentiment")
model = AutoModel.from_pretrained("Ammar-alhaj-ali/arabic-MARBERT-sentiment").to(device)

In [None]:
df['sentiment'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [None]:
tokenized_train_1 = tokenizer(df_train["Arabic_processed_tweets"].values.tolist(), padding = True, truncation = True, return_tensors="pt")
tokenized_val_1 = tokenizer(df_val["Arabic_processed_tweets"].values.tolist() , padding = True, truncation = True,  return_tensors="pt")

print(tokenized_train_1.keys())

#move on device (GPU)
tokenized_train_1 = {k:torch.tensor(v).to(device) for k,v in tokenized_train_1.items()}
tokenized_val_1= {k:torch.tensor(v).to(device) for k,v in tokenized_val_1.items()}

In [None]:
def _cls_hidden_states(encoder, encoded, batch_size=64):
    """Return the first-token ([CLS]) hidden state for every row of ``encoded``.

    ``encoded`` is a dict of tensors (as produced by the tokenizer cell above);
    rows are pushed through ``encoder`` ``batch_size`` at a time so the whole
    dataset is never held in a single forward pass.
    """
    n_rows = encoded['input_ids'].shape[0]
    chunks = []
    with torch.no_grad():
        for lo in range(0, n_rows, batch_size):
            batch = {k: v[lo:lo + batch_size] for k, v in encoded.items()}
            # last_hidden_state dim: [batch_size, tokens, emb_dim]; keep token 0
            chunks.append(encoder(**batch).last_hidden_state[:, 0, :])
    return torch.cat(chunks, dim=0)

#get only the [CLS] hidden states
cls_train_1 = _cls_hidden_states(model, tokenized_train_1)
cls_val_1 = _cls_hidden_states(model, tokenized_val_1)

In [None]:
x_train_1 = cls_train_1.to("cpu")
y_train_1 = df_train["sentiment"]

x_val_1 = cls_val_1.to("cpu")
y_val_1 = df_val["sentiment"]

print(x_train_1.shape, y_train_1.shape, x_val_1.shape, y_val_1.shape)

In [None]:
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)

In [None]:
from sklearn import svm
# Chooses the support vector machine algorithm for our classifier
clf = svm.SVC(kernel = "linear")

In [None]:
clf_trained=clf.fit(x_train_1, y_train_1)
# Scoring the classifier
clf_trained.score(x_train_1, y_train_1)

In [None]:
#import the necessary functions
from sklearn import metrics
#extract the predictions of the model
test_pred_svm_1 = clf_trained.predict(x_val_1)
#print the classification report
print (metrics.classification_report(y_val_1, test_pred_svm_1))

**MARBERT**

In [None]:
df=pd.read_csv('/content/df_model_input.csv')
df.head()

In [None]:
df=df[['Arabic_processed_tweets', 'sentiment']]
df.head()

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [None]:
text = df.Arabic_processed_tweets.values
labels = df.sentiment.values

In [None]:
# initialize Bert tokenizer
tokenizer = BertTokenizer.from_pretrained('UBC-NLP/MARBERT')

In [None]:
def print_rand_sentence():
  '''Displays the tokens and respective IDs of a random text sample.

  Picks a random entry of the global ``text``, tokenizes it with the global
  ``tokenizer`` and prints a two-column table (token, token ID).
  '''
  index = random.randint(0, len(text)-1)
  # Tokenize once and reuse the tokens for the ID lookup.
  tokens = tokenizer.tokenize(text[index])
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  table = np.array([tokens, token_ids]).T
  # Header order must follow the column order: tokens first, then IDs.
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

In [None]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Encode one text to a fixed length of 32 tokens.

  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).

  Shorter texts are padded to ``max_length`` and longer ones are truncated to
  it, so every encoding has the same length and can be concatenated.
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 32,
                        # padding='max_length' replaces the deprecated pad_to_max_length=True
                        padding = 'max_length',
                        # without truncation, texts longer than max_length keep their
                        # full length and torch.cat over the encodings fails
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [None]:
def print_rand_sentence_encoding():
  '''Displays tokens, token IDs and attention mask of a random text sample.

  Reads the global ``text``, ``token_id``, ``attention_masks`` and
  ``tokenizer`` and prints one table row per encoded position.
  '''
  index = random.randint(0, len(text) - 1)
  token_ids = token_id[index].tolist()
  attention = attention_masks[index].tolist()
  # Map IDs straight back to tokens: a decode -> tokenize round trip is not
  # guaranteed to give one token per ID, which would make the rows ragged.
  tokens = tokenizer.convert_ids_to_tokens(token_ids)

  # One row per position so the three headers line up with the columns.
  table = list(zip(tokens, token_ids, attention))
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'
                 ))

print_rand_sentence_encoding()

In [None]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels
# (fixed seed so the split, and the metrics reported on it, are reproducible)
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    random_state = 42,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [None]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): predictions of class 1 that match the label'''
  return sum(guess == truth and guess == 1 for guess, truth in zip(preds, labels))

def b_fp(preds, labels):
  '''Returns False Positives (FP): predictions of class 1 that do not match the label'''
  return sum(guess != truth and guess == 1 for guess, truth in zip(preds, labels))

def b_tn(preds, labels):
  '''Returns True Negatives (TN): predictions of class 0 that match the label'''
  return sum(guess == truth and guess == 0 for guess, truth in zip(preds, labels))

def b_fn(preds, labels):
  '''Returns False Negatives (FN): predictions of class 0 that do not match the label'''
  return sum(guess != truth and guess == 0 for guess, truth in zip(preds, labels))

def b_metrics(preds, labels):
  '''
  Binary classification metrics for one batch.

  ``preds`` holds per-class scores (arg-maxed over axis 1 to get the predicted
  class); ``labels`` holds the true classes. Returns, in this order:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  A metric whose denominator is zero is returned as the string 'nan'.
  '''
  predicted = np.argmax(preds, axis = 1).flatten()
  actual = labels.flatten()
  pairs = list(zip(predicted, actual))

  # Confusion-matrix counts, computed with the same rules as b_tp/b_tn/b_fp/b_fn.
  tp = sum([p == a and p == 1 for p, a in pairs])
  tn = sum([p == a and p == 0 for p, a in pairs])
  fp = sum([p != a and p == 1 for p, a in pairs])
  fn = sum([p != a and p == 0 for p, a in pairs])

  b_accuracy = (tp + tn) / len(actual)

  if (tp + fp) > 0:
    b_precision = tp / (tp + fp)
  else:
    b_precision = 'nan'

  if (tp + fn) > 0:
    b_recall = tp / (tp + fn)
  else:
    b_recall = 'nan'

  if (tn + fp) > 0:
    b_specificity = tn / (tn + fp)
  else:
    b_specificity = 'nan'

  return b_accuracy, b_precision, b_recall, b_specificity

In [None]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'UBC-NLP/MARBERT',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU when one is available, otherwise stay on CPU
# (an unconditional model.cuda() raises on CPU-only runtimes)
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []
    

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)
        
    # Average each per-batch metric once; None marks a metric that no batch
    # could compute (every batch returned 'nan').
    mean_precision = sum(val_precision)/len(val_precision) if len(val_precision)>0 else None
    mean_recall = sum(val_recall)/len(val_recall) if len(val_recall)>0 else None
    mean_specificity = sum(val_specificity)/len(val_specificity) if len(val_specificity)>0 else None

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(mean_precision) if mean_precision is not None else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(mean_recall) if mean_recall is not None else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(mean_specificity) if mean_specificity is not None else '\t - Validation Specificity: NaN')
    # F1 needs both precision and recall and a non-zero sum; guard it like the
    # lines above instead of dividing by zero.
    if mean_precision is not None and mean_recall is not None and (mean_precision + mean_recall) > 0:
        print('\t - validation F1-Score: {:.4f}\n'.format(2*mean_precision*mean_recall/(mean_precision+mean_recall)))
    else:
        print('\t - validation F1-Score: NaN')

Ammar-alhaj-ali/arabic-MARBERT-sentiment

In [None]:
# initialize Bert tokenizer
tokenizer = BertTokenizer.from_pretrained('Ammar-alhaj-ali/arabic-MARBERT-sentiment')

In [None]:
# Per-sample encodings are accumulated here by the loop that follows this function.
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer, max_length = 32):
  '''
  Encode one text with `tokenizer.encode_plus`, padded and truncated to a fixed length.

  Parameters:
    - input_text: the text to encode.
    - tokenizer: any object exposing `encode_plus` (e.g. a BertTokenizer).
    - max_length: number of tokens every encoding is padded/truncated to (default 32).

  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `padding='max_length'` replaces the deprecated `pad_to_max_length = True`
                        padding = 'max_length',
                        # Truncate explicitly so every encoding has the same length and
                        # the later `torch.cat` over samples cannot hit a shape mismatch
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


# Encode every sample and collect its token ids and attention mask.
# NOTE(review): `text` and `labels` are defined in an earlier cell not visible here.
for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


# Concatenate the per-sample tensors along dim 0 (one row per sample, presumably
# of shape (1, max_length) each since the tokenizer is called with return_tensors='pt').
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
# NOTE(review): this rebinds `labels`; re-running the cell wraps an existing tensor again.
labels = torch.tensor(labels)

In [None]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels.
# A fixed random_state makes the split (and therefore the reported validation
# metrics) reproducible across re-runs and comparable between experiments.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state = 42)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader: training batches are drawn in random order,
# validation batches in a fixed sequential order
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [None]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): number of samples predicted 1 whose label is also 1'''
  return sum([p == 1 and y == p for p, y in zip(preds, labels)])

def b_fp(preds, labels):
  '''Returns False Positives (FP): number of samples predicted 1 whose label differs from the prediction'''
  return sum([p == 1 and y != p for p, y in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TN): number of samples predicted 0 whose label is also 0'''
  return sum([p == 0 and y == p for p, y in zip(preds, labels)])

def b_fn(preds, labels):
  '''Returns False Negatives (FN): number of samples predicted 0 whose label differs from the prediction'''
  return sum([p == 0 and y != p for p, y in zip(preds, labels)])

def b_metrics(preds, labels):
  '''
  Computes binary classification metrics for one batch.

  `preds` holds per-class scores (the predicted class is the argmax over axis 1)
  and `labels` holds the reference class ids.

  Returns the tuple (accuracy, precision, recall, specificity) where:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  A ratio whose denominator is 0 is returned as the string 'nan'.
  '''
  def _ratio(numerator, denominator):
    # Callers filter on the literal string 'nan', so keep that sentinel.
    return numerator / denominator if denominator > 0 else 'nan'

  predicted = np.argmax(preds, axis = 1).flatten()
  actual = labels.flatten()

  # Confusion-matrix counts
  tp = b_tp(predicted, actual)
  tn = b_tn(predicted, actual)
  fp = b_fp(predicted, actual)
  fn = b_fn(predicted, actual)

  return (
    (tp + tn) / len(actual),
    _ratio(tp, tp + fp),
    _ratio(tp, tp + fn),
    _ratio(tn, tn + fp),
  )

In [None]:
# Load the BertForSequenceClassification model with a 2-class head.
# `ignore_mismatched_sizes=True` lets loading proceed when the checkpoint's
# classification head has a different shape than `num_labels = 2`.
model = BertForSequenceClassification.from_pretrained(
    'Ammar-alhaj-ali/arabic-MARBERT-sentiment',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
    ignore_mismatched_sizes=True
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU when one is available, otherwise stay on CPU
# (`model.cuda()` raises on machines without CUDA).
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Keep the model on the same device the batches are moved to below
# (a no-op when it is already there).
model.to(device)

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables (one entry per validation batch)
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []
    

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)
        
    # Averages over the batches that produced a defined value; None when there were none
    avg_precision = sum(val_precision)/len(val_precision) if len(val_precision)>0 else None
    avg_recall = sum(val_recall)/len(val_recall) if len(val_recall)>0 else None
    avg_specificity = sum(val_specificity)/len(val_specificity) if len(val_specificity)>0 else None

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(avg_precision) if avg_precision is not None else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(avg_recall) if avg_recall is not None else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(avg_specificity) if avg_specificity is not None else '\t - Validation Specificity: NaN')
    # F1 is undefined when precision or recall is undefined or both are 0;
    # report NaN instead of raising ZeroDivisionError.
    if avg_precision is not None and avg_recall is not None and (avg_precision + avg_recall) > 0:
        print('\t - validation F1-Score: {:.4f}\n'.format(2*avg_precision*avg_recall/(avg_precision+avg_recall)))
    else:
        print('\t - validation F1-Score: NaN\n')

UBC-NLP/MARBERT: max_length = 64, number of epochs = 4

In [None]:
# Initialize the BERT tokenizer from the base UBC-NLP/MARBERT checkpoint.
# NOTE(review): `BertTokenizer` is not imported in the cells visible here —
# confirm an earlier cell imports it from `transformers`.
tokenizer = BertTokenizer.from_pretrained('UBC-NLP/MARBERT')

# Reset the accumulators for this experiment's encodings
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer, max_length = 64):
  '''
  Encode one text with `tokenizer.encode_plus`, padded and truncated to a fixed length.

  Parameters:
    - input_text: the text to encode.
    - tokenizer: any object exposing `encode_plus` (e.g. a BertTokenizer).
    - max_length: number of tokens every encoding is padded/truncated to (default 64).

  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `padding='max_length'` replaces the deprecated `pad_to_max_length = True`
                        padding = 'max_length',
                        # Truncate explicitly so every encoding has the same length and
                        # the later `torch.cat` over samples cannot hit a shape mismatch
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


# Encode every sample and collect its token ids and attention mask.
# NOTE(review): `text` and `labels` come from earlier cells not visible here.
for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


# Concatenate the per-sample tensors along dim 0 (one row per sample)
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
# NOTE(review): `labels` was already converted with torch.tensor in the previous
# experiment's cell; this wraps the existing tensor again — confirm that is intended.
labels = torch.tensor(labels)

val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels.
# A fixed random_state makes the split (and therefore the reported validation
# metrics) reproducible across re-runs and comparable between experiments.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state = 42)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader: training batches are drawn in random order,
# validation batches in a fixed sequential order
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )


# Load the BertForSequenceClassification model with a 2-class head
model = BertForSequenceClassification.from_pretrained(
    'UBC-NLP/MARBERT',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU when one is available, otherwise stay on CPU
# (`model.cuda()` raises on machines without CUDA).
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Keep the model on the same device the batches are moved to below
# (a no-op when it is already there).
model.to(device)

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables (one entry per validation batch)
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []
    

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)
        
    # Averages over the batches that produced a defined value; None when there were none
    avg_precision = sum(val_precision)/len(val_precision) if len(val_precision)>0 else None
    avg_recall = sum(val_recall)/len(val_recall) if len(val_recall)>0 else None
    avg_specificity = sum(val_specificity)/len(val_specificity) if len(val_specificity)>0 else None

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(avg_precision) if avg_precision is not None else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(avg_recall) if avg_recall is not None else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(avg_specificity) if avg_specificity is not None else '\t - Validation Specificity: NaN')
    # F1 is undefined when precision or recall is undefined or both are 0;
    # report NaN instead of raising ZeroDivisionError.
    if avg_precision is not None and avg_recall is not None and (avg_precision + avg_recall) > 0:
        print('\t - validation F1-Score: {:.4f}\n'.format(2*avg_precision*avg_recall/(avg_precision+avg_recall)))
    else:
        print('\t - validation F1-Score: NaN\n')

Ammar-alhaj-ali/arabic-MARBERT-sentiment: max_length = 128, number of epochs = 4, truncation enabled

In [None]:
# Initialize the BERT tokenizer from the pretrained MARBERT sentiment checkpoint.
# NOTE(review): `BertTokenizer` is not imported in the cells visible here —
# confirm an earlier cell imports it from `transformers`.
tokenizer = BertTokenizer.from_pretrained('Ammar-alhaj-ali/arabic-MARBERT-sentiment')

# Reset the accumulators for this experiment's encodings
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer, max_length = 128):
  '''
  Encode one text with `tokenizer.encode_plus`, padded and truncated to a fixed length.

  Parameters:
    - input_text: the text to encode.
    - tokenizer: any object exposing `encode_plus` (e.g. a BertTokenizer).
    - max_length: number of tokens every encoding is padded/truncated to (default 128).

  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `padding='max_length'` replaces the deprecated `pad_to_max_length = True`
                        padding = 'max_length',
                        return_attention_mask = True,
                        return_tensors = 'pt',
                        truncation=True
                   )


# Encode every sample and collect its token ids and attention mask.
# NOTE(review): `text` and `labels` come from earlier cells not visible here.
for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


# Concatenate the per-sample tensors along dim 0 (one row per sample)
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
# NOTE(review): `labels` was already converted with torch.tensor in earlier
# experiment cells; this wraps the existing tensor again — confirm that is intended.
labels = torch.tensor(labels)

val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels.
# A fixed random_state makes the split (and therefore the reported validation
# metrics) reproducible across re-runs and comparable between experiments.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state = 42)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader: training batches are drawn in random order,
# validation batches in a fixed sequential order
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )


# Load the BertForSequenceClassification model with a 2-class head.
# `ignore_mismatched_sizes=True` lets loading proceed when the checkpoint's
# classification head has a different shape than `num_labels = 2`.
model = BertForSequenceClassification.from_pretrained(
    'Ammar-alhaj-ali/arabic-MARBERT-sentiment',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
    ignore_mismatched_sizes=True
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 2e-5,
                              eps = 1e-08
                              )

# Run on GPU when one is available, otherwise stay on CPU
# (`model.cuda()` raises on machines without CUDA).
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Keep the model on the same device the batches are moved to below
# (a no-op when it is already there).
model.to(device)

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables (one entry per validation batch)
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []
    

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)
        
    # Averages over the batches that produced a defined value; None when there were none
    avg_precision = sum(val_precision)/len(val_precision) if len(val_precision)>0 else None
    avg_recall = sum(val_recall)/len(val_recall) if len(val_recall)>0 else None
    avg_specificity = sum(val_specificity)/len(val_specificity) if len(val_specificity)>0 else None

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(avg_precision) if avg_precision is not None else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(avg_recall) if avg_recall is not None else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(avg_specificity) if avg_specificity is not None else '\t - Validation Specificity: NaN')
    # F1 is undefined when precision or recall is undefined or both are 0;
    # report NaN instead of raising ZeroDivisionError.
    if avg_precision is not None and avg_recall is not None and (avg_precision + avg_recall) > 0:
        print('\t - validation F1-Score: {:.4f}\n'.format(2*avg_precision*avg_recall/(avg_precision+avg_recall)))
    else:
        print('\t - validation F1-Score: NaN\n')

In [None]:
# Keep only the processed tweet text and its sentiment, renamed to the
# `text` / `label` columns the following cells read. `rename` returns a new
# frame, which avoids mutating a column slice in place (SettingWithCopyWarning).
df = (
    df[['Arabic_processed_tweets','sentiment']]
    .rename(columns={"Arabic_processed_tweets": "text", "sentiment": "label"})
)
df.head()

In [None]:
import torch

In [None]:
from transformers import AutoTokenizer, AutoModel
# Pick the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the encoder (AutoModel, i.e. without a task-specific head) and its tokenizer.
# NOTE(review): this rebinds the global `model` used by the fine-tuning cells above.
model = AutoModel.from_pretrained("Ammar-alhaj-ali/arabic-MARBERT-sentiment").to(device)
tokenizer = AutoTokenizer.from_pretrained("Ammar-alhaj-ali/arabic-MARBERT-sentiment")

In [None]:
# Define preprocessing util function
def text_preprocessing(text):
    """
    Clean a raw tweet before it is tokenized.

    - Normalize the string to Unicode NFC form
    - Remove entity mentions (eg. '@united'), including one at the very end of the text
    - Correct errors (eg. '&amp;' to '&')
    - Replace every http(s) URL, wherever it occurs, with the placeholder '<URL>'
    - Collapse runs of whitespace into one space and strip both ends
    @param    text (str): a string to be processed.
    @return   text (str): the processed string.
    """
    # Normalize unicode encoding
    text = unicodedata.normalize('NFC', text)

    # Remove '@name'; `(?:\s|$)` also catches a mention that ends the text
    text = re.sub(r'@\S*(?:\s|$)', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Replace URLs anywhere in the text; `\S+` stops at the next whitespace so
    # the words following a URL are kept
    text = re.sub(r'https?://\S+', '<URL>', text)

    # Collapse repeated whitespace and trim the ends (done last so the
    # substitutions above cannot leave stray spaces behind)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
pip install emoji

In [None]:
# Create a function to tokenize a set of texts
import emoji
import unicodedata
def preprocessing_for_bert(data, version="mini", text_preprocessing_fn = text_preprocessing ):
    """Tokenize a collection of texts for the pretrained BERT model.

    Each text is cleaned with `text_preprocessing_fn`, wrapped in the special
    tokens, and padded/truncated to the global `MAX_LEN`.
    @param    data (np.array): Array of texts to be processed.
    @param    version (str): Accepted for compatibility; every value loads the
                  same tokenizer checkpoint.
    @param    text_preprocessing_fn (callable): Cleaning function applied to
                  each text before tokenization.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # The tokenizer is loaded on every call
    tokenizer = AutoTokenizer.from_pretrained("Ammar-alhaj-ali/arabic-MARBERT-sentiment")

    # One encoding per sentence: plain Python lists (no `return_tensors`),
    # converted to tensors in one go below
    encodings = [
        tokenizer.encode_plus(
            text=text_preprocessing_fn(sentence),
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True,
            truncation = True
            )
        for sentence in data
    ]

    input_ids = torch.tensor([enc.get('input_ids') for enc in encodings])
    attention_masks = torch.tensor([enc.get('attention_mask') for enc in encodings])

    return input_ids, attention_masks

In [None]:
# `train_df` is an alias of `df` (no copy is made)
train_df = df


from sklearn.model_selection import train_test_split
# Feature texts and integer labels as NumPy arrays
X = train_df.text.values
y = train_df.label.values

# The train val split is used by the DL approach but not classical ML
# 80/20 split with a fixed seed. NOTE(review): unlike the earlier experiments,
# this split is not stratified by label — confirm that is intended.
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2, random_state=2020)

In [None]:
import re

In [None]:
# Specify `MAX_LEN`: the token length `preprocessing_for_bert` pads/truncates to
# (it reads this name as a global, so it must be set before the calls below).
# NOTE(review): 280 looks like the tweet character limit rather than a token count — confirm.
MAX_LEN =  280

# Print sentence 0 and its encoded token ids
token_ids = list(preprocessing_for_bert([X[0]])[0].squeeze().numpy())
print('Original: ', X[0])
print('Token IDs: ', token_ids)

# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Labels as tensors, aligned row-for-row with the tokenized inputs
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)

# Bundle ids, masks and labels per split
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order, validation batches sequentially
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

In [None]:
%%time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.

    Wraps the pretrained encoder and feeds the last hidden state of the
    `[CLS]` token to a small feed-forward head that outputs 2 logits.
    """
    def __init__(self, freeze_bert=False, version="mini"):
        """
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        @param    version (str): Accepted for compatibility; every value loads
                      the same pretrained checkpoint.
        """
        super().__init__()

        # Instantiate BERT model
        self.bert = AutoModel.from_pretrained("Ammar-alhaj-ali/arabic-MARBERT-sentiment")

        # Hidden size of BERT (read from the loaded config instead of being
        # hard-coded), hidden size of our classifier, and number of labels
        D_in = self.bert.config.hidden_size
        H, D_out = 50, 2

        # Instantiate a one-hidden-layer feed-forward classifier. It consumes the
        # 2-D [CLS] embedding (batch_size, D_in) directly; a 2-D pooling layer
        # cannot be applied to that tensor because it needs 3-D or 4-D input.
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask, )
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.optim import SparseAdam, Adam
def initialize_model(epochs=4, version="mini"):
    """Build the classifier together with its optimizer and LR schedule.

    Reads the globals `device` (where the model is placed) and
    `train_dataloader` (its length determines the number of scheduler steps).
    @param    epochs (int): Number of training epochs the schedule should span.
    @param    version (str): Forwarded to `BertClassifier`.
    @return   (bert_classifier, optimizer, scheduler)
    """
    # Classifier with a trainable encoder, moved to the target device
    classifier = BertClassifier(freeze_bert=False, version=version)
    classifier.to(device)

    # AdamW over every parameter of the classifier
    adamw = AdamW(
        params=list(classifier.parameters()),
        lr=5e-5,    # Default learning rate
        eps=1e-8,   # Default epsilon value
    )

    # Linear decay to zero over all optimizer steps, without warm-up
    n_training_steps = len(train_dataloader) * epochs
    lr_schedule = get_linear_schedule_with_warmup(
        adamw,
        num_warmup_steps=0, # Default value
        num_training_steps=n_training_steps,
    )
    return classifier, adamw, lr_schedule

In [None]:
import random
import time
import torch
import torch.nn as nn
# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs
    with the same value for reproducibility.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.

    Uses the globals `optimizer`, `scheduler`, `loss_fn` and `device`. Prints a
    progress row every 20 batches and at the last batch of each epoch; when
    `evaluation` is True, also prints validation loss/accuracy per epoch.
    """
    table_header = f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
    rule = "-"*70

    print("Start training...\n")
    for epoch_i in range(epochs):
        # ---------------- Training ----------------
        print(table_header)
        print(rule)

        # Timers for the whole epoch and for the current reporting window
        t0_epoch, t0_batch = time.time(), time.time()

        # Loss over the epoch, loss and step count over the reporting window
        epoch_loss = 0
        window_loss = 0
        window_steps = 0

        model.train()

        last_step = len(train_dataloader) - 1
        for step, batch in enumerate(train_dataloader):
            window_steps += 1

            # Move the batch to the target device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Clear gradients left over from the previous step
            model.zero_grad()

            # Forward pass and loss
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)
            loss_value = loss.item()
            window_loss += loss_value
            epoch_loss += loss_value

            # Backward pass
            loss.backward()

            # Clip the gradient norm to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Parameter and learning-rate update
            optimizer.step()
            scheduler.step()

            # Report every 20 batches (never at step 0) and at the final batch
            is_report_step = (step % 20 == 0 and step != 0) or (step == last_step)
            if is_report_step:
                time_elapsed = time.time() - t0_batch
                window_avg = window_loss / window_steps

                print(f"{epoch_i + 1:^7} | {step:^7} | {window_avg:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Start a new reporting window
                window_loss, window_steps = 0, 0
                t0_batch = time.time()

        # Mean loss per batch over the epoch
        avg_train_loss = epoch_loss / len(train_dataloader)

        print(rule)
        # ---------------- Evaluation ----------------
        if evaluation == True:
            # Measure performance on the validation set after each epoch
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print(rule)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.

    Uses the globals `device` and `loss_fn`. Loss and accuracy are averaged per
    sample (not per batch), so a smaller final batch is not over-weighted.
    @param    model: callable taking (input_ids, attention_mask) and returning logits.
    @param    val_dataloader: iterable of (input_ids, attention_mask, labels) batches.
    @return   (val_loss, val_accuracy): mean loss and accuracy in percent;
                  both are NaN when the dataloader yields no samples.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Running totals over all validation samples
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits and loss without tracking gradients
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)

        n_samples = b_labels.size(0)

        # `loss` is the mean over the batch; scale it back to a per-sample sum
        total_loss += loss.item() * n_samples

        # Get the predictions and count the correct ones
        preds = torch.argmax(logits, dim=1).flatten()
        total_correct += (preds == b_labels).sum().item()
        total_samples += n_samples

    # Nothing to evaluate: report NaN rather than dividing by zero
    if total_samples == 0:
        return float('nan'), float('nan')

    # Compute the average accuracy (in percent) and loss over the validation set.
    val_loss = total_loss / total_samples
    val_accuracy = 100.0 * total_correct / total_samples

    return val_loss, val_accuracy

In [None]:
torch.cuda.memory_summary(device=None, abbreviated=False)

In [None]:
# Seed all RNGs, build the classifier/optimizer/scheduler and train for 2 epochs
# with per-epoch validation. `train` reads `optimizer` and `scheduler` as
# globals, so they must be bound to these exact names here.
set_seed(42) 
bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
train(bert_classifier, train_dataloader, val_dataloader, epochs=2, evaluation=True)

In [None]:
def extract_hidden_states(batch):
    """Run the global ``model`` on a batch and return the position-0 hidden state.

    Only the entries of ``batch`` whose key is listed in
    ``tokenizer.model_input_names`` are moved to ``device`` and fed to the model.

    Returns:
        Dict with one key, ``"hidden_state"``: a NumPy array holding the last
        hidden state at sequence position 0 for every item in the batch.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Inference only: no gradients required.
    with torch.no_grad():
        outputs = model(**model_inputs)
        final_layer = outputs.last_hidden_state

    # Keep just the first token's vector and bring it back to host memory.
    first_token = final_layer[:, 0]
    return {"hidden_state": first_token.cpu().numpy()}

In [None]:
def tokenize(batch):
    """Tokenize a batch of texts with the notebook's global ``tokenizer``.

    Args:
        batch: Either raw text (a string or a list of strings) or a mapping of
            columns holding the texts under ``"text"``. The mapping form is what
            ``Dataset.map(tokenize, batched=True)`` passes in.

    Returns:
        The tokenizer output, produced with ``padding=True`` and
        ``truncation=True``.

    Raises:
        KeyError: If ``batch`` is a mapping without a ``"text"`` entry.
    """
    from collections.abc import Mapping

    # A column mapping cannot be tokenized directly; pull the text column out first.
    texts = batch["text"] if isinstance(batch, Mapping) else batch
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
df['label'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training; the held-out 20% (df_r) is then halved into
# validation and test. The second split must draw from df_r: splitting df again
# would put training rows into the validation and test sets.
train_ds, df_r = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
val_ds, test_ds = train_test_split(df_r, test_size=0.5, random_state=42, shuffle=True)

In [None]:
emotion_dict = {'Negative':0, 'Positive':1}

In [None]:
# Reverse lookup: numeric label id -> label name.
p = {label_id: label_name for label_name, label_id in emotion_dict.items()}

In [None]:
p[0]

In [None]:
from datasets import load_dataset

In [None]:
# Load the file with the "text" builder and keep its 'train' split; later cells
# read each record's raw line from the 'text' field.
# NOTE(review): this rebinds `df`, so the name no longer refers to the object
# that was passed to train_test_split earlier in the notebook.
df = load_dataset("text", data_files="/content/df_model_input.txt")['train']

In [None]:
df[0]['text'].split(',')[0]

In [None]:
def split_data(data):
    """Split a raw ``<label>,<text>`` line into its text and numeric label.

    Args:
        data: Record whose ``'text'`` field holds ``<label>,<text>``.

    Returns:
        The same record, with ``'text'`` replaced by the text part and
        ``'label'`` set to ``emotion_dict[<label>]``.

    Raises:
        IndexError: If the line contains no comma.
        KeyError: If the label is not a key of ``emotion_dict``.
    """
    # Split on the first comma only, so commas inside the text are preserved.
    after_process = data['text'].split(',', 1)
    data['text'] = after_process[1]
    data['label'] = emotion_dict[after_process[0]]
    return data

In [None]:
# Parse every raw line of each split into its text and numeric label.
train_ds = train_ds.map(split_data)
test_ds = test_ds.map(split_data)
val_ds = val_ds.map(split_data)

In [None]:
import pandas as pd
import torch
import random

# Ten dummy float targets.
targets_data = [random.random() for _ in range(10)]

# Wrap them in a single-column frame named 'targets'.
targets_df = pd.DataFrame({'targets': targets_data})

# Build a tensor from the column's underlying array and show it.
torch_tensor = torch.tensor(targets_df['targets'].to_numpy())
print(torch_tensor)

In [None]:
# Tokenize each split in a single batch (batch_size=None), in train/test/val order.
train_ds_encoded, test_ds_encoded, val_ds_encoded = (
    split.map(tokenize, batched=True, batch_size=None)
    for split in (train_ds, test_ds, val_ds)
)

In [None]:
# Have every encoded split return torch tensors for the model inputs and the label.
for encoded_split in (train_ds_encoded, test_ds_encoded, val_ds_encoded):
    encoded_split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
d=pd.read_csv('/content/train.txt')

In [None]:
d.head()

In [None]:
# Label name -> integer id for the six emotion classes.
# NOTE(review): this rebinds `emotion_dict`, replacing the Negative/Positive
# mapping defined earlier; any split_data call made after this cell uses these ids.
emotion_dict = {'surprise':0, 'love':1 , 'joy':2 , 'fear': 3, 'sadness': 4, 'anger':5}

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

In [None]:
# load_dataset expects a builder name or a path, not an in-memory object, so
# passing `df` fails. Read the file line by line with the "text" builder instead;
# each record then carries the raw line under 'text'.
# NOTE(review): path assumed to be the same /content/train.txt that this notebook
# reads with pandas -- confirm.
before_train_ds = load_dataset("text", data_files="/content/train.txt")['train']

In [None]:
before_train_ds

In [None]:
 emotion_dict[before_train_ds[100]['text'].split(';')[1]]

In [None]:
def split_data(data):
    """Split a raw ``<text>;<label>`` line into its text and numeric label.

    Args:
        data: Record whose ``'text'`` field holds ``<text>;<label>``.

    Returns:
        The same record, with ``'text'`` replaced by the text part and
        ``'label'`` set to ``emotion_dict[<label>]``.

    Raises:
        IndexError: If the line contains no ``;``.
        KeyError: If the label is not a key of ``emotion_dict``.
    """
    # The label is the last field: split from the right so that a ';' inside
    # the text does not get mistaken for the separator.
    after_process = data['text'].rsplit(';', 1)
    data['text'] = after_process[0]
    data['label'] = emotion_dict[after_process[1]]
    return data

train_ds = before_train_ds.map(split_data)

In [None]:
train_ds

In [None]:
# Take a pandas copy for inspection without switching train_ds itself to pandas
# output: set_format(type="pandas") changes the dataset in place, which would
# affect the later train_ds.map(tokenize, batched=True) call.
df = train_ds.to_pandas()
df

In [None]:
# Reverse lookup: numeric label id -> label name.
p = {label_id: label_name for label_name, label_id in emotion_dict.items()}

In [None]:
# Add a readable label-name column next to the numeric label.
df['label_text'] = [p[label_id] for label_id in df['label']]

In [None]:
df

In [None]:
# Run `tokenize` over the training set in batched mode; batch_size=None hands
# the whole dataset to it as a single batch.
train_ds_encoded =  train_ds.map(tokenize, batched=True, batch_size=None)

In [None]:
# The "text" builder has no `skiprow` option, so the file is loaded as-is and the
# first line (presumably the CSV header) is dropped afterwards.
raw_lines = load_dataset("text", data_files="/content/df_model_input.csv")['train']
df = raw_lines.select(range(1, len(raw_lines)))

In [None]:
df

In [None]:
df.iloc[1:, :]

In [None]:
df[1]

In [None]:
def split_data(data):
    """Split a raw ``<text>,<label>`` line into its text and label string.

    Args:
        data: Record whose ``'text'`` field holds ``<text>,<label>``.

    Returns:
        The same record, with ``'text'`` replaced by the text part and
        ``'label'`` set to the label string.

    Raises:
        IndexError: If the line contains no comma.
    """
    # The label is the last field: split from the right so that commas inside
    # the text stay in the text instead of leaking into the label.
    after_process = data['text'].rsplit(',', 1)
    data['text'] = after_process[0]
    data['label'] = after_process[1]
    return data

In [None]:
# Parse every raw line into separate 'text' and 'label' fields.
df = df.map(split_data)