## Configurations

In [None]:
# install datasets
!pip install datasets
!pip install demoji
!pip install arabic-stopwords
!pip install transformers

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.2-py3-none-a

In [None]:
# Import the required libraries
from sklearn.model_selection import train_test_split
import numpy as np
import tqdm
import unicodedata
from bs4 import BeautifulSoup
import string
import nltk
import pandas as pd
import demoji
import re
from datasets import load_dataset
import arabicstopwords.arabicstopwords as ast
from nltk.stem import ISRIStemmer
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
import pickle

# Loading Data

In [None]:
# Load the "emotone_ar" dataset with `datasets.load_dataset`
# (the result is inspected in the next cell).
dataset = load_dataset("emotone_ar")

Downloading builder script:   0%|          | 0.00/2.90k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/535k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10065 [00:00<?, ? examples/s]

In [None]:
# Display the dataset object to inspect its splits, features and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 10065
    })
})

In [None]:
# Convert the 'train' split into a pandas DataFrame; every later step works on `Data`
Data= pd.DataFrame(dataset['train'])

# Observing the data and metadata

In [None]:
# Preview the first rows of the DataFrame
Data.head()

Unnamed: 0,tweet,label
0,الاوليمبياد الجايه هكون لسه ف الكليه ..,0
1,عجز الموازنه وصل ل93.7 % من الناتج المحلي يعني...,1
2,كتنا نيله ف حظنا الهباب xD,3
3,جميعنا نريد تحقيق اهدافنا لكن تونس تالقت في حر...,2
4,الاوليمبياد نظامها مختلف .. ومواعيد المونديال ...,0


In [None]:
# Summarise column dtypes and non-null counts
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10065 entries, 0 to 10064
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   10065 non-null  object
 1   label   10065 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 157.4+ KB


In [None]:
# Count how many tweets carry each integer label
Data['label'].value_counts()

0    1550
1    1444
2    1281
3    1256
4    1220
7    1207
5    1062
6    1045
Name: label, dtype: int64

In [None]:
# Emotion names in label-id order: position i holds the name of integer label i.
classes = ['none', 'anger', 'joy', 'sadness', 'love', 'sympathy', 'surprise', 'fear']

# Lookup table from integer label id to emotion name, derived from `classes`
# so the two structures cannot drift apart.
label_to_class = dict(enumerate(classes))

# Dealing with EMOJIS

*   list all emojis in the dataset.
*   replace each emoji by one word that describes the relevant emotion.



In [None]:
def extract_emojis(tweet):
  """
  Find the emojis that occur in a tweet.

  Thin wrapper: the tweet is handed to `demoji.findall` and that call's
  result is returned unchanged.

  Args:
    tweet: The tweet text to scan for emojis.

  Returns:
    The value produced by `demoji.findall(tweet)`.
    NOTE(review): the demoji docs describe this as a mapping
    {emoji: description} rather than a list; iterating it (as the
    collection cell does via `list.extend`) yields the emojis themselves.
  """
  found = demoji.findall(tweet)
  return found

def extract_emoticons(text):
  """
  Find ASCII emoticons in a text.

  An emoticon here is: eyes (":", ";" or "="), an optional "-" nose,
  then a mouth (")", "(", "D" or "P").

  Args:
    text: The text to scan.

  Returns:
    A list with every matched emoticon, in order of appearance
    (duplicates included).
  """
  matcher = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
  return matcher.findall(text)

In [None]:
# Gather every emoji / emoticon occurrence across all tweets (duplicates are kept)
all_emojis = [emoji for tweet in Data['tweet'] for emoji in extract_emojis(tweet)]
all_emoticons = [emoticon for tweet in Data['tweet'] for emoticon in extract_emoticons(tweet)]

# Show the distinct symbols that were found
print("All Emojis:", set(all_emojis))
print("All Emoticons:", set(all_emoticons))


All Emojis: {'😊', '🕋', '🎷', '💪🏻', '💌', '👃', '🙈', '🍁', '🙌', '©', '😧', '😑', '🇸🇦', '💗', '😡', '❣️', '🔪', '🙉', '👋', '😆', '📮', '✨', '🎶', '⁉️', '☁️', '✏️', '💞', '✍🏻', '😉', '2️⃣', '💫', '🙍🏻', '👎', '👏', '🏀', '😢', '🌸', '🙏🏻', '♥️', '😐', '😣', '🎥', '🙇', '👏🏻', '😛', '😭', '💓', '😌', '🏃', '🌷', '😍', '❤', '😮', '😋', '🐔', '👎🏻', '💦', '❄️', '😁', '📩', '🍔', '👵🏽', '☝', '™', '😥', '👇', '😶', '🙆🏻', '🚶🏻', '🔑', '👻', '🚬', '👌🏽', '1️⃣', '☔', '🤔', '💃', '🌟', '🇯🇴', '👆', '🚶🏾\u200d♀️', '🤕', '💋', '🇧🇷', '✋', '🌿', '💎', '☹️', '💵', '🤗', '😯', '✊', '☂', '📝', '🖋', '🔐', '😖', '👣', '🗣', '😕', '🙌🏼', '😿', '😃', '🌞', '🙄', '🚶', '✌️', '😲', '❤️', '😳', '🎀', '💘', '✒', '✈️', '💚', '😻', '😇', '💕', '👊🏿', '♥', '👏🏽', '⚪', '👽', '🏃🏼', '✋🏼', '🍷', '😹', '👊', '🔕', '🍂', '💩', '✌', '👌🏻', '👯', '🌚', '🍳', '🙏', '⚜', '✌🏼', '✋🏻', '😄', '😟', '🐣', '🌾', '🙁', '🌹', '💔', '☠️', '😔', '🎼', '👍🏼', '🕊', '😠', '🐓', '😎', '💂🏿', '😦', '💪🏽', '☺️', '😒', '😜', '🇩🇿', '😱', '🇧🇭', '🍃', '⚽', '😨', '⚠️', '💤', '😈', '🌼', '👍', '😪', '✔', '👀', '🎩', '👌🏼', '🗨', '☘', '💀', '☕', '♦️', '😂', '🇮🇶', '😅', '😓', '🙋

In [None]:
# Keep only the distinct emojis and emoticons found in the dataset.
# Note: both names are rebound here from lists (all occurrences) to sets (unique symbols).
all_emojis= set(all_emojis)
all_emoticons= set(all_emoticons)
# Report how many distinct symbols exist, followed by the symbols themselves
print("len Emojis:", len(all_emojis))
print("All Emojis:", all_emojis)
print("len Emoticons:", len(all_emoticons))
print("All Emoticons:", all_emoticons)


len Emojis: 276
All Emojis: {'😊', '🕋', '🎷', '💪🏻', '💌', '👃', '🙈', '🍁', '🙌', '©', '😧', '😑', '🇸🇦', '💗', '😡', '❣️', '🔪', '🙉', '👋', '😆', '📮', '✨', '🎶', '⁉️', '☁️', '✏️', '💞', '✍🏻', '😉', '2️⃣', '💫', '🙍🏻', '👎', '👏', '🏀', '😢', '🌸', '🙏🏻', '♥️', '😐', '😣', '🎥', '🙇', '👏🏻', '😛', '😭', '💓', '😌', '🏃', '🌷', '😍', '❤', '😮', '😋', '🐔', '👎🏻', '💦', '❄️', '😁', '📩', '🍔', '👵🏽', '☝', '™', '😥', '👇', '😶', '🙆🏻', '🚶🏻', '🔑', '👻', '🚬', '👌🏽', '1️⃣', '☔', '🤔', '💃', '🌟', '🇯🇴', '👆', '🚶🏾\u200d♀️', '🤕', '💋', '🇧🇷', '✋', '🌿', '💎', '☹️', '💵', '🤗', '😯', '✊', '☂', '📝', '🖋', '🔐', '😖', '👣', '🗣', '😕', '🙌🏼', '😿', '😃', '🌞', '🙄', '🚶', '✌️', '😲', '❤️', '😳', '🎀', '💘', '✒', '✈️', '💚', '😻', '😇', '💕', '👊🏿', '♥', '👏🏽', '⚪', '👽', '🏃🏼', '✋🏼', '🍷', '😹', '👊', '🔕', '🍂', '💩', '✌', '👌🏻', '👯', '🌚', '🍳', '🙏', '⚜', '✌🏼', '✋🏻', '😄', '😟', '🐣', '🌾', '🙁', '🌹', '💔', '☠️', '😔', '🎼', '👍🏼', '🕊', '😠', '🐓', '😎', '💂🏿', '😦', '💪🏽', '☺️', '😒', '😜', '🇩🇿', '😱', '🇧🇭', '🍃', '⚽', '😨', '⚠️', '💤', '😈', '🌼', '👍', '😪', '✔', '👀', '🎩', '👌🏼', '🗨', '☘', '💀', '☕', '♦️', '😂', '🇮

## Build the needed dictionaries

In [None]:
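# Build the emoji dictionary: each emoji maps to an Arabic emotion word
# (e.g. 'حزن' sadness, 'حب' love, 'فرح' joy, 'غضب' anger); emojis mapped to '' carry no emotion word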
filtered_emojis = {
    '😩': 'حزن',
    '🚫': 'حزن',
    '♥': 'حب',
    '😔': 'حزن',
    '🇸🇦': 'تعاطف',
    '✌🏼': 'تعاطف',
    '🐸': 'سخريه',
    '😠': 'غضب',
    '😭': 'حزن',
    '🎧': '',
    '™': '',
    '☺️': 'فرح',
    '👋': '',
    '👌🏼': '',
    '💕': 'حب',
    '😓': 'حزن',
    '🙊': 'سخريه',
    '🌚': '',
    '👊🏽': 'غضب',
    '💛': 'حب',
    '💦': '',
    '😤': 'غضب',
    '🍃': '',
    '♨': '',
    '😛': 'فرح',
    '✏️': '',
    '⚪': '',
    '🙁': 'حزن',
    '💪🏼': 'تعاطف',
    '👸': '',
    '☠️': '',
    '🇧🇭': 'فرح',
    '😮': 'مفاجأة',
    '😫': 'حزن',
    '👻': 'مفاجأة',
    '💚': 'حب',
    '🌺': 'حب',
    '🔝': '',
    '✌🏻': 'تعاطف',
    '🙈': '',
    '💗': 'حب',
    '👃': '',
    '😌': '',
    '👐🏽': '',
    '💘': 'حب',
    '😖': 'حزن',
    '🎻': '',
    '©': '',
    '💜': 'حب',
    '👎🏻': '',
    '💓': 'حب',
    '🏊': '',
    '🚶🏽': '',
    '💟': 'حب',
    '👊🏿': 'غضب',
    '👉': '',
    '🌹': 'حب',
    '😁': 'فرح',
    '♦️': '',
    '🏴': '',
    '✔': '',
    '💪🏽': 'تعاطف',
    '👍🏻': '',
    '😱': 'خوف',
    '🌼': 'حب',
    '✖': '',
    '👀': '',
    '💝': 'حب',
    '🗣': '',
    '🕶': '',
    '😢': 'حزن',
    '❤️': 'حب',
    '🍁': 'حب',
    '💐': 'هدية',
    '👌🏽': '',
    '⚽': '',
    '💡': '',
    '🔕': '',
    '☝': '',
    '🙋': 'غضب',
    '😨': 'خوف',
    '💪': 'تعاطف',
    '✌️': 'تعاطف',
    '👽': 'خوف',
    '🔑': '',
    '✈️': '',
    '😻': '',
    '🎩': '',
    '😷': '',
    '🙏': 'تعاطف',
    'ℹ': '',
    '👎🏼': 'غضب',
    '◾': '',
    '😋': 'فرح',
    '😯': 'مفاجأة',
    '🙂': 'خوف',
    '✍🏻': '',
    '😣': 'حزن',
    '🇯🇴': 'تعاطف',
    '🐣': '',
    '👏🏽': 'فرح',
    '💫': '',
    '🌸': '',
    '😂': 'سخريه',
    '💭': 'حب',
    '😲': 'مفاجأة',
    '🙄': '',
    '💎': '',
    '💙': 'حب',
    '✋🏼': '',
    '🍔': '',
    '🏀': '',
    '🎬': '',
    '😈': '',
    '👊🏼': 'غضب',
    '👣': '',
    '✊': 'غضب',
    '😥': 'حزن',
    '⚠️': '',
    '✋🏻': '',
    '😆': 'فرح',
    '😏': 'سخريه',
    '💤': '',
    '📍': '',
    '🙏🏼': '',
    '🤕': 'جرح',
    '🎵': '',
    '💌': 'حب',
    '👐': '',
    '🎉': 'فرح',
    '👏🏻': 'فرح',
    '🇧🇷': 'فرح',
    '😉': 'فرح',
    '🌾': '',
    '😅': 'فرح',
    '😰': 'خوف',
    '😐': 'حزن',
    '🔐': '',
    '☂': '',
    '🚬': '',
    '🙌🏼': '',
    '🎼': '',
    '☄': '',
    '💖': 'حب',
    '🐔': '',
    '💔': 'حزن',
    '⚜': '',
    '1️⃣': '',
    '😑': '',
    '✋': 'رأي',
    '💉': '',
    '😳': 'مفاجأة',
    '✌': 'تعاطف',
    '👍🏼': '',
    '2️⃣': '',
    '👌🏻': '',
    '👏🏼': 'فرح',
    '💀': '',
    '😬': '',
    '😹': '',
    '🕋': 'حب',
    '👑': '',
    '📮': '',
    '🚶': '',
    '🇮🇶': 'تعاطف',
    '😒': '',
    '👎': 'حزن',
    '🎈': '',
    '🍷': '',
    '🌷': 'حب',
    '🔪': '',
    '😟': 'حزن',
    '🎀': '',
    '🎾': '',
    '👯': '',
    '‼': '',
    '😴': 'حب',
    '🔴': '',
    '😪': 'حزن',
    '👬': '',
    '💵': '',
    '🖋': '',
    '📩': '',
    '☺': 'فرح',
    '❤': 'حب',
    '📝': '',
    '👇': '',
    '😧': 'مفاجأة',
    '‼️': '',
    '⭕': '',
    '😚': '',
    '😡': 'غضب',
    '🙍🏻': 'حزن',
    '👵🏽': '',
    '😎': '',
    '🏃': '',
    '❗': '',
    '🍂': '',
    '🤗': 'سخريه',
    '🎆': '',
    '😜': 'فرح',
    '⁉️': 'مفاجأة',
    '💃': '',
    '❣️': '',
    '☁️': '',
    '👈': '',
    '😞': 'حزن',
    '🌿': 'حب',
    '✒': '',
    '❣': '',
    '💞': 'حب',
    '🍟': '',
    '😘': 'حب',
    '😦': '',
    '🙆🏻': 'خوف',
    '🇩🇿': 'تعاطف',
    '👍': '',
    '💂🏿': '',
    '😄': 'فرح',
    '💪🏻': '',
    '💃🏼': '',
    '😃': 'فرح',
    '☹️': 'حزن',
    '🚶🏻': '',
    '🌟': 'فرح',
    '💋': 'حب',
    '❌': '',
    '🙌': '',
    '🕊': '',
    '🔸': '',
    '🇪🇬': '',
    '🙇': 'حزن',
    '❄️': '',
    '🐑': '',
    '👌': '',
    '😙': '',
    '☔': '',
    '🏃🏼': '',
    '🎷': '',
    '🚶🏾\u200d♀️': '',
    '✏': '',
    '😊': 'فرح',
    '🌝': '',
    '☕': '',
    '💩': '',
    '🎶': 'حب',
    '🐰': '',
    '🐓': '',
    '😿': 'خوف',
    '👆': '',
    '😍': 'حب',
    '🙉': 'سخريه',
    '😀': 'فرح',
    '👏': '',
    '🙏🏻': 'تعاطف',
    '🔫': '',
    '🙃': '',
    '👼': '',
    '🎤': '',
    '😇': '',
    '🍳': '',
    '🐤': '',
    '😶': 'خوف',
    '🌞': '',
    '👊': 'غضب',
    '🔥': '',
    '🤔': '',
    '👧': '',
    '➿': '',
    '🗨': '',
    '🎥': '',
    '☘': '',
    '♥️': 'حب',
    '✨': '',
    '😕': 'حزن',
    '🏃🏻\u200d♀️': ''
}

In [None]:
# Emoticon dictionary: each ASCII emoticon maps to the Arabic emotion word that stands for it
emoticons_to_labels = dict([
    (':)', 'فرح'),
    (':(', 'حزن'),
    ('=)', 'فرح'),
    (';)', 'مفاجأة'),
    (':D', 'فرح'),
    ('=D', 'فرح'),
    ('xD', 'فرح'),
])

In [None]:
def replace_emojis_with_emotions(tweet, emoji_to_emotion):
    """
    Replaces emojis with corresponding emotions in the tweet.

    Every occurrence of a key of `emoji_to_emotion` is replaced by its value
    surrounded by single spaces (so a key mapped to '' is replaced by
    whitespace only).

    Args:
        tweet (str): The tweet to process.
        emoji_to_emotion (dict): A dictionary mapping emojis to their corresponding emotions.

    Returns:
        str: The tweet with emojis replaced by their corresponding emotions.
    """
    # Longest keys first: many emojis are a base character followed by a
    # variation selector or skin-tone modifier. Replacing the bare base
    # character first would leave that trailing modifier behind in the text.
    # `sorted` is stable, so equally long keys keep the dictionary's order.
    for emoji in sorted(emoji_to_emotion, key=len, reverse=True):
        if not emoji:
            # str.replace('', x) would insert x between every character.
            continue
        tweet = tweet.replace(emoji, ' ' + emoji_to_emotion[emoji] + ' ')
    return tweet

In [None]:
# Merge both lookup tables (an emoticon entry wins over an emoji entry with the same key),
# then store a copy of every tweet in which emojis / emoticons are replaced by emotion words
emoji_to_label = dict(filtered_emojis)
emoji_to_label.update(emoticons_to_labels)

Data['tweet_with_replaced_emojis'] = [
    replace_emojis_with_emotions(tweet, emoji_to_label)
    for tweet in Data['tweet']
]


# Cleaning

*   remove tags, punctuations, and normalize letters
*   remove stopwords




In [None]:
# Get stopwords from arabic stopwords library (copied, so the library's own list is never mutated)
stopwords_list = list(ast.stopwords_list())

# Negation / prohibition words are kept in the text, so they must not be treated as stop words
negation_prohibition_words = ["ليس", "لا", "لم", "لن"]

# Extra stop words added on top of the library list
stopwords_list.extend(["دي","اﻻ", "ده", "انا","الى","انت","اللى", "ان","ما","يا","يلا","دا","مين","دول","دم","حد","من","عن","على","علشان","أني","وأنا","الل","كده","يعني","احنا","ايه"])

# Stop words are matched against text that clean_str has already normalised
# (أ/إ/آ -> ا, ة -> ه, ى -> ي). Without normalised variants, entries such as
# "على" or "أني" could never match the cleaned tokens ("علي", "اني").
_letter_normalizer = str.maketrans({"أ": "ا", "إ": "ا", "آ": "ا", "ة": "ه", "ى": "ي"})
stopwords_list.extend([word.translate(_letter_normalizer) for word in stopwords_list])

# Drop duplicates (keeping first-seen order) and remove the negation words,
# including any that only appeared after normalisation
stopwords_list = [word for word in dict.fromkeys(stopwords_list) if word not in negation_prohibition_words]


In [None]:
def strip_html_tags(text):
  '''
  Strip HTML markup from a text for data preprocessing.

  The text is parsed with BeautifulSoup's "html.parser"; <iframe> and
  <script> elements are removed together with their contents, and the
  remaining markup is reduced to its text via `get_text()`.

  Args:
      text (str): The input text to strip HTML tags from.

  Returns:
      str: The text content without tags, with every run of carriage
      returns / line feeds collapsed into a single newline.
  '''
  soup = BeautifulSoup(text, "html.parser")
  # Remove iframe & script elements including everything inside them
  for element in soup(['iframe', 'script']):
    element.extract()
  stripped_text = soup.get_text()
  # Collapse runs of CR / LF into one newline. The character class must not
  # contain "|": inside [...] it is a literal pipe, so the previous pattern
  # also turned every "|" in the text into a newline.
  stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
  return stripped_text



def remove_accented_chars(text):
  '''
  Reduce a text to its plain ASCII characters.

  The text is NFKD-normalised (accented letters are split into a base
  letter plus combining marks) and then encoded to ASCII while ignoring
  anything that cannot be encoded. Consequently every non-ASCII character,
  not only accents, is dropped.

  Args:
      text (str): The input text to remove accented characters from.

  Returns:
      str: The input text with accented characters removed.
  '''
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_only = decomposed.encode('ascii', 'ignore')
  return ascii_only.decode('utf-8', 'ignore')

def remove_punctuations(text):
    '''
    Blank out punctuation marks.

    Each punctuation character (a hand-picked set of Arabic and typographic
    marks plus `string.punctuation`) is replaced by a single space, so the
    length of the text does not change.

    Args:
        text (str): The input text to remove punctuations from.

    Returns:
        str: The input text with punctuations removed.
    '''
    marks = '''`÷×؛<>_():*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation
    # Translation table: code point of every mark -> a space
    blank_out = {ord(mark): ' ' for mark in marks}
    return text.translate(blank_out)

def clean_str(text):
  '''
  Normalise a text: strip diacritics, shorten elongated words and unify
  letter variants / separators.

  Steps, in order:
    1. remove tashkeel (characters in U+0617-U+061A and U+064B-U+0652);
    2. squeeze any run of a repeated character down to two characters;
    3. reduce the doubled letters 'وو', 'يي', 'اا' to a single letter;
    4. apply the ordered replacement table below;
    5. trim leading / trailing whitespace.

  Args:
      text (str): The input text to clean.
  Returns:
      str: The input text with special characters replaced.
  '''
  # (old, new) pairs, applied one after another in this order
  replacements = (
      ("أ", "ا"), ("إ", "ا"), ("آ", "ا"), ("ة", "ه"),
      ("_", " "), ("-", " "), ("/", " "), (".", " "), ("،", " "),
      (" و ", " و"), (" يا ", " يا"),
      ('"', " "), ("ـ", " "), ("'", " "),
      ("ى", "ي"),
      ("\n", " "), ("\t", " "),
      ("?", " ?"), ("؟", " ؟"), ("!", " !"),
      ("“", " "),
  )

  # remove tashkeel
  text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', "", text)

  # remove longation: any run of the same character is cut down to two
  text = re.sub(r'(.)\1+', r"\1\1", text)

  for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
      text = text.replace(doubled, single)

  for old, new in replacements:
      text = text.replace(old, new)

  # trim
  return text.strip()


def remove_english_letters(text):
    '''
    Delete every ASCII letter (a-z, A-Z) from the text.

    All other characters, including digits, spaces and Arabic letters,
    are kept untouched.

    Args:
        text (str): The input text to remove English letters from.
    Returns:
        str: The input text with English letters removed.
    '''
    return ''.join(char for char in text if char not in string.ascii_letters)

def remove_stop_words(text, stop_words=None):
    '''
    Remove stop words from the text.

    The text is converted with `str()`, split on whitespace, and every token
    found among the stop words is dropped; the remaining tokens are joined
    with single spaces.

    Args:
        text (str): The input text to remove stop words from.
        stop_words (iterable of str, optional): The words to drop. Defaults
            to the notebook-level `stopwords_list`.
    Returns:
        str: The input text with stop words removed.
    '''
    if stop_words is None:
        stop_words = stopwords_list
    # Set membership is O(1) per token; scanning a list of stop words for
    # every token is what made this step slow.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    return ' '.join(word for word in str(text).split() if word not in stop_words)


def pre_process_corpus(docs):
    '''
    Preprocess the corpus by applying various text cleaning techniques.

    Each document goes through, in order: HTML stripping, collapsing of
    repeated spaces, trimming, digit removal, `clean_str` normalisation,
    removal of English letters, punctuation removal and stop-word removal.

    Args:
        docs (list): The input corpus to preprocess.
    Returns:
        list: The preprocessed corpus.
    '''
    norm_docs = []

    # Iterate over the documents directly (the index was never used); this also
    # lets tqdm read the length of `docs`, when it has one, to show a total.
    for doc in tqdm.tqdm(docs):
        doc = strip_html_tags(doc)

        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()
        doc = re.sub(r'\d+', '', doc)
        doc = clean_str(doc)
        doc = remove_english_letters(doc)
        doc = remove_punctuations(doc)
        doc = remove_stop_words(doc)

        norm_docs.append(doc)

    return norm_docs


In [None]:
# Apply pre-processing to the emoji-replaced tweets and store the result in a new column
Data['cleaned_data_with_replaced_emojis']= pre_process_corpus(Data['tweet_with_replaced_emojis'])


  soup = BeautifulSoup(text, "html.parser")
10065it [00:48, 208.84it/s]


# Stemming

In [None]:
# Shared helpers for stemming Arabic text: the ISRI Arabic stemmer and a whitespace tokenizer
isri_stemmer = ISRIStemmer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

def light_stemming(text):
    '''
    Light stemming of a whitespace-separated text.

    The text is tokenized on whitespace and every token is passed through
    three ISRIStemmer steps in turn: `waw`, `norm(..., num=3)` and `suf32`.
    NOTE(review): per the NLTK docs these handle a leading connective waw,
    diacritic/hamza normalisation and removal of short suffixes; confirm.

    args:
        text: input text

    returns:
        stemmed text (tokens joined by single spaces)
    '''
    stemmed_words = []
    for word in w_tokenizer.tokenize(text):
        word = isri_stemmer.waw(word)
        word = isri_stemmer.norm(word, num=3)
        stemmed_words.append(isri_stemmer.suf32(word))
    return " ".join(stemmed_words)


In [None]:
# Apply light stemming to the cleaned tweets and keep the result in its own column
Data["Light Stemming"] = Data['cleaned_data_with_replaced_emojis'].apply(light_stemming)

In [None]:
# Display the DataFrame to compare the raw tweets with each preprocessing stage
Data

Unnamed: 0,tweet,label,tweet_with_replaced_emojis,cleaned_data_with_replaced_emojis,Light Stemming
0,الاوليمبياد الجايه هكون لسه ف الكليه ..,0,الاوليمبياد الجايه هكون لسه ف الكليه ..,الاوليمبياد الجايه هكون لسه الكليه,الاوليمبياد الجايه هكون لسه الكليه
1,عجز الموازنه وصل ل93.7 % من الناتج المحلي يعني...,1,عجز الموازنه وصل ل93.7 % من الناتج المحلي يعني...,عجز الموازنه وصل الناتج المحلي لسه اقل ونفلس و...,عجز الموازنه وصل الناتج المحلي لسه اقل ونفلس و...
2,كتنا نيله ف حظنا الهباب xD,3,كتنا نيله ف حظنا الهباب فرح,كتنا نيله حظنا الهباب فرح,كتنا نيله حظنا الهباب فرح
3,جميعنا نريد تحقيق اهدافنا لكن تونس تالقت في حر...,2,جميعنا نريد تحقيق اهدافنا لكن تونس تالقت في حر...,نريد تحقيق اهدافنا تونس تالقت حراسه المرمي,نريد تحقيق اهداف تونس تالقت حراسه المرمي
4,الاوليمبياد نظامها مختلف .. ومواعيد المونديال ...,0,الاوليمبياد نظامها مختلف .. ومواعيد المونديال ...,الاوليمبياد نظامها مختلف ومواعيد المونديال مكا...,الاوليمبياد نظام مختلف ومواعيد المونديال مكانت...
...,...,...,...,...,...
10060,2222: يلا يا جماعه حفله عمرو دياب خلصت نريح شو...,3,2222: يلا يا جماعه حفله عمرو دياب خلصت نريح شو...,ياجماعه حفله عمرو دياب خلصت نريح شويه ونبدا نك...,ياجماعه حفله عمرو دياب خلصت نريح شويه ونبدا نك...
10061,Mohamed5: اييييه دااا 😲😲 اوزيييل❤,6,Mohamed5: اييييه دااا مفاجأة مفاجأة اوزيييل...,مفاجاه مفاجاه اوزيل حب,مفاجاه مفاجاه اوزيل حب
10062,عملتلها ريتويت بمناسبه ساره بتاعه الاوليمبياد 😃,0,عملتلها ريتويت بمناسبه ساره بتاعه الاوليمبياد ...,عملتلها ريتويت بمناسبه ساره بتاعه الاوليمبياد فرح,عملتل ريتويت بمناسبه ساره بتاعه الاوليمبياد فرح
10063,وعليك قبلنا يانجم النجوم ياعندليب الحب والاحساس,2,وعليك قبلنا يانجم النجوم ياعندليب الحب والاحساس,قبلنا يانجم النجوم ياعندليب الحب والاحساس,قبل يانجم النجوم ياعندليب الحب والاحساس


# Split data
  70% for training\
  15% for development\
  15% for test




In [None]:
# 70% train; the remaining 30% is a hold-out that is halved into test and validation
df_train, df_holdout = train_test_split(Data, test_size=0.3, random_state=42)
df_test, df_val = train_test_split(df_holdout, test_size=0.5, random_state=42)

# Embedding the Data Using MARBERT


In [None]:
# Load the MARBERT model and tokenizer from the 'UBC-NLP/MARBERT' checkpoint
marbert_model_path = 'UBC-NLP/MARBERT'
tokenizer = AutoTokenizer.from_pretrained(marbert_model_path, from_tf=True)
marbert_model = TFAutoModel.from_pretrained(marbert_model_path, output_hidden_states=True)

remove_special_tokens=0  # 0 keeps the special tokens; set to 1 to have bert_tokenize zero out token ids 0-4 and mask them



def bert_tokenize(text, max_length: int = 500) -> dict:
    '''
    Tokenize the input text using BERT tokenizer.

    Every sequence is truncated / padded to exactly `max_length` tokens.
    When the notebook-level flag `remove_special_tokens` equals 1, token ids
    0-4 are overwritten with 0 and their attention-mask entries set to 0,
    while all other positions get mask 1.

    Args:
        text (str or list of str): The input text(s) to tokenize.
        max_length (int): Length every sequence is padded / truncated to.
    Returns:
        dict: A dictionary containing the tokenized input.
    '''
    tokens = tokenizer(text, padding='max_length', truncation=True, max_length=max_length)
    if remove_special_tokens == 1:
        # presumably ids 0-4 are the tokenizer's special tokens; verify against its vocabulary
        special_ids = [0, 1, 2, 3, 4]
        # Vectorised, so it works for a single string (1-D ids) as well as a
        # batch (2-D ids); the previous row-wise loop raised TypeError on 1-D input.
        input_ids = np.asarray(tokens['input_ids'], dtype=np.int32)
        is_special = np.isin(input_ids, special_ids)
        # Update the input IDs and attention mask in the tokens dictionary
        tokens['input_ids'] = np.where(is_special, 0, input_ids).astype(np.int32)
        tokens['attention_mask'] = (~is_special).astype(np.int32)
    return tokens


def get_embeddings(ids, mask, type_ids):
    '''
    Get one averaged embedding per input sequence.

    The inputs are converted to tensors and fed to `marbert_model`; the first
    element of the model output is averaged over axis 1, weighted by the
    attention mask so that masked-out positions (e.g. padding) do not
    contribute. A plain mean over that axis would be dominated by padding,
    because sequences are padded to a fixed length before embedding.

    Args:
        ids: The input IDs (array-like or tf.Tensor).
        mask: The attention mask, 1 for positions to keep and 0 otherwise.
        type_ids: The token type IDs.
    Returns:
        np.ndarray: The averaged embeddings.
        # assumes the model output is (batch, seq_len, hidden), giving (batch, hidden) — TODO confirm
    '''
    ids = tf.convert_to_tensor(ids)
    mask = tf.convert_to_tensor(mask)
    type_ids = tf.convert_to_tensor(type_ids)
    hidden_states = marbert_model(input_ids=ids, attention_mask=mask, token_type_ids=type_ids)[0]
    # Masked mean: zero out masked positions, then divide by the number of kept tokens
    weights = tf.cast(tf.expand_dims(mask, axis=-1), hidden_states.dtype)
    summed = tf.reduce_sum(hidden_states * weights, axis=1)
    # Guard against division by zero for a fully masked sequence
    token_counts = tf.maximum(tf.reduce_sum(weights, axis=1), 1.0)
    averaged_embedding = summed / token_counts
    return averaged_embedding.numpy()

def embedd(text, batch_size=100):
    '''
    Embed the input text using BERT.

    The texts are tokenized once with `bert_tokenize`, then pushed through
    `get_embeddings` in batches of `batch_size` to bound memory use.

    Args:
        text (str, list of str or pd.Series): The input text(s) to embed.
        batch_size (int): Number of texts embedded per model call.
    Returns:
        np.ndarray: The embeddings, shaped (number of texts, 1, hidden size).
    '''
    # isinstance avoids constructing an empty Series on every call, which is
    # what triggered the pandas warning shown in the cell output.
    if isinstance(text, pd.Series):
        text = text.values.astype(str).tolist()
    elif isinstance(text, str):
        # A bare string would otherwise be tokenized into 1-D ids and each
        # token position mistaken for a separate text.
        text = [text]
    tokens = bert_tokenize(text)
    n_texts = np.array(tokens['input_ids']).shape[0]
    # Width of one embedding, read from the model configuration
    x_emb = np.zeros((n_texts, marbert_model.config.hidden_size))
    for start in range(0, n_texts, batch_size):
        # Slices past the end are clamped, so the last partial batch needs no special case
        stop = start + batch_size
        x_emb[start:stop] = get_embeddings(
            tokens['input_ids'][start:stop],
            tokens['attention_mask'][start:stop],
            tokens['token_type_ids'][start:stop],
        )
    return x_emb.reshape(x_emb.shape[0], 1, x_emb.shape[1])

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at UBC-NLP/MARBERT.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Embed the light-stemmed text of each split (train / validation / test) separately
ls_train_embeddings=embedd(df_train["Light Stemming"])
ls_val_embeddings=embedd(df_val["Light Stemming"])
ls_test_embeddings =embedd(df_test["Light Stemming"])

  if type(text)==type(pd.Series()):
  if type(text)==type(pd.Series()):
  if type(text)==type(pd.Series()):


# Saving data



In [None]:

# Persist the three DataFrame splits as pickle files in the working directory
for pickle_path, frame in (('train.pkl', df_train), ('test.pkl', df_test), ('val.pkl', df_val)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(frame, f)


In [None]:
# Persist the embedding arrays of each split as pickle files in the working directory
for pickle_path, embeddings in (
    ('ls_train_embeddings.pkl', ls_train_embeddings),
    ('ls_test_embeddings.pkl', ls_test_embeddings),
    ('ls_val_embeddings.pkl', ls_val_embeddings),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(embeddings, f)
