In [1]:
import numpy as np
import pandas as pd
import re
import string                             

!pip install tashaphyne
import tashaphyne
from tashaphyne.stemming import ArabicLightStemmer

!pip install pyarabic
import pyarabic

!pip install emoji
import emoji

import nltk                             
from nltk.stem.isri import ISRIStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')                                
nltk.download('punkt')


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn import metrics
from sklearn import naive_bayes
from sklearn.metrics import f1_score

Collecting tashaphyne
  Downloading Tashaphyne-0.3.6-py3-none-any.whl (251 kB)
Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
Installing collected packages: pyarabic, tashaphyne
Successfully installed pyarabic-0.6.15 tashaphyne-0.3.6


You should consider upgrading via the 'C:\Python39\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Python39\python.exe -m pip install --upgrade pip' command.


Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Using legacy 'setup.py install' for emoji, since package 'wheel' is not installed.
Installing collected packages: emoji
    Running setup.py install for emoji: started
    Running setup.py install for emoji: finished with status 'done'
Successfully installed emoji-2.2.0


You should consider upgrading via the 'C:\Python39\python.exe -m pip install --upgrade pip' command.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moham\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Moham\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


# Load training and development (dev) data

In [2]:
# read csv file
def load_data(path):
    """Read a UTF-8 CSV of tweets and return its columns as NumPy arrays.

    Parameters
    ----------
    path : str
        Location of a CSV file that has `text`, `category` and `stance` columns.

    Returns
    -------
    tuple
        Three arrays, in this order: tweet texts, categories, stances.
    """
    frame = pd.read_csv(path, encoding='UTF-8')
    wanted_columns = ('text', 'category', 'stance')
    return tuple(np.asarray(frame[column].values) for column in wanted_columns)

# Each call yields three parallel NumPy arrays (texts, categories, stances).
# The dev split is what the later cells evaluate on (their variables are named `test_*`).
train_tweets, train_categories, train_stances = load_data('Dataset/train.csv')
dev_tweets, dev_categories, dev_stances = load_data('Dataset/dev.csv')

# Preprocessing

In [3]:
def _arabic_stop_words():
    """Return NLTK's Arabic stop words as a frozenset, reading the corpus only once."""
    cached = getattr(_arabic_stop_words, '_cache', None)
    if cached is None:
        # A frozenset gives constant-time membership tests; the NLTK list would be
        # scanned linearly for every token of every tweet.
        cached = frozenset(stopwords.words('arabic'))
        _arabic_stop_words._cache = cached
    return cached


def preprocess_tweet(tweet, remove_emojis=True, normalize=False, stem=False):
    """Clean a single tweet and return it as one whitespace-separated string.

    Steps, in order: remove URLs, a leading ``RT`` tag, ``#`` signs, control
    markers (newline, tab, carriage return, literal ``<LF>``/``<lf>``) and
    ``@mentions``; handle emojis; split into tokens; optionally normalize the
    tokens; drop Arabic stop words and pure-punctuation tokens and blank out
    punctuation inside the remaining ones; optionally stem; finally replace any
    leftover non-word characters and digit runs with a space and collapse
    repeated whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text. A missing value (``None``/NaN) is treated as an empty
        tweet; any other non-string value is converted with ``str``.
    remove_emojis : bool, default True
        When True, every character recognised by ``emoji.is_emoji`` is deleted.
        When False, emojis are replaced by their English names via
        ``emoji.demojize``.
    normalize : bool, default False
        When True, strip tashkeel, tatweel and the last haraka from each token
        (``pyarabic.araby``) and shorten any character repeated four or more
        times to three repetitions.
    stem : bool, default False
        When True, replace each token with the root reported by tashaphyne's
        ``ArabicLightStemmer`` (``light_stem`` followed by ``get_root``).

    Returns
    -------
    str
        The cleaned tweet. It may be empty and may carry a single leading or
        trailing space.
    """
    # Missing values coming out of pandas would otherwise make re.sub raise TypeError.
    if not isinstance(tweet, str):
        tweet = '' if pd.isna(tweet) else str(tweet)

    # Character-level clean-up; every removed span becomes a space so that
    # neighbouring words are not glued together.
    tweet = re.sub(r"http\S+", " ", tweet)                # URLs
    tweet = re.sub(r'^RT[\s]+', ' ', tweet)               # leading retweet tag
    tweet = re.sub(r'#', ' ', tweet)                      # hashtag sign (the tag word is kept)
    tweet = re.sub(r'\n|\t|\r|<LF>|<lf>', ' ', tweet)     # control characters / <LF> markers
    tweet = re.sub(r'@\w+', ' ', tweet)                   # @mentions

    if remove_emojis:
        # Single pass that drops every emoji character (no replacement space).
        tweet = ''.join(char for char in tweet if not emoji.is_emoji(char))
    else:
        tweet = emoji.demojize(tweet, language='en')  # emoji names are inserted in English

    tokens = re.split(r',|،|_|-|!| ', tweet)

    if normalize:
        # `import pyarabic` at the top of the notebook does not necessarily load
        # the `araby` submodule, so import it explicitly before using it.
        import pyarabic.araby as araby
        tokens = [
            re.sub(r'(.)\1{3,}', r"\1\1\1",  # shorten elongated words
                   araby.strip_lastharaka(araby.strip_tatweel(araby.strip_tashkeel(token))))
            for token in tokens
        ]

    stop_words = _arabic_stop_words()
    # `token not in string.punctuation` is a substring test: it rejects single
    # punctuation marks and also the empty tokens produced by consecutive separators.
    tokens = [
        re.sub(r'[~`!@#$%^&*()-/_+={}[\]|/\:;"`<>,.?؟،]+', ' ', token)
        for token in tokens
        if token not in stop_words and token not in string.punctuation
    ]

    if stem:
        # The stemmer is only built when it is actually needed.
        stemmer = ArabicLightStemmer()
        roots = []
        for token in tokens:
            stemmer.light_stem(token)
            roots.append(stemmer.get_root())
        tokens = roots

    tweet = ' '.join(tokens)
    tweet = re.sub(r'[^\w\s]+|\d+', ' ', tweet)  # leftover non-word symbols and digit runs
    tweet = re.sub(r'\s+', ' ', tweet)           # collapse repeated whitespace
    return tweet

In [None]:
# Show only a few preprocessed tweets as a sanity check: printing the whole
# training set floods the notebook output and bloats the saved file.
PREVIEW_COUNT = 5
for tweet in train_tweets[:PREVIEW_COUNT]:
    print(preprocess_tweet(tweet))
    print('-----------------------------')

# Feature Extraction

# 1- Bag of Words

In [4]:
# Learn the bag-of-words vocabulary from the training tweets only; a token is
# any maximal run of non-whitespace characters.
# NOTE(review): the raw tweets are vectorized here — preprocess_tweet is not
# applied anywhere in the cells shown. Confirm that this is intended.
vectorizer = CountVectorizer(token_pattern=r'\S+')
vectorizer.fit(train_tweets)


def extract_bags_of_words(data):
    """Encode `data` with the fitted count vectorizer and return a dense array."""
    return vectorizer.transform(data).toarray()

# 2- TF-IDF

In [5]:
# Learn the vocabulary and IDF weights from the training tweets only; a token
# is any maximal run of non-whitespace characters.
TF_IDF = TfidfVectorizer(token_pattern=r'\S+')
TF_IDF.fit(train_tweets)


def extract_tf_idf(data):
    """Encode `data` with the fitted TF-IDF vectorizer and return a dense array."""
    return TF_IDF.transform(data).toarray()

# Calculating the bag of words for the training set

In [6]:
# Dense term-count matrix of the (raw, un-preprocessed) training tweets.
train_bow = extract_bags_of_words(train_tweets)

# Calculating TF-IDF of words for the training set

In [7]:
# Dense TF-IDF matrix of the training tweets. In the cells shown it is only
# referenced by the commented-out feature concatenation.
train_tf_idf = extract_tf_idf(train_tweets)

# Calculating the bag of words for the test (dev) set

In [8]:
# Dev tweets encoded with the vocabulary that was learned from the training tweets.
test_bow = extract_bags_of_words(dev_tweets)

# Calculating the TF-IDF for the test (dev) set

In [9]:
# Dev tweets encoded with the TF-IDF vectorizer fitted on the training tweets.
# In the cells shown it is only referenced by the commented-out feature concatenation.
test_tf_idf = extract_tf_idf(dev_tweets)

# Combine the features

In [None]:
# Feature selection: only the bag-of-words matrices are used for training and
# evaluation. The commented-out lines are the alternative that appends the
# TF-IDF columns to the bag-of-words columns.
# train_features = np.concatenate((train_bow, train_tf_idf), axis=1)
train_features = train_bow
# test_features = np.concatenate((test_bow, test_tf_idf), axis=1)
test_features = test_bow

# Training

# 1- SVM

In [None]:
# Support-vector classifier with scikit-learn's default settings.
svm_clf = svm.SVC()
# load_data returns plain NumPy arrays, which have no `.values` attribute (that
# accessor belongs to pandas objects), so the labels are passed directly.
svm_clf.fit(train_features, train_stances)

# 2- Naive Bayes

In [None]:
# Additive smoothing strength passed to MultinomialNB.
NB_ALPHA = 0.13
# Number of leading training rows used for the fit.
# NOTE(review): the SVM cell fits on every training row — confirm this cap is intentional.
NB_TRAIN_ROWS = 4000

nb_clf = naive_bayes.MultinomialNB(alpha=NB_ALPHA)
nb_clf.fit(train_features[:NB_TRAIN_ROWS], train_stances[:NB_TRAIN_ROWS])

# Classification

# 1- SVM

In [None]:
# Predicted stance label for every row of the dev-set feature matrix.
svm_predictions = svm_clf.predict(test_features)

# 2- Naive Bayes

In [None]:
# Predicted stance label for every row of the dev-set feature matrix.
nb_predictions = nb_clf.predict(test_features)

# Evaluation (classification report and macro F1)

# 1- SVM

In [None]:
# Text report comparing the SVM predictions with the true dev-set stances.
print(metrics.classification_report(dev_stances, svm_predictions))

In [None]:
# Macro-averaged F1: the per-class F1 scores are averaged with equal weight per class.
svm_predictions_f1 = f1_score(dev_stances, svm_predictions, average='macro')
print(svm_predictions_f1)

# 2- Naive Bayes

In [None]:
# Text report comparing the Naive Bayes predictions with the true dev-set stances.
print(metrics.classification_report(dev_stances, nb_predictions))

In [None]:
# Macro-averaged F1: the per-class F1 scores are averaged with equal weight per class.
nb_predictions_f1 = f1_score(dev_stances, nb_predictions, average='macro')
print(nb_predictions_f1)