In [2]:
import numpy as np
import pandas as pd
import re
import string                             

!pip install tashaphyne
import tashaphyne
from tashaphyne.stemming import ArabicLightStemmer

!pip install pyarabic
import pyarabic

!pip install emoji
import emoji

import nltk                             
from nltk.stem.isri import ISRIStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')                                
nltk.download('punkt')


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn import metrics
from sklearn import naive_bayes
from sklearn.metrics import f1_score

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package stopwords to /home/ahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ahmed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load training and test data

In [3]:
# read csv file
df = pd.read_csv('Dataset/train.csv', encoding='UTF-8')
train_tweets = df['text']
train_categories = df['category']
train_stances = df['stance']

df = pd.read_csv('Dataset/dev.csv', encoding='UTF-8')
dev_tweets = df['text']
dev_categories = df['category']
dev_stances = df['stance']

# Preprocessing

In [4]:
def preprocess_tweet(tweet):
    ArListem = ArabicLightStemmer()

    def stem_tweet(tweet_tokens):
        for i in range(len(tweet_tokens)):
            ArListem.light_stem(tweet_tokens[i])
            tweet_tokens[i] = ArListem.get_root()
        return tweet_tokens

    def remove_url(tweet):
        tweet = re.sub(r"http\S+", "", tweet)
        return tweet

    def remove_retweet_tag(tweet):
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        return tweet

    def remove_tweet_mentions(tweet):
        tweet = re.sub(r'@\w+', '', tweet)
        return tweet

    def remove_text_control_tags(tweet):
        tweet = re.sub(r'\n|\t|\r|<LF>|<lf>', '', tweet)
        return tweet

    def tokenize_tweet(tweet):
        tweet_tokens = re.split(', |_|-|!', tweet)
        return tweet_tokens

    def remove_stopwords_punctuation(tweet_tokens, stop_words = ()):
        tweet_reduced = []
        for word in tweet_tokens:
            if (word not in stop_words and word not in string.punctuation):
                tweet_reduced.append(word)
                tweet_reduced[-1] = re.sub(r'[~`!@#$%^&*()-_+={}[\]|/\:;"`<>,.?؟،]+', '', tweet_reduced[-1])
        return tweet_reduced

    def remove_specialcharacters(tweet):
        tweet = re.sub(r'#', '', tweet)
        return tweet

    def handle_emojis(tweet):
        tweet = emoji.demojize(tweet, language='en') # convert emojis to text ENGLISH!!
        return tweet

    def normalize_tweet(tweet_tokens):
        for i in range(len(tweet_tokens)):
            tweet_tokens[i] = pyarabic.araby.strip_tashkeel(tweet_tokens[i])
            tweet_tokens[i] = pyarabic.araby.strip_tatweel(tweet_tokens[i])
            tweet_tokens[i] = pyarabic.araby.strip_lastharaka(tweet_tokens[i])
            # tweet_tokens[i] = pyarabic.araby.normalize_alef(tweet_tokens[i])
            tweet_tokens[i] = re.sub(r'(.)\1{3,}', r"\1\1\1", tweet_tokens[i]) # Remove longation
            # tweet_tokens[i] = pyarabic.araby.normalize_hamza(tweet_tokens[i])
        return tweet_tokens

    tweet = remove_url(tweet)
    tweet = remove_retweet_tag(tweet)
    tweet = remove_tweet_mentions(tweet)
    tweet = remove_specialcharacters(tweet)
    tweet = remove_text_control_tags(tweet)
    tweet = handle_emojis(tweet)
    tweet_tokens = tokenize_tweet(tweet)
    tweet_tokens = normalize_tweet(tweet_tokens)
    tweet_tokens_reduced = remove_stopwords_punctuation(tweet_tokens, stopwords.words('arabic'))
    # tweet_tokens_stemmed = stem_tweet(tweet_tokens_reduced)
    tweet = ' '.join(tweet_tokens_reduced)
    return re.sub(r'[^\w\s]+','',tweet)  # to remove non-printable characters

In [5]:
print(preprocess_tweet(train_tweets[0]))

بيل غيتس يتلقى لقاح كوفيد من غير تصوير الابرة و لا السيرنجة و لا الدواء و لابس بولو صيفي في عز الشتاء و يقول ان إحدى مزايا عمر ال  عاما هي انه مؤهل للحصول على اللقاح  يعنى ما كان يحتاج اللقاح لو كان عمره اصغر من  thinking face 


# Feature Extraction

# 1- Bag of Words

In [6]:
vectorizer = CountVectorizer(token_pattern=r'\S+')
vectorizer.fit_transform(train_tweets)
def extract_bags_of_words(data): 
    bow = vectorizer.transform(data)
    return bow.toarray()

# 2- TF-IDF

In [7]:
TF_IDF = TfidfVectorizer(token_pattern=r'\S+')
TF_IDF.fit_transform(train_tweets)
def extract_tf_idf(data):
    tf_idf = TF_IDF.transform(data)
    return tf_idf.toarray()

# Calculating the bag of words for the training set

In [8]:
train_bow = extract_bags_of_words(train_tweets)

# Calculating the bag of words for the training set

In [9]:
train_tf_idf = extract_tf_idf(train_tweets)

# Calculating the bag of words for the test set

In [10]:
test_bow = extract_bags_of_words(dev_tweets)

# Calculating the TF-IDF for the test set

In [11]:
test_tf_idf = extract_tf_idf(dev_tweets)

# Combine the features

In [12]:
# train_features = np.concatenate((train_bow, train_tf_idf), axis=1)
train_features = train_bow
# test_features = np.concatenate((test_bow, test_tf_idf), axis=1)
test_features = test_bow

<class 'numpy.ndarray'>


# Training

# 1- SVM

In [12]:
svm_clf = svm.SVC()
svm_clf.fit(train_features, train_stances.values)

# 2- Naive Bayes

In [None]:
nb_clf = naive_bayes.MultinomialNB()
nb_clf.fit(train_features, train_stances.values)

# Classification

# 1- SVM

In [None]:
svm_predictions = svm_clf.predict(test_features)

# 2- Naive Bayes

In [None]:
nb_predictions = nb_clf.predict(test_features)

# Accuracies

# 1- SVM

In [None]:
print(metrics.classification_report(dev_stances, svm_predictions))

              precision    recall  f1-score   support

          -1       0.56      0.07      0.13        70
           0       0.48      0.08      0.14       126
           1       0.82      0.99      0.90       804

    accuracy                           0.81      1000
   macro avg       0.62      0.38      0.39      1000
weighted avg       0.76      0.81      0.75      1000



In [None]:
svm_predictions_f1 = f1_score(dev_stances, svm_predictions, average='macro')
print(svm_predictions_f1)

0.3866812300343862
