In [None]:
import joblib
from google.colab import drive

# Mount Google Drive at /content/drive; later cells load the saved
# vectorizer and model files from /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


# ***Data Preprocessing***

In [None]:
pip install pyarabic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarabic
Successfully installed pyarabic-0.6.15


In [None]:
import re


from pyarabic.araby import strip_tashkeel, strip_tatweel

In [None]:

import re

# Compiled once when the cell runs. Only explicit symbol blocks are listed:
# the previous catch-all range U+24C2..U+1F251 also covered CJK, Hangul and the
# Arabic Presentation Forms, so ordinary text was deleted along with emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Args:
        text: Input string.

    Returns:
        A copy of ``text`` in which every run of characters matched by
        ``_EMOJI_PATTERN`` has been deleted. Letters of any script
        (including CJK and Arabic presentation forms) are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
import nltk
# Download the 'punkt' tokenizer data (presumably what nltk.word_tokenize,
# used in tokenize_tweet below, relies on).
nltk.download('punkt')

def tokenize_tweet(tweet):
    """Split ``tweet`` into tokens with ``nltk.word_tokenize`` and return them."""
    return nltk.word_tokenize(tweet)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:

def remove_punctuation(tokens):
    """Strip disallowed characters from each token and drop emptied tokens.

    Every character that is neither an ASCII letter nor inside the Arabic
    range written in the pattern is deleted from each token, so ASCII digits
    and symbols are removed as well as punctuation. Tokens that end up empty
    are discarded.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the cleaned, non-empty tokens in their original order.
    """
    stripped = (re.sub(r'[^a-zA-Z؀-ۿ]', '', tok) for tok in tokens)
    return [tok for tok in stripped if tok]

In [None]:
def remove_stopwords(tweet, stopword_file):
    """Remove stop words from a whitespace-tokenized tweet.

    Args:
        tweet: Tweet text; it is split on whitespace with ``str.split()``.
        stopword_file: Path to a UTF-8 text file containing one stop word
            per line.

    Returns:
        The tokens that are not listed in the stop-word file, joined back
        together with single spaces.

    Raises:
        OSError: If ``stopword_file`` cannot be opened.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but skips a leading byte-order
    # mark, which would otherwise be glued to the first stop word and keep it
    # from ever matching.
    with open(stopword_file, 'r', encoding='utf-8-sig') as f:
        # A set gives constant-time membership checks instead of scanning a
        # list once per token.
        stop_words = set(f.read().splitlines())

    return " ".join(token for token in tweet.split() if token not in stop_words)



In [None]:
from nltk.stem.isri import ISRIStemmer


def stem_tokens(tokens):
    """Stem each token with NLTK's ISRIStemmer, leaving protected words as-is.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list the same length as ``tokens``; tokens found in the protected
        list are kept verbatim, all others are replaced by their stem.
    """
    # Words that must never be stemmed.
    # NOTE(review): several entries contain أ / إ / ة, which clean_tweet in this
    # notebook rewrites to ا / ه; if tokens are normalized before stemming those
    # entries can no longer match — confirm the intended call order.
    protected = ['قلب', 'رحمة', 'أمل', 'حزن', 'سعادة', 'جمال', 'شجاعة', 'تفاؤل', 'يقين', 'تضامن',
                 'حرية', 'عدالة', 'إنسانية', 'عزيمة', 'وفاء', 'إخلاص', 'صدق', 'شفاء', 'دعاء', 'الله']
    isri = ISRIStemmer()

    result = []
    for word in tokens:
        if word in protected:
            result.append(word)
        else:
            result.append(isri.stem(word))
    return result

In [None]:
def clean_tweet2(tweet):
    """Strip Twitter artefacts and every non-Arabic character from a tweet.

    The substitutions run in a fixed order: Latin @mentions, "RT" markers,
    http/https URLs, then any remaining run of characters outside the Arabic
    range (replaced by one space). Finally the text is trimmed and whitespace
    runs are collapsed to single spaces.

    Args:
        tweet: Tweet text.

    Returns:
        The cleaned tweet string.
    """
    substitutions = (
        (r"@[A-Za-z0-9_]+", ""),            # mentions (@username)
        (r"RT\s+", ""),                     # retweet marker
        (r"https?://[A-Za-z0-9./]+", ""),   # URLs
        (r"[^؀-ۿ]+", " "),                  # anything left that is not Arabic
    )
    for pattern, replacement in substitutions:
        tweet = re.sub(pattern, replacement, tweet)

    # Trim the ends, then squeeze internal whitespace.
    return re.sub(r"\s+", " ", tweet.strip())

In [None]:
def clean_tweet(tweet):
    """Normalize Arabic orthography in a tweet.

    Steps, in order:
      1. ``normalize_hamza`` plus pyarabic's ``strip_tatweel`` and
         ``strip_tashkeel``.
      2. Remove the diacritics listed in the first pattern.
      3. Unify letter variants: Alef forms -> ا, ى -> ي, ؤ / ئ -> ء, ة -> ه.
      4. Remove any diacritic or tatweel matched by the ``noise`` pattern.
      5. Lower-case the text (only affects cased scripts such as Latin).

    Args:
        tweet: Tweet text.

    Returns:
        The normalized tweet string.
    """
    tweet = normalize_hamza(tweet)
    tweet = strip_tatweel(tweet)
    tweet = strip_tashkeel(tweet)

    tweet = re.sub("[ًٌٍَُِّْٰ]", "", tweet)
    tweet = re.sub("[إأٱآا]", "ا", tweet)
    tweet = re.sub("ى", "ي", tweet)
    tweet = re.sub("ؤ", "ء", tweet)
    tweet = re.sub("ئ", "ء", tweet)
    tweet = re.sub("ة", "ه", tweet)
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    # The result used to be stored in an unused variable and thrown away;
    # assign it back to `tweet` so this pass actually takes effect.
    tweet = noise.sub('', tweet)
    return tweet.lower()

In [None]:
def normalize_hamza(text):
    """Replace the Alef variants أ, إ and آ with a bare Alef (ا).

    Args:
        text: Input string.

    Returns:
        ``text`` with every character from the pattern's class rewritten to ا.
    """
    alef_variants = re.compile("[أإآا]")
    return alef_variants.sub("ا", text)

# **TF-IDF Feature Extraction**

In [None]:
import joblib

# Load the TfidfVectorizer object from the saved file
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source.
tfidf_vectorizer = joblib.load('/content/drive/MyDrive/tfidf_vectorizer.sav')


# **SVM Model**

In [None]:
# Load the saved SVM model from Google Drive.
# NOTE: joblib.load unpickles the file; only load trusted artifacts.
svm_model = joblib.load('/content/drive/MyDrive/svm_model.sav')

**Test SVM**

In [None]:
# Run one sample tweet through the preprocessing helpers defined above.
tweet = "السحب الليلة على الايفون .. رتويت للمرفقة وطبق الشروط 👇"
# NOTE: despite its name, `no_stopword_tokens` is a single space-joined string
# (the return value of remove_stopwords), not a list of tokens.
no_stopword_tokens = remove_stopwords(tweet, '/content/list.txt')
print(no_stopword_tokens)
# Normalize letter forms and diacritics, then keep only Arabic characters.
tweet = clean_tweet(no_stopword_tokens)
tweet = clean_tweet2(tweet)
# NOTE(review): clean_tweet2 has already replaced everything outside the Arabic
# range, so this call probably has nothing left to remove — confirm.
tweet = remove_emojis(tweet)
tokens = tokenize_tweet(tweet)
no_punct_tokens = remove_punctuation(tokens)
# Re-join the tokens into the single string that is vectorized in later cells.
cleaned_tweet = ' '.join(no_punct_tokens)
print(cleaned_tweet)

السحب الليلة الايفون .. رتويت للمرفقة وطبق الشروط 👇
السحب الليله الايفون رتويت للمرفقه وطبق الشروط


In [None]:
# Vectorize the cleaned tweet (wrapped in a one-element list) with the loaded
# TF-IDF vectorizer.
tweet_features = tfidf_vectorizer.transform([cleaned_tweet])

# Make prediction using the loaded SVM model
prediction = svm_model.predict(tweet_features)
print(prediction)



['positive']


# **Decision Trees**

In [None]:
# Load the saved Decision Tree model from Google Drive.
# NOTE: joblib.load unpickles the file; only load trusted artifacts.
dtc_model = joblib.load('/content/drive/MyDrive/dtc_model.sav')

In [None]:
# Vectorize the cleaned tweet (wrapped in a one-element list) with the loaded
# TF-IDF vectorizer.
tweet_features = tfidf_vectorizer.transform([cleaned_tweet])

# Make prediction using the loaded Decision Tree model
prediction = dtc_model.predict(tweet_features)
print(prediction)

['positive']


# **Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Load the saved Naive Bayes model from Google Drive.
# NOTE: joblib.load unpickles the file; only load trusted artifacts.
nb_model = joblib.load('/content/drive/MyDrive/nb_model.sav')

In [None]:
# Vectorize the cleaned tweet (wrapped in a one-element list) with the loaded
# TF-IDF vectorizer.
tweet_features = tfidf_vectorizer.transform([cleaned_tweet])

# Make prediction using the loaded Naive Bayes model
prediction = nb_model.predict(tweet_features)
print(prediction)

['positive']
