In [51]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re
import re
import tnkeeh
from pyarabic.araby import strip_tashkeel, normalize_alef, normalize_teh
from pyarabic.araby import tokenize
from nltk.stem.isri import ISRIStemmer # special stemmer for arabic
from nltk.corpus import stopwords


In [32]:
# Normalization: remove diacritics from the text & Punctuation mark removal & Numbers & Non-arabic & alef, teh 
def normalization(text):

    # Remove non-Arabic characters, punctuation, and numbers
    text = re.sub(r'[^\u0621-\u064A\s]', '', text)
    
    # Remove diacritics
    text = strip_tashkeel(text)
    
    # Normalize Alef and Teh characters
    text = normalize_alef(text)
    text = normalize_teh(text)
    
    # Define character mapper
    char_mapper = {
        'ک': 'ك',
        'ﺑ': 'ب',
        'ھ': 'ه',  # Mapping 'ھ' to 'ه'
        'ی': 'ى'
    }

    # Map characters using char_mapper
    for src_char, target_char in char_mapper.items():
        text = text.replace(src_char, target_char)
    
    # Further cleaning using tnkeeh
    tnkeeh_obj = tnkeeh.Tnkeeh(
        remove_special_chars=True,
        remove_english=True,
        normalize=True,
        remove_diacritics=True,
        remove_tatweel=True,
        remove_html_elements=True,
        remove_links=True,
        remove_twitter_meta=True
    )
    text = tnkeeh_obj.clean_raw_text(text)[0]
    
    return text

# Test the normalization function
text = "وَإنّي لمُشْتاقٌه إلى ظِلّ صاحِبٍ،"
normalized_text = normalization(text)
print(normalized_text)


واني لمشتاقه الا ظل صاحب


In [43]:
# stemming
def stemming(word):
    
    # Create an instance of the ISRI Arabic Stemmer

    stemmer = ISRIStemmer()
    return stemmer.stem(word)

text = 'لمشتاق'
result = stemming(text)
print(result)

شتق


In [44]:
# tokenization
def tokenization(text):
    
    
    text = tokenize(text)
    return text

text = 'يروق ويصفو ان كدرت عليه '

result = tokenization(text)
print(result)

['يروق', 'ويصفو', 'ان', 'كدرت', 'عليه']


In [60]:

stop_words_file_name = 'stopWords.txt'
stop_words = []
with open(stop_words_file_name, encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        stop_words.append(line)

stop_words = list(set(stop_words))
stop_words += stopwords.words('arabic')

print(stop_words)

['', 'مكانكنّ', 'وجود', 'صبرا', 'قد', 'تارة', 'خمسين', 'الماضي', 'هيا', 'اليها', 'عام', 'سبعة', 'نهاية', 'كانون', 'هَاتِه', 'وَيْ', 'حاشا', 'كأنّ', 'وله', 'كلا', 'شين', 'بات', 'بئس', 'أخذ', 'هَذِه', 'وأبو', 'أنفسكم', 'عاما', 'االا', 'الى', 'ليس', 'وقال', 'آهِ', 'ياء', 'بان', 'نوفمبر', 'فهى', 'طفق', 'كى', 'يوم', 'مما', 'أين', 'تلك', 'قطّ', 'ع', 'تم', 'كرب', 'هَيْهات', 'حمو', 'وإن', 'هى', 'ولايزال', 'ثمانية', 'تشرين', 'الثانية', 'وقد', 'هذه', 'خلال', 'صراحة', 'شَتَّانَ', 'فرادى', 'فوق', 'تبدّل', 'درى', 'أصلا', 'ض', 'إياكم', 'لديه', 'بؤسا', 'اثني', 'غير', 'رجع', 'لدي', 'بين', 'اثنين', 'ذال', 'هل', 'ما انفك', 'ماذا', 'بَلْهَ', 'ما', 'رأى', 'هَذَيْنِ', 'إذن', 'درهم', 'أيّان', 'إياك', 'أبدا', 'وقف', 'حبذا', 'أربعمئة', 'اللتين', 'لا', 'أكتوبر', 'من', 'رويدك', 'عند', 'االتى', 'أنشأ', 'آهاً', 'عدم', 'أربع', 'وراءَك', 'صباح', 'بلى', 'فبراير', 'كسا', 'أو', 'جير', 'سبعمئة', 'ثلاثاء', 'دولار', 'ايضا', 'سنة', 'صهٍ', 'خامس', 'ايار', 'م', 'واو', 'نفس', 'والذي', 'سبحان', 'فضلا', 'ديسمبر', 'مايزال', 'تف

In [61]:
def remove_stop_words(words, stop_words=stop_words):
    
    result = [] 
    removed_indexes = []
    for i in range(len(words)):
        if words[i] in stop_words:
            removed_indexes.append(i)
        else:
            result.append(words[i])
        
    removed_indexes.sort(reverse=True) # for save remove of pos tagging 
    return result, removed_indexes

remove_stop_words(['يروق', 'ويصفو', 'ان', 'كدرت', 'عليه'])


(['يروق', 'ويصفو', 'كدرت'], [4, 2])

In [62]:
def preprocessing(text):
    normalization_text = normalization(text)
    segments = tokenization(normalization_text)

    # remove stop words from the the text after tokenization
    words, _ = remove_stop_words(segments)
    number_of_words = len(words)
    
    
    # stemming for the words in the original text after tokenization without stop words
    for i in range(number_of_words):
        words[i] = stemming(words[i])

 
    return words

text = "أينَ منْ كانَ قبلنَا أين أينَا"

result = preprocessing(text)
print(result)
for word in result:
    print(word)


['اين', 'قبل', 'اين', 'اين']
اين
قبل
اين
اين


exchange the english names with arabic ones

In [69]:
# Function to load data and normalize text
def load_data(dataset_path):
    data = []
    labels = []
    authors = []  # List to store author names
    poem_authors = []  # List to store the author of each poem
    
    for author in os.listdir(dataset_path):
        author_folder = os.path.join(dataset_path, author)
        if os.path.isdir(author_folder):
            authors.append(author)  # Add author to the list
            for poem in os.listdir(author_folder):
                poem_path = os.path.join(author_folder, poem)
                if poem_path.endswith('.txt'):
                    with open(poem_path, 'r', encoding='utf-8') as file:
                        text = file.read()
                        preprocessed_text = preprocessing(text)  # Preprocess the poem text
                        data.append((preprocessed_text, author))  # Store preprocessed text along with author
                        poem_authors.append(author)  # Store the author of the poem
                        labels.append(author)  # Store the author (for compatibility, you may remove this line if not needed)
    return data, labels, authors, poem_authors

dataset_path = 'dataset'
data, labels, authors, poem_authors = load_data(dataset_path)
# Print the names of the authors
print("Authors:", authors)

# Print the poem data and their corresponding authors
for poem, author in zip(data, poem_authors):
    print("Author:", author)
    print("Poem:", poem)
    print()

Authors: ['AbuAlAtahiya', 'AbuAlFadlAldarimi', 'AbuFirasAlHamdani', 'AbuNuwas', 'AliAlTuhami', 'AliIbnAlJahm', 'AlMutanabbi', 'IbnAlFarid', 'UlayyaBintAlMahdi']
Author: AbuAlAtahiya
Poem: (['وني', 'شتق', 'صحب', '\n', 'يرق', 'صفو', 'كدر', '\n', 'عذر', 'انس', 'جفت'], 'AbuAlAtahiya')

Author: AbuAlAtahiya
Poem: (['اين', 'قبل', 'اين', 'اين', '\n', 'انس', 'كان', 'جمل', 'ورز', '\n', 'دهر', 'اتا', 'علي', 'فاف'], 'AbuAlAtahiya')

Author: AbuAlAtahiya
Poem: (['ارا', 'شيء', 'احا', 'قلب', 'علق', '\n', 'يبل', 'مزق', '\n', 'صرف', 'طور', 'ارا', 'عبر'], 'AbuAlAtahiya')

Author: AbuAlAtahiya
Poem: (['داو', 'رفق', 'جرح', 'خرق', '\n', 'وبل', 'لذم', 'لذم', 'حمد', 'وذق', '\n', 'وسع', 'ناس', 'خلق', 'حسن'], 'AbuAlAtahiya')

Author: AbuAlAtahiya
Poem: (['قلب', 'زمن', 'سود', 'رسك', 'ابض', '\n', 'ونع', 'جسم', 'رقه', 'قبض', '\n', 'نل', 'شيء', 'شئت', 'نوع', 'منا'], 'AbuAlAtahiya')

Author: AbuAlAtahiya
Poem: (['عمر', 'دنا', 'بدر', 'بقء', '\n', 'كفك', 'بدر', 'موت', 'دار', 'فنء', '\n', 'عشق', 'دنا', 'اخي', 'فان'],

feature extraction

In [76]:
class TFIDFExtractor:
    def __init__(self):
        from sklearn.feature_extraction.text import TfidfVectorizer
        self.vectorizer = TfidfVectorizer()
        self.feature_names = None
        self.author_names = None

    def fit_transform(self, corpus):
        # Extract the sentences from the corpus
        sentences = [sample[0] for sample in corpus]

        # Extract the authors from the corpus
        authors = [sample[1] for sample in corpus]

        # Convert the sentences to strings
        sentences = [' '.join(sentence) for sentence in sentences]

        # Fit and transform the sentences
        tfidf_matrix = self.vectorizer.fit_transform(sentences)

        # Get the feature names (unique words) as columns
        self.feature_names = self.vectorizer.get_feature_names_out()

        # Get the author names as rows
        self.author_names = authors

        return tfidf_matrix.toarray()

    def transform(self, corpus):
        # Extract the sentences from the corpus
        sentences = [sample[0] for sample in corpus]

        # Convert the sentences to strings
        sentences = [' '.join(sentence) for sentence in sentences]

        # Transform the sentences
        tfidf_matrix = self.vectorizer.transform(sentences)

        return tfidf_matrix.toarray()

    def get_feature_names(self):
        return self.feature_names
    
    def get_author_names(self):
        return self.author_names


In [77]:
# TF-IDF
tf_idf = TFIDFExtractor()

In [73]:
class WordEmbeddingExtractor:
    def __init__(self, vector_size=100, window=10, min_count=1, workers=4, sg=0):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.sg = sg
        self.model = None
        self.feature_names = None
        self.author_names = None

    def fit_transform(self, corpus):
        from gensim.models import FastText
        import numpy as np
        # Extract the sentences from the corpus
        sentences = [sample[0] for sample in corpus]

        # Extract the authors from the corpus
        authors = [sample[1] for sample in corpus]

        # Train FastText model
        self.model = FastText(sentences, vector_size=self.vector_size, window=self.window,
                              min_count=self.min_count, workers=self.workers, sg=self.sg)

        # Get the feature names (labeled dimensions) for FastText
        self.feature_names = [f't{i}' for i in range(self.vector_size)]

        # Get the author names as rows
        self.author_names = authors

        # Create feature vectors for each sentence
        feature_vectors = []
        for sentence in sentences:
            vectors = [self.model.wv[word] for word in sentence if word in self.model.wv]
            if vectors:
                sentence_vector = np.mean(vectors, axis=0)
            else:
                sentence_vector = np.zeros(self.vector_size)
            feature_vectors.append(sentence_vector)

        # Convert feature vectors to NumPy array
        feature_vectors = np.array(feature_vectors)

        return feature_vectors

    def transform(self, corpus):
        import numpy as np
        # Extract the sentences from the corpus
        sentences = [sample[0] for sample in corpus]

        # Create feature vectors for each sentence
        feature_vectors = []
        for sentence in sentences:
            vectors = [self.model.wv[word] for word in sentence if word in self.model.wv]
            if vectors:
                sentence_vector = np.mean(vectors, axis=0)
            else:
                sentence_vector = np.zeros(self.vector_size)
            feature_vectors.append(sentence_vector)

        # Convert feature vectors to NumPy array
        feature_vectors = np.array(feature_vectors)

        return feature_vectors

    def get_feature_names(self):
        return self.feature_names
    
    def get_author_names(self):
        return self.author_names

- Double check 

In [81]:
fast_text = WordEmbeddingExtractor(vector_size=499, window=10, min_count=1, workers=4, sg=0)


we can add Poetic Features they find the rhyme emptional tone and stuff for features extractions and is better done on the original data ( not Preprocess but remove  حركات) - look it up 

combine features

- Double check

In [82]:
tf_idf_matrix = tf_idf.fit_transform(data)
tf_idf_feature_names, authors_list = tf_idf.get_feature_names(), tf_idf.get_author_names()
word_embedding_matrix = fast_text.fit_transform(data)
word_embedding_feature_names, authors_list = fast_text.get_feature_names(), fast_text.get_author_names()

print("Shape of TF-IDF matrix:", tf_idf_matrix.shape)
print("Shape of Word Embedding matrix:", word_embedding_matrix.shape)


# Combine the feature names
combined_feature_names = tf_idf_feature_names + word_embedding_feature_names

def combine_feature_vectors(x, y):
    import numpy as np
    # Combine the feature vectors
    return np.concatenate((x, y), axis=1)

# Combine the TF-IDF matrix and FastText feature vectors
combined_features_vectors = combine_feature_vectors(tf_idf_matrix, word_embedding_matrix)


# Create a list of tuples for feature vector-author mapping
features_vectors_matrix = list(zip(combined_features_vectors, authors_list))

print(len(features_vectors_matrix[0][0]))



Shape of TF-IDF matrix: (75, 499)
Shape of Word Embedding matrix: (75, 499)
998
