In [62]:
import os
import re

import conllu
import gensim
import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
#encode the corpus to numbers
import tensorflow as tf
from pyarabic.araby import (
    tokenize,
    is_arabicrange,
    strip_tashkeel,
    strip_tatweel,
    strip_shadda,
)
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics

from commons import word_features

In [None]:
# Load the raw CoNLL-U annotations into memory as one string.
with open("pos_tagging_dataset.conllu", mode="r", encoding="utf-8") as data:
    annotations = data.read()

# Sanity check: confirm the type of what was read, then preview the start of
# the file. The slice starts at 0 so the first character is not dropped.
print(type(annotations))
annotations[:1000]

In [None]:
# Parse the raw CoNLL-U text; the result is indexed per sentence in the cells below.
sentences = conllu.parse(annotations)

In [None]:
# Inspect the container type returned by the parser.
print(type(sentences))
# Show the metadata attached to the second sentence. Per the original author's
# notes, metadata is a plain dictionary with 5 keys:
# newdoc id, sent_id, text, original_text, text_en.
sentences[1].metadata

In [None]:
# Peek at the first token of the first sentence.
# Indexing scheme: sentences[sentence_indx] is one sentence (a row of tokens);
# sentences[sentence_indx][word_indx]['upos'] is the POS tag of a single word.
sentences[0][0]

In [None]:

def remove_tashkeel(data):
    """Return `data` with Arabic diacritics, shadda and tatweel stripped.

    The text is passed through pyarabic's ``strip_tashkeel``, ``strip_shadda``
    and ``strip_tatweel``, in that order. Nothing is written to disk: the
    cleaned string is simply returned to the caller.
    """
    return strip_tatweel(strip_shadda(strip_tashkeel(data)))

In [None]:
# Build the cleaned corpus from the parsed sentences.
#   corpus         : one dict per kept sentence, mapping token index ->
#                    [word, upos, sentence text, len(sentence text)]
#   pos            : one list of UPOS tags per kept sentence
#   list_words     : every kept word, flattened across sentences
#   list_pos       : every kept UPOS tag, flattened (aligned with list_words)
#   list_sentences : raw text of every kept sentence
corpus = []
list_pos = []
list_words = []
pos = []
list_sentences = []

# Compiled once, outside the loop: any run of ASCII letters in the sentence
# text causes the whole sentence to be skipped.
re_pattern = re.compile(r'[a-zA-Z]+')

for sentence in sentences:
    sentence_txt = sentence.metadata['text']
    if re_pattern.search(sentence_txt) is not None:
        continue

    list_sentences.append(sentence_txt)
    word_dict = {}
    pos_bfr = []
    for w_indx, token in enumerate(sentence):
        # Words are stored without diacritics; tags are kept as-is.
        word_str = remove_tashkeel(token['form'])
        word_pos = token['upos']
        pos_bfr.append(word_pos)
        list_pos.append(word_pos)
        list_words.append(word_str)
        word_dict[w_indx] = [word_str, word_pos, sentence_txt, len(sentence_txt)]
    pos.append(pos_bfr)
    corpus.append(word_dict)

In [None]:
# Summarise the cleaned corpus: how many sentences it contains.
print(f"The corpus has {len(corpus)} sentences")

In [None]:
# Persist the flattened word list and tag list as two single-column CSV files.
# to_csv is called with its defaults apart from the encoding, so pandas also
# writes the row index alongside the data column.
df_w = pd.DataFrame(data=list_words, columns=['w_name'])
df_w.to_csv("word_list_cleaned.csv", encoding="utf-8")

df = pd.DataFrame(data=list_pos, columns=['tag_name'])
df.to_csv("tag_list.csv", encoding="utf-8")

In [None]:
# Plot how often each POS tag occurs in the corpus, most frequent first.
# The counts are computed once and reused for both the bar order and the labels.
tag_counts = df['tag_name'].value_counts(ascending=False)

fig, ax = plt.subplots()
# the size of A4 paper (landscape), in inches
fig.set_size_inches(11.7, 8.27)

# Draw on the axes created above rather than on the implicit "current" axes.
sns.countplot(x=df['tag_name'], order=tag_counts.index, ax=ax)
ax.set(title="POS tag frequency in the corpus", xlabel="tag_name", ylabel="count")

# Annotate each bar with its absolute count; bars follow the order of tag_counts.
ax.bar_label(container=ax.containers[0], labels=tag_counts.values);

In [None]:
# Define the paths to the CSV files holding the words and their POS tags
text_file = "word_list_cleaned.csv"
pos_tags_file = "tag_list.csv"

# Load the data from CSV files
text_data = pd.read_csv(text_file)
pos_tags_data = pd.read_csv(pos_tags_file)

# Extract the text and POS tags from the DataFrames (row i of one file is
# paired with row i of the other)
texts = text_data["w_name"].tolist()
pos_tags = pos_tags_data["tag_name"].tolist()

# NOTE: this rebinds `corpus` to a list of (text, tag) pairs.
corpus = list(zip(texts, pos_tags))

# Extract features for each row of the corpus.
# NOTE(review): each CSV row appears to hold a single token, so every sequence
# built below is typically one token long -- TODO confirm whether
# sentence-level sequences were intended for the CRF.
X = []
y = []
for text, pos_tag in zip(texts, pos_tags):
    # str() guards against non-string cells (e.g. values pandas parsed as NaN),
    # which would otherwise fail on .split().
    words = str(text).split()
    tags = str(pos_tag).split()
    sentence = list(zip(words, tags))

    X.append([word_features(sentence, i) for i in range(len(sentence))])
    y.append([tag for _, tag in sentence])


# Split the data into training and testing sets: first 90% for training, the
# remaining 10% held out. The training set must exclude the test rows,
# otherwise the reported accuracy is inflated by leakage.
split = int(0.9 * len(X))
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]


In [None]:
# Fit a CRF tagger on the training sequences. Hyperparameters are collected in
# one mapping so they are easy to spot and tweak; values are unchanged.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.05,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = CRF(**crf_params)
crf.fit(X_train, y_train)

# Score the model: predict tags for the held-out sequences and report the
# token-level (flattened) accuracy.
y_pred = crf.predict(X_test)
accuracy = metrics.flat_accuracy_score(y_test, y_pred)
print(accuracy)


In [None]:
# Sample Arabic text to tag (one sentence per line).
text = """
كان الولدان يلعب في الحديقة.
البنت يقرأ كتابًا تحت الشجرة.
الطالب تدرسان معًا في المكتبة.
ذهبت الأسرة إلى سوق.
أشتريت ثلاث كتب من المكتبة.
الكتابة على الطاولة.
المعلمين يعلم الطلاب في الصف.
كان الرجال يجلسون على الكرسي.
ألبنات تلعب في الساحتان المتجاورتان الصغيرتان.
ألصديق يذهبان إلى المدرسة كل يوم.
نريد أن نذهب الى الحديقة.
نحب أن نقرأ الكتب.
يجب أن نحترم الآخرين.
عليكما أن تدرسان بجد.
يمكنكم أن تحققون أحلامكم.
لن نستسلم أبدا.
علينا أن نتعاون معا.
يجب أن نحافظ على البيئة.
نود أن نسافر حول العالم.
لا تنسون أن تتصلوا بي.
لم ينجو المصاب من الحادث.
"""

# Tokenize the text on whitespace, then strip diacritics from every token so
# the input is normalized the same way as the words the model was trained on.
# NOTE(review): punctuation stays attached to the preceding word here --
# confirm whether it should be split off to match the training tokens.
words = [remove_tashkeel(word) for word in text.split()]

# Create a list of word-tag pairs for the new text; tags are unknown, so each
# word is paired with an empty placeholder.
sentence = [(word, '') for word in words]

# Extract features for the words
X_new = [word_features(sentence, i) for i in range(len(sentence))]

# Use the trained CRF model to predict the tags (one sequence in, one out)
y_pred = crf.predict([X_new])[0]

# Print the predicted tags
for word, tag in zip(words, y_pred):
    print(f"{word} - {tag}")

In [None]:
# Save the trained model. The target directory is created first so that the
# dump does not fail when 'models/' is missing.
os.makedirs('models', exist_ok=True)
joblib.dump(crf, 'models/crf_model.joblib')

In [79]:
# Reload the persisted CRF from disk and tag a short example sentence.
crf = joblib.load('models/crf_model.joblib')
text = "كان المعلمان منذ قليل مندهشون"
words = text.split()

# Pair every word with an empty placeholder tag, then build per-token features.
sentence = [(word, '') for word in words]
X_new = [word_features(sentence, idx) for idx, _ in enumerate(sentence)]

# The model takes a batch of sequences; send one and unwrap its prediction.
y_pred = crf.predict([X_new])[0]
for pair in zip(words, y_pred):
    print(*pair)

كان AUX
المعلمان ADJ
منذ ADP
قليل ADJ
مندهشون NOUN
