<a href="https://colab.research.google.com/github/123gamal/Python_Practice/blob/main/NLP_5_Tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***Task one : Multilingual Text Preprocessing Pipeline***


In [32]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, ISRIStemmer

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text, lang='english'):
    text = re.sub(r'[^\w\s]', '', text.lower())
    if lang == 'english':
        tokens = nltk.word_tokenize(text, language='english')
        stop_words = set(stopwords.words('english'))
        stemmer = SnowballStemmer("english")
    elif lang == 'arabic':
        tokens = nltk.word_tokenize(text)
        stop_words = set(stopwords.words('arabic'))
        stemmer = ISRIStemmer()
    else:
        raise ValueError("Unsupported language")

    filtered = [stemmer.stem(w) for w in tokens if w not in stop_words]
    return filtered

print(preprocess_text("This is an English sentence."))
print(preprocess_text("هٰذا نَصٌّ بِاللُّغَةِ العَرَبِيَّةِ", lang='arabic'))

['english', 'sentenc']
['نص', 'لغة', 'عرب']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


***Task 2:Based Word Similarity***



In [30]:
import fasttext
import fasttext.util
import gzip
import shutil
import os

# Download Arabic fastText model (only need to do once)
# Check if the uncompressed file already exists
if not os.path.exists("cc.ar.300.bin"):
    # Download the compressed file
    if not os.path.exists("cc.ar.300.bin.gz"):
        !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz

    # Decompress the file
    with gzip.open("cc.ar.300.bin.gz", 'rb') as f_in:
        with open("cc.ar.300.bin", 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print("Decompression complete.")
else:
    print("Uncompressed model file already exists.")


# Load model from the uncompressed file
ft = fasttext.load_model("cc.ar.300.bin")

# Similarity between words
vec1 = ft.get_word_vector("الذكاء")
vec2 = ft.get_word_vector("الاصطناعي")

# Cosine similarity
import numpy as np
cos_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
print("Similarity between 'الذكاء' and 'الاصطناعي':", cos_sim)

Decompression complete.
Similarity between 'الذكاء' and 'الاصطناعي': 0.43998143


***Task 3: Build a Q&A Chatbot Using Transformers***

In [20]:
from transformers import pipeline

qa = pipeline("question-answering", model="aubmindlab/bert-base-arabertv2")

context = """
اللغة العربية هي إحدى أكثر اللغات انتشاراً في العالم، يتحدث بها أكثر من 400 مليون نسمة في الوطن العربي ومناطق أخرى.
تُعدّ من اللغات الرسمية في منظمة الأمم المتحدة.
"""

question = "كم عدد المتحدثين باللغة العربية؟"

result = qa(question=question, context=context)
print(f"Answer: {result['answer']}")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Answer: بها أكثر من 400 مليون نسمة في الوطن العربي


***Task 4: Topic Modeling with LDA on Arabic News***

In [7]:
from gensim import corpora, models
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords


def preprocess_arabic(text):
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    arabic_stop_words = set(stopwords.words('arabic'))
    return [word for word in tokens if word not in arabic_stop_words and len(word) > 2]

documents = [
    "السياسة في الشرق الأوسط معقدة ومتشابكة.",
    "يعاني الاقتصاد العالمي من التضخم.",
    "تتطور تقنيات الذكاء الاصطناعي بسرعة.",
    "الرياضة تساعد على تحسين الصحة العامة.",
]

processed_docs = [preprocess_arabic(doc) for doc in documents]
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(text) for text in processed_docs]

lda = models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)
topics = lda.print_topics()
for topic in topics:
    print(topic)

(0, '0.110*"التضخم" + 0.110*"يعاني" + 0.110*"الاقتصاد" + 0.110*"العالمي" + 0.038*"الأوسط" + 0.038*"السياسة" + 0.038*"معقدة" + 0.038*"بسرعة" + 0.037*"تقنيات" + 0.037*"تتطور"')
(1, '0.061*"تساعد" + 0.061*"الصحة" + 0.061*"تحسين" + 0.061*"العامة" + 0.061*"الرياضة" + 0.061*"الاصطناعي" + 0.061*"الشرق" + 0.061*"الذكاء" + 0.061*"ومتشابكة" + 0.061*"تتطور"')


***Task 5: Build a Custom Tokenizer for Arabic Diacritized Text***

In [8]:
import re

arabic_diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')

def remove_diacritics(text):
    return re.sub(arabic_diacritics, '', text)

def custom_tokenize(text):
    text = remove_diacritics(text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[ـ]+', '', text)
    tokens = text.split()
    return tokens

text = "اللُّغَةُ العَرَبِيَّةُ جَمِيلَةٌ"
tokens = custom_tokenize(text)
print(tokens)

['اللغة', 'العربية', 'جميلة']


In [23]:
!pip install pyarabic



In [28]:
!pip install farasa

Collecting farasa
  Downloading Farasa-0.0.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading Farasa-0.0.1-py2.py3-none-any.whl (12.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.6/12.6 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: farasa
Successfully installed farasa-0.0.1


In [28]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-3.0.0-py3-none-any.whl.metadata (10.0 kB)
Using cached pybind11-3.0.0-py3-none-any.whl (292 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4508439 sha256=53459919ca12db42ee2a16cc8228d54371d41240e275c746b76e72726b3ad939
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a513