In [3]:
file_path = "arwiki-20180920-corpus.xml"

with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

print(text[:1000])  # Print first 1000 characters to check


<?xml version="1.0" encoding="UTF-8"?>
<!-- ########################################################## -->
<!-- #                                                          -->
<!-- #  arwiki-20180920.xml                                     -->
<!-- #                                                          -->
<!-- #                                                          -->
<!-- #  (C) Copyright 2018 Wikimedia Foundation, Inc.           -->
<!-- #                        http://ar.wikipedia.org           -->
<!-- #  (C) Copyright this adaption: Peter Kolb                 -->
<!-- #               peter.kolb@linguatools.org                 -->
<!-- #               http://www.linguatools.org/tools/corpora/  -->
<!-- #                                                          -->
<!-- #  This work is made available under the Creative Commons  -->
<!-- #  Attribution-ShareAlike 3.0 License:                     -->
<!-- #  http://creativecommons.org/licenses/by-sa/3.0/legalcode -->
<!-- #   

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer

# Download NLTK Arabic resources
nltk.download('stopwords')

# Initialize Arabic stopwords and stemmer
arabic_stopwords = set(stopwords.words('arabic'))
stemmer = ISRIStemmer()

def preprocess_arabic(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove non-Arabic characters (keep only Arabic letters and spaces)
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Tokenize (split by spaces)
    words = text.split()
    
    # Remove stopwords
    words = [word for word in words if word not in arabic_stopwords]
    
    # Apply stemming
    words = [stemmer.stem(word) for word in words]
    
    return ' '.join(words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
clean_paragraphs = []

file_path = 'arwiki-20180920-corpus.xml' 

MAX_LINES = 10000  # Limit lines for quick testing

with open(file_path, 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        if i >= MAX_LINES:
            break
        line = line.strip()

        # Skip XML tags, HTML comments, and empty lines
        if line.startswith('<?xml') or line.startswith('<!--') or line.startswith('<') or line == '':
            continue

        # Now clean and save real Arabic text
        clean_line = preprocess_arabic(line)
        
        # Only add non-empty cleaned text
        if clean_line.strip() != '':
            clean_paragraphs.append(clean_line)

In [26]:
print(f" Finished preprocessing! Total cleaned paragraphs: {len(clean_paragraphs)}")

 Finished preprocessing! Total cleaned paragraphs: 814


In [27]:
# Create DataFrame
df = pd.DataFrame({'paragraph': clean_paragraphs})
df = df.fillna('')


In [28]:
# Save cleaned sample
df.to_csv('cleaned_arabic_wiki_sample.csv', index=False, encoding='utf-8')

print(" Cleaned Arabic sample saved successfully to cleaned_arabic_wiki_sample.csv")

 Cleaned Arabic sample saved successfully to cleaned_arabic_wiki_sample.csv


In [29]:
# Print a sample paragraph
if len(clean_paragraphs) > 0:
    print("\n Sample Context:\n", clean_paragraphs[6])


 Sample Context:
 زخر ياه ارض سطح بعد شكل تنع حيي يه، نقص زدد عمق عمد حية ائة بحر حيط وجد ماء كعمل سسي كتل حيوية، ويك عمل حدد ناج كمة قدر غذي نبت ذاب وسف ركب ترج مونيوم نتر ضفة ثني كسد كرب


In [32]:
print(df.columns)


Index(['paragraph'], dtype='object')


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = df['paragraph'].tolist()  # Use all cleaned texts from DataFrame

# Initialize TF-IDF
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)

print(X.shape)  # Number of documents x number of features
print(tfidf.get_feature_names_out())  # See the vocabulary

# Save cleaned sample
df.to_csv('cleaned_arabic_wiki_sample.csv', index=False, encoding='utf-8')
print("Cleaned Arabic sample saved successfully to cleaned_arabic_wiki_sample.csv")

# Print a sample paragraph
if len(clean_paragraphs) > 6:  # Notice: check if index 6 exists
    print("\nSample Context:\n", clean_paragraphs[6])


(814, 4111)
['آبر' 'آتم' 'آخر' ... 'يوم' 'يون' 'يوه']
Cleaned Arabic sample saved successfully to cleaned_arabic_wiki_sample.csv

Sample Context:
 زخر ياه ارض سطح بعد شكل تنع حيي يه، نقص زدد عمق عمد حية ائة بحر حيط وجد ماء كعمل سسي كتل حيوية، ويك عمل حدد ناج كمة قدر غذي نبت ذاب وسف ركب ترج مونيوم نتر ضفة ثني كسد كرب


In [None]:
from gensim.models import Word2Vec

# Prepare the corpus: list of lists of words
# If you have cleaned texts stored in the DataFrame `df`
# (after confirming the correct column name)
corpus = df['paragraph'].apply(lambda x: x.split()).tolist()


# Train the Word2Vec model
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)

# Example: Get the vector for a specific word
try:
    vector = model.wv['لغة']  # Replace 'لغة' with any Arabic word you want
    print(vector)
except KeyError:
    print("The word 'لغة' was not found in the vocabulary.")


[-0.02465982  0.02210858  0.02295838  0.00122577  0.00963704 -0.07336017
  0.04456649  0.10314479 -0.03340747 -0.02782421 -0.03188172 -0.05230464
 -0.02025422  0.01345363 -0.00024046 -0.04092807 -0.00242839 -0.04591601
  0.00911411 -0.09710969  0.01579804  0.01291465  0.03608207 -0.01952653
 -0.00371001 -0.0189515  -0.04029905 -0.04442123 -0.03824596 -0.00056775
  0.04286265  0.01902711  0.00808944 -0.03060136 -0.00857342  0.0588086
 -0.01053068 -0.03707424 -0.02983358 -0.08111031 -0.01265097 -0.04168354
 -0.0084947   0.01928122  0.04559947 -0.03364554 -0.03676334 -0.00363762
  0.04084597  0.04047025  0.02170288 -0.0373952   0.00470576  0.01812375
 -0.04285415  0.04288952  0.00936949  0.00376055 -0.05506981  0.01030207
  0.01562598  0.02727741 -0.01852809 -0.02495131 -0.07165431  0.02490312
  0.02929877  0.04068363 -0.05740259  0.04528158 -0.0264707   0.02896178
  0.04488745 -0.02868368  0.04104752  0.03655614  0.02235711 -0.01168096
 -0.04695265  0.04094616 -0.01395918 -0.01067196 -0.