Required Libraries

In [None]:
import zipfile
import os
import glob
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt

Extracting zip

In [None]:

# Where the archive lives and where its contents should be unpacked.
extract_folder = 'extracted_corpus'
zip_file_path = '/content/Khaleej-2004-utf8.zip'

# Unpack every member of the archive into extract_folder; the context
# manager closes the archive once extraction finishes.
with zipfile.ZipFile(zip_file_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_folder)


Locating the International news folder inside the extracted corpus

In [None]:
# Path of the "International news" category inside the extracted archive.
# Only relative components are joined here: when a later argument to
# os.path.join is absolute, every earlier component (extract_folder) is
# discarded, which would tie the notebook to one specific working directory.
international_folder = os.path.join(extract_folder, 'Khaleej-2004', 'International news')

Building a corpus consisting of the text of all HTML files in the International news folder

In [None]:
# One plain-text string per HTML article in the International news folder.
corpus = []

# glob returns matches in arbitrary (filesystem-dependent) order; sorting
# keeps the corpus order, and everything printed from it later, reproducible.
html_paths = sorted(glob.glob(os.path.join(international_folder, '*.html')))

# Fail fast: without this, a wrong folder path silently produces an empty
# corpus and every later cell reports empty results instead of an error.
if not html_paths:
    raise FileNotFoundError(f"No .html files found in {international_folder!r}")

for file_path in html_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    # Parsing does not need the file handle, so it happens after the file is closed.
    soup = BeautifulSoup(html_content, 'html.parser')
    # get_text() drops the markup and keeps only the text nodes.
    text_content = soup.get_text()
    # Add any additional preprocessing steps here
    corpus.append(text_content)


Downloading the NLTK data resources (stop words, tokenizer, tagger, WordNet)

In [None]:
# Download Arabic stop words
# The 'stopwords' download sits before the stopwords.words('arabic') call
# below, which reads the Arabic list from that package.
import nltk
nltk.download('stopwords')
# Stored as a set so the per-token membership test during filtering is cheap.
stop_words = set(stopwords.words('arabic'))
# 'punkt' is a tokenizer package (it unpacks under tokenizers/); presumably
# required by word_tokenize in the preprocessing cell -- TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Tagger model package (it unpacks under taggers/); presumably the model
# behind the pos_tag call used later in the notebook -- TODO confirm.
nltk.download('averaged_perceptron_tagger')
# WordNet data; presumably what WordNetLemmatizer reads -- TODO confirm.
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [None]:
# Build the lemmatizer and the stemmer once, up front, so later cells can
# reuse the same instances instead of constructing them per document.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

Preprocessing the corpus

In [None]:
# One list of cleaned tokens per document, in the same order as `corpus`.
preprocessed_corpus = []

for document in corpus:
    # Split the raw article text into word tokens.
    tokens = word_tokenize(document)

    # Drop tokens found in the Arabic stop-word set.
    # NOTE(review): the recorded top-10 output still contains very common
    # function words, so some spelling variants are apparently not covered by
    # the stop-word list -- consider normalising tokens before this test.
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # The stemming pass that used to run here was removed: its result was
    # never read, so it only added a full extra pass over every document.

    # Lemmatization
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet; the
    # Arabic tokens in the recorded output look unchanged by it -- confirm
    # whether an Arabic-specific stemmer/lemmatizer is wanted instead.
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    preprocessed_corpus.append(lemmatized_tokens)

In [None]:
# Part-of-Speech Tagging
# Tag each document's tokens on their own, then flatten the per-document
# results into one list of (token, tag) pairs covering the whole corpus.
# NOTE(review): in the recorded output nearly every token is tagged NNP,
# which suggests the tagger in use is not suited to Arabic -- confirm.
pos_tags = [
    tagged_pair
    for document_tokens in preprocessed_corpus
    for tagged_pair in pos_tag(document_tokens)
]

In [None]:
# Show only the first 30 (token, tag) pairs as a quick sanity check.
print(pos_tags[:30])

[('سوريا', 'JJ'), ('توفد', 'NNP'), ('المعلم', 'NNP'), ('بيروت', 'NNP'), ('لإجراء', 'NNP'), ('محادثات', 'NNP'), ('موالين', 'NNP'), ('ومعارضين', 'NNP'), ('لتواجدها', 'NNP'), ('بلبنان', 'NNP'), ('انتهجت', 'NNP'), ('سوريا', 'NNP'), ('أسلوبا', 'NNP'), ('جديدا', 'NNP'), ('التعامل', 'NNP'), ('لبنان', 'NNP'), ('بإرسالها', 'NNP'), ('الثلاثاء', 'NNP'), ('مسئولا', 'NNP'), ('رسميا', 'NNP'), ('فتح', 'NNP'), ('قنوات', 'NNP'), ('الحوار', 'NNP'), ('مسئولين', 'NNP'), ('سياسيين', 'NNP'), ('موالين', 'NNP'), ('ومعارضين', 'NNP'), ('لوجودها', 'NNP'), ('لبنان', 'NNP'), ('ووصل', 'NNP')]


In [None]:
# Flatten the per-document token lists into one corpus-wide token list,
# then count how many times each token occurs.
all_tokens = [token for doc_tokens in preprocessed_corpus for token in doc_tokens]
word_freq = Counter()
word_freq.update(all_tokens)

Printing the 10 most frequent words and the frequency of each part-of-speech tag

In [None]:
# Report the ten most common tokens, one "token: count" line each.
print("Top 10 Most Frequent Words:")
top_words = word_freq.most_common(10)
for word, freq in top_words:
    print(f"{word}: {freq}")

# Tally how often each tag occurs across all (token, tag) pairs.
pos_tag_counts = Counter(tag_label for _, tag_label in pos_tags)

# Report one "tag: count" line per tag, in first-seen order.
print("\nNumber of Each Part-of-Speech Tag:")
for tag in pos_tag_counts:
    count = pos_tag_counts[tag]
    print(f"{tag}: {count}")

Top 10 Most Frequent Words:
ان: 9518
الى: 5274
وقال: 3158
امس: 1911
العراق: 1777
الانتخابات: 1610
المتحدة: 1401
انه: 1351
الفلسطينية: 1330
رئيس: 1286

Number of Each Part-of-Speech Tag:
JJ: 953
NNP: 429723
NN: 953
