In [1]:
import json
import pandas as pd
import nltk
import re
nltk.data.path = ['/home/albin/nltk_data']
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from pathlib import Path
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.data import find
find("tokenizers/punkt")


[nltk_data] Downloading package punkt_tab to /home/albin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/albin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


FileSystemPathPointer('/home/albin/nltk_data/tokenizers/punkt')

In [2]:
class Debug:
    def read_jsonl_to_df(self, filename: str) -> pd.DataFrame:
        r = []
        with open(f"../output/{filename}.jsonl", 'r') as reader:
            for row in reader:
                json_row = json.loads(row)
                r.append(pd.DataFrame(json_row, index=[0]))
            
        df = pd.concat(r, ignore_index=True)
        duplicates = df[df.duplicated() == True]
        data   = df.drop_duplicates()
        return df, data, duplicates


class Statistics:
    def average_stats(self, data: pd.DataFrame) -> None:
        data['words'] = data['content'].apply(lambda x: len(x.split()))
        data['chars'] = data['content'].apply(lambda x: len(x))
        data['sentences'] = data['content'].apply(lambda x: len(x.split('.')))
        data['paragraphs'] = data['content'].apply(lambda x: len(x.split('\n')))
        data['avg_word_length'] = data['chars'] / data['words']
        result = data.groupby('media_type').agg({'sentences': 'mean', 'words': 'mean', 'chars': 'mean', 'paragraphs': 'mean', 'avg_word_length': 'mean'}).reset_index().round(0)
        
        print(
        "N sample: %s \n\n"
        "Average lenght of words, sentences, paragraphs:" \
        "\n\n %s" %(len(data), result), end="\n\n"
        )

    def nltk_stats(self, title, sentence) -> None:
        print("nltk.data.paths: ", nltk.data.path)

        stop_words = nltk.corpus.stopwords.words('english')
        tokens: list = nltk.word_tokenize(sentence)
        filtered_sentence = [w for w in tokens if not w.lower() in stop_words]
        tagged: list[tuple] = nltk.pos_tag(filtered_sentence)
        nouns = [word for word, pos in tagged if pos in ['NNP']]
        for noun in nouns:
            print(
            "Title: %s \n",
            {"Entity": noun,
            "Sentence": sentence,
            }
            )
    
    def run(self):
        data: pd.DataFrame = self.read_jsonl_to_df()
        self.average_stats(data)
        for i in data.iterrows():
            self.nltk_stats(i[1]['title'], i[1]['content'])
        
df, data, duplicates = Debug().read_jsonl_to_df('aljazeera_links')
print("Original df: %s, After removed: %s, NR of duplicates: %s" % (len(df), len(data), len(duplicates)))

Original df: 2133, After removed: 2133, NR of duplicates: 0


In [3]:
data.sort_values(by='title', ascending=True)

Unnamed: 0,website,url,link,title,media_type,date,content
563,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/program/newsfeed/202...,1-year-old among victims of Israel’s indiscrim...,,20 Mar 2025,
1907,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/program/newsfeed/202...,10-year-old brother of Shaban al-Dalou dies fr...,,18 Oct 2024,
2099,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/news/2024/10/3/israe...,18 killed in Israeli strike on West Bank’s Tul...,,3 Oct 2024,
1695,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/program/digidocs/202...,24 Hours,,10 Nov 2024,
509,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/video/newsfeed/2025/...,24 hours of Israeli atrocities in Gaza,,25 Mar 2025,
...,...,...,...,...,...,...,...
1618,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/program/quotable/202...,"“No excuse” for FIFA, UEFA silence over Israel...",,17 Nov 2024,
1333,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/program/quotable/202...,“No infrastructure left” in Palestinian Yarmou...,,27 Dec 2024,
1045,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/program/quotable/202...,“No words to quite describe the scale of devas...,,27 Jan 2025,
1235,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/program/quotable/202...,"“Remarkable” that after 100 days of siege, peo...",,12 Jan 2025,


In [4]:
def identify_type(link):
    link = link.removeprefix('https://www.aljazeera.com/')
    match = re.search('(.*?)(?=/\d{4}/\d{1,2}/\d{1,2}/)', link)
    if match:
        return match.group(1)
    else:
        return None

df['media_type'] = df['link'].map(identify_type)
df['media_type'].value_counts()

media_type
news                              947
program/newsfeed                  412
news/liveblog                     223
opinions                          115
gallery                            97
features                           71
video/newsfeed                     62
program/inside-story               42
program/quotable                   31
program/the-bottom-line            16
podcasts                           14
news/longform                      14
program/the-stream                 10
features/longform                   8
program/compare-contrast            7
video/inside-story                  5
sports                              5
program/counting-the-cost           4
program/featured-documentaries      4
program/between-us                  4
program/talk-to-al-jazeera          4
program/the-listening-post          4
economy                             4
program/centre-stage                4
program/digidocs                    3
program/al-jazeera-close-up         2
p

In [5]:
l = df.query('media_type == "opinions"').reset_index()
len(df)

1574