In [13]:
import json
import pandas as pd
import nltk
import re
nltk.data.path = ['/home/albin/nltk_data']
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from pathlib import Path
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.data import find
find("tokenizers/punkt")


[nltk_data] Downloading package punkt_tab to /home/albin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/albin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


FileSystemPathPointer('/home/albin/nltk_data/tokenizers/punkt')

In [17]:
class Debug:
    def read_jsonl_to_df(self, filename: str) -> pd.DataFrame:
        r = []
        with open(f"../output/{filename}.jsonl", 'r') as reader:
            for row in reader:
                json_row = json.loads(row)
                r.append(pd.DataFrame(json_row, index=[0]))
            
        df = pd.concat(r, ignore_index=True)
        duplicates = df[df.duplicated() == True]
        data   = df.drop_duplicates()
        return df, data, duplicates


class Statistics:
    def average_stats(self, data: pd.DataFrame) -> None:
        data['words'] = data['content'].apply(lambda x: len(x.split()))
        data['chars'] = data['content'].apply(lambda x: len(x))
        data['sentences'] = data['content'].apply(lambda x: len(x.split('.')))
        data['paragraphs'] = data['content'].apply(lambda x: len(x.split('\n')))
        data['avg_word_length'] = data['chars'] / data['words']
        result = data.groupby('media_type').agg({'sentences': 'mean', 'words': 'mean', 'chars': 'mean', 'paragraphs': 'mean', 'avg_word_length': 'mean'}).reset_index().round(0)
        
        print(
        "N sample: %s \n\n"
        "Average lenght of words, sentences, paragraphs:" \
        "\n\n %s" %(len(data), result), end="\n\n"
        )

    def nltk_stats(self, title, sentence) -> None:
        print("nltk.data.paths: ", nltk.data.path)

        stop_words = nltk.corpus.stopwords.words('english')
        tokens: list = nltk.word_tokenize(sentence)
        filtered_sentence = [w for w in tokens if not w.lower() in stop_words]
        tagged: list[tuple] = nltk.pos_tag(filtered_sentence)
        nouns = [word for word, pos in tagged if pos in ['NNP']]
        for noun in nouns:
            print(
            "Title: %s \n",
            {"Entity": noun,
            "Sentence": sentence,
            }
            )
    
    def run(self):
        data: pd.DataFrame = self.read_jsonl_to_df()
        self.average_stats(data)
        for i in data.iterrows():
            self.nltk_stats(i[1]['title'], i[1]['content'])
        
df, data, duplicates = Debug().read_jsonl_to_df('aljazeera_links')
print("Original df: %s, After removed: %s, NR of duplicates: %s" % (len(df), len(data), len(duplicates)))

Original df: 84, After removed: 84, NR of duplicates: 0


In [20]:
data.sort_values(by='title', ascending=True)

Unnamed: 0,website,url,link,title,media_type,date,content
64,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/news/2025/5/17/gaza-...,Arab League calls for funds to rebuild Gaza at...,,17 May 2025,
70,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/news/2025/5/16/break...,Breaking down a deadly week in Gaza as Israel ...,,16 May 2025,
45,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/program/newsfeed/202...,British presenter Gary Lineker steps down over...,,19 May 2025,
79,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/video/newsfeed/2025/...,Cannes premiere of film profiling slain Gaza j...,,15 May 2025,
50,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/opinions/2025/5/19/c...,"Cheer up, people of Gaza! You’ll get killed on...",,19 May 2025,
...,...,...,...,...,...,...,...
7,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/video/newsfeed/2025/...,What we know about the killing of two Israeli ...,,22 May 2025,
10,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/news/2025/5/22/which...,Which countries trade the most with Israel and...,,22 May 2025,
1,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/opinions/2025/5/22/w...,Words won’t save Gaza – The West must stop ena...,,22 May 2025,
81,aljazeera,https://www.aljazeera.com/tag/israel-palestine...,https://www.aljazeera.com/gallery/2025/5/15/on...,‘One long Nakba’: Palestinians mark 77 years s...,,15 May 2025,


In [98]:
def identify_type(link):
    link = link.removeprefix('https://www.aljazeera.com/')
    match = re.search('(.*?)(?=/\d{4}/\d{1,2}/\d{1,2}/)', link)
    if match:
        return match.group(1)
    else:
        return None

df['media_type'] = df['link'].map(identify_type)
df['media_type'].value_counts()

media_type
news                              1044
program/newsfeed                   462
news/liveblog                      246
opinions                           119
gallery                            117
features                            77
program/inside-story                51
video/newsfeed                      47
program/quotable                    31
program/the-bottom-line             18
news/longform                       15
podcasts                            14
program/the-stream                  12
features/longform                   10
program/compare-contrast             7
economy                              7
program/counting-the-cost            6
program/talk-to-al-jazeera           5
program/the-listening-post           5
video/inside-story                   5
sports                               5
program/between-us                   4
program/centre-stage                 4
program/featured-documentaries       4
program/fault-lines                  3
program/digido

In [99]:
l = df.query('media_type == "opinions"').reset_index()
len(df)

2340