### __Main preprocessing for all sources__

#### __Imports__

In [1]:
import os, json, copy, sys
import numpy as np
from collections import Counter

sys.path.append(os.path.dirname(os.path.abspath('..')))
from utils.helpers import rename_dictionary_keys, assign_unique_author_ids
from utils.text_analysis_functions import data_cleaning, filtering_pipelines, cleaning_pipelines 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Optional one-time setup, left disabled on purpose: uncomment to install the
# package into the kernel's environment (consider pinning a version).
# %pip install gr-nlp-toolkit 

#### __Functions__

In [3]:
def compute_scalar_weights(post_scores, comment_counts, scores_min, scores_max, comment_count_min, comment_count_max):
    """
    Compute scalar weights for posts based on normalized post scores and comment counts.

    Both inputs are min-max normalized with the supplied bounds and then
    averaged with equal weight (0.5 each). When a pair of bounds is degenerate
    (max == min) the corresponding normalized term is defined as 0 instead of
    dividing by zero.

    Args:
        - post_scores (scalar or array-like): Raw score(s) for each post.
        - comment_counts (scalar or array-like): Comment count(s) for each post.
        - scores_min (number): Lower bound used to normalize post_scores.
        - scores_max (number): Upper bound used to normalize post_scores.
        - comment_count_min (number): Lower bound used to normalize comment_counts.
        - comment_count_max (number): Upper bound used to normalize comment_counts.

    Returns:
        - numpy.ndarray (or NumPy scalar for scalar inputs): Weight per post.
          Values lie in [0, 1] as long as every input lies within its bounds.
    """
    p = np.array(post_scores, dtype=float)
    c = np.array(comment_counts, dtype=float)

    # normalized post scores
    if scores_max != scores_min:
        p_norm = (p - scores_min) / (scores_max - scores_min)
    else:
        p_norm = np.zeros_like(p)

    # normalized comment counts (the fallback must mirror `c`, the array defined above)
    if comment_count_max != comment_count_min:
        c_norm = (c - comment_count_min) / (comment_count_max - comment_count_min)
    else:
        c_norm = np.zeros_like(c)

    # final scalar: equal-weight average of the two normalized terms
    return 0.5 * p_norm + 0.5 * c_norm

#### __Data__

**We check the following:**

- ***Reddit posts*** from targeted subreddits
- ***YouTube titles*** based on keyword search

<ins>Filtering criteria:</ins> Keyword stem appearance

**Required Datasets:**

- ***YouTube video metadata***: yt_videos
- ***YouTube video comments***: yt_comments
- ***Reddit post metadata***: rd_posts
- ***Reddit post comments***: rd_comments
- ***OpenGov unified comments***: ogov_comments

In [4]:
# Repository root: four directory levels above this notebook's absolute path.
# NOTE: os.path.abspath resolves against the current working directory, so the
# notebook has to be executed from the folder that contains it.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath("main_preprocessing.ipynb")))))

# Paths are built with os.path.join so they work on any OS, not only Windows.
## Reddit
reddit_path = os.path.join(project_root, "outputs", "api_queried", "reddit_api")
## YouTube
youtube_path = os.path.join(project_root, "outputs", "api_queried", "youtube_api")
## OpenGov
opengov_path = os.path.join(project_root, "outputs", "site_scraped", "same_sex_marriage_law")

## YouTube
# videos
with open(os.path.join(youtube_path, "youtube_scraped_videos.json"), "r", encoding="utf-8") as f:
    yt_videos = json.load(f)
## comments
youtube_jsons_path = os.path.join(youtube_path, "youtube_comments")
# sorted(): os.listdir returns names in arbitrary order; a fixed order keeps
# the combined list (and any later "keep first" de-duplication) reproducible.
youtube_jsons = sorted(file for file in os.listdir(youtube_jsons_path) if file.endswith('.json'))
youtube_comments = []  # combine all youtube comment .json files

for filename in youtube_jsons:
    filepath = os.path.join(youtube_jsons_path, filename)
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
        youtube_comments.extend(data)

## OpenGov
opengov_files = sorted(file for file in os.listdir(opengov_path) if file.endswith('.json'))
ogov_comments = []  # combine opengov .json
for filename in opengov_files:
    filepath = os.path.join(opengov_path, filename)
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
        ogov_comments.extend(data)

## Reddit
## posts
with open(os.path.join(reddit_path, "reddit_scraped_post.json"), "r", encoding="utf-8") as f:
    reddit_posts = json.load(f)
## comments
with open(os.path.join(reddit_path, "reddit_scraped_comments.json"), "r", encoding="utf-8") as f:
    reddit_comments = json.load(f)


#### __Preprocessing__

##### Object Initialization

In [None]:
# Build the project's text-analysis helper objects once; later cells reuse
# filtering_pipe (title filtering) and cleaning_pipe (text cleaning).
cleaner, filtering_pipe, cleaning_pipe = (
    data_cleaning(),
    filtering_pipelines(),
    cleaning_pipelines(),
)

##### Homogenize dictionaries

In [6]:
# Work on deep copies so the raw loaded objects stay untouched.
yt_videos_edited, youtube_comments_edited, reddit_comments_edited = (
    copy.deepcopy(raw) for raw in (yt_videos, youtube_comments, reddit_comments)
)

# Give every record the same "id" key, whatever the platform called it.
for records, platform_key in (
    (yt_videos_edited, "video_id"),
    (youtube_comments_edited, "video_id"),
    (reddit_comments_edited, "post_id"),
):
    for record in records:
        rename_dictionary_keys(record, platform_key, "id")

##### Duplicates

In [7]:
def _split_unique_and_repeated(ids):
    """Return (distinct ids in first-seen order, ids that occur more than once).

    Repeated ids are listed once each, in the order their second occurrence
    is met. Sets are used for membership so the scan stays linear.
    """
    seen, repeated = set(), set()
    unique_list, non_unique_list = [], []
    for item_id in ids:
        if item_id not in seen:
            seen.add(item_id)
            unique_list.append(item_id)
        elif item_id not in repeated:
            repeated.add(item_id)
            non_unique_list.append(item_id)
    return unique_list, non_unique_list


yt_ids = [thread["id"] for thread in youtube_comments_edited]
reddit_ids = [thread["id"] for thread in reddit_comments_edited]

# The YouTube lists must be derived from yt_ids (previously they were built
# from reddit_ids, so they just duplicated the Reddit result).
youtube_unique_list, youtube_non_unique_list = _split_unique_and_repeated(yt_ids)
reddit_unique_list, reddit_non_unique_list = _split_unique_and_repeated(reddit_ids)

In [8]:
# Sanity check: show the keys of one YouTube comment record after renaming
# (index 5 is just a sample; it requires at least six records to be loaded).
youtube_comments_edited[5].keys()

dict_keys(['comments', 'id'])

In [9]:
def _keep_first_per_id(records):
    """Return records in their original order, keeping only the first one per "id".

    Uses a set for the membership test (ids are assumed hashable, e.g. strings)
    so the pass is linear instead of quadratic.
    """
    seen_ids = set()
    kept = []
    for record in records:
        record_id = record["id"]
        if record_id not in seen_ids:
            seen_ids.add(record_id)
            kept.append(record)
    return kept


# For YouTube
filtered_youtube_comments = _keep_first_per_id(youtube_comments_edited)
seen_youtube_ids = [record["id"] for record in filtered_youtube_comments]

# For Reddit
filtered_reddit_comments = _keep_first_per_id(reddit_comments_edited)
seen_reddit_ids = [record["id"] for record in filtered_reddit_comments]

In [10]:
# From here on the *_edited names refer to the de-duplicated lists.
youtube_comments_edited, reddit_comments_edited = (
    filtered_youtube_comments,
    filtered_reddit_comments,
)

##### Filtering content

In [11]:
# Greek search phrases passed to filtering_pipe.filter_content to decide which
# titles are on-topic. Rough English glosses, in order: "same-sex couples",
# "same-sex child adoption", "equality in civil marriage", "same-sex marriage",
# "marriage of same-sex couples", "same-sex".
greek_keywords = ["ομόφυλα ζευγάρια",
                  "ομόφυλα τεκνοθεσία",
                  "ισότητα στο πολιτικό γάμο",
                  "γάμος ομόφυλων",
                  "γάμος ομόφυλων ζευγαριών",
                  "ομόφυλα"]

In [12]:
# Comment threads per platform (Reddit first, YouTube second) and the
# title-bearing objects (videos / posts) the filter is evaluated on.
comment_buckets = [reddit_comments_edited, youtube_comments_edited]
object_buckets = [yt_videos_edited, reddit_posts]

# ids of every video/post whose title passes the keyword filter
valid_ids = [
    item["id"]
    for bucket in object_buckets
    for item in bucket
    if filtering_pipe.filter_content(item["title"], greek_keywords)
]

In [13]:
reddit_comments_filtered, youtube_comments_filtered, blocked = [], [], []

for bucket_pos, bucket in enumerate(comment_buckets):
    # Position 0 is routed to the Reddit list; anything else to the YouTube list.
    kept = reddit_comments_filtered if bucket_pos == 0 else youtube_comments_filtered
    for thread in bucket:
        if thread["id"] not in valid_ids:
            # remember the id of every thread whose parent title was filtered out
            blocked.append(thread["id"])
            continue
        kept.append(thread)

In [14]:
# if not printing anything, there are not blocked videos
for id in blocked:
    for vid in yt_videos_edited:
        if vid["id"] == id:
            filtering_pipe.filter_content(vid["title"], greek_keywords)

In [None]:
# Each record holds the comments of one video/post, so these are counts of
# videos/posts with comments before and after the title filter.
yt_initial, yt_kept = len(youtube_comments_edited), len(youtube_comments_filtered)
reddit_initial, reddit_kept = len(reddit_comments_edited), len(reddit_comments_filtered)

print("Initial YT video count:", yt_initial, "Filtered count:", yt_kept)
print("Initial Reddit post count:", reddit_initial, "Filtered count:", reddit_kept)
print("OpenGov comment count", len(ogov_comments))

In [16]:
# Flat lists of raw comment texts per source (no ids or metadata).
youtube_plain_comments = [
    entry["body"]
    for thread in youtube_comments_filtered
    for entry in thread["comments"]
]

ogov_plain_comments = [record["article_text"] for record in ogov_comments]

reddit_plain_comments = [
    entry["body"]
    for thread in reddit_comments_filtered
    for entry in thread["comments"]
]

##### Cleaning

In [None]:
# Cleaning steps per source; each list is handed to cleaning_pipe.text_cleaning.
reddit_steps_to_run = ["normalize", "reddit_specific", "transliterate"]
yt_steps_to_run = ["normalize", "youtube_specific", "transliterate"]
ogov_steps_to_run = ["normalize", "transliterate"]


def _clean_thread_bodies(threads, steps):
    """Return deep copies of `threads` with every comment "body" cleaned using `steps`."""
    cleaned_threads = []
    for thread in threads:
        thread_copy = copy.deepcopy(thread)  # never touch the filtered originals
        for entry in thread_copy["comments"]:
            entry["body"] = cleaning_pipe.text_cleaning(entry["body"], steps)
        cleaned_threads.append(thread_copy)
    return cleaned_threads


# YouTube
youtube_cleaned = _clean_thread_bodies(youtube_comments_filtered, yt_steps_to_run)

# Reddit
reddit_cleaned = _clean_thread_bodies(reddit_comments_filtered, reddit_steps_to_run)

# OpenGov: flat records whose text lives under "article_text" instead of nested comments
ogov_cleaned = []
for record in ogov_comments:
    record_copy = copy.deepcopy(record)
    record_copy["article_text"] = cleaning_pipe.text_cleaning(record_copy["article_text"], ogov_steps_to_run)
    ogov_cleaned.append(record_copy)

In [18]:
def _has_text(record, field):
    """True when record[field] is present and not just whitespace after cleaning."""
    return bool((record.get(field) or "").strip())


# Threads are rebuilt with only "id" and "comments"; comments whose cleaned
# body is empty are dropped (a thread may end up with an empty comment list).
youtube_cleaned = [
    {"id": thread["id"], "comments": [entry for entry in thread["comments"] if _has_text(entry, "body")]}
    for thread in youtube_cleaned
]

reddit_cleaned = [
    {"id": thread["id"], "comments": [entry for entry in thread["comments"] if _has_text(entry, "body")]}
    for thread in reddit_cleaned
]

# OpenGov records are kept whole, but only when their cleaned text is non-empty.
ogov_cleaned = [record for record in ogov_cleaned if _has_text(record, "article_text")]

In [None]:
# Record counts after cleaning: (YouTube threads, Reddit threads, OpenGov entries).
len(youtube_cleaned), len(reddit_cleaned), len(ogov_cleaned)

In [20]:
# assign_unique_author_ids is a project helper; it is expected to hand back the
# author mapping followed by the three datasets (unpacked in that order below).
_author_id_outputs = assign_unique_author_ids(youtube_cleaned, reddit_cleaned, ogov_cleaned)
(
    author_map,
    youtube_cleaned_with_ids,
    reddit_cleaned_with_ids,
    ogov_cleaned_with_ids,
) = _author_id_outputs

##### Scaling

Reddit

In [21]:
# Normalization bounds are taken over ALL loaded Reddit posts.
like_counts = [post_meta["like_count"] for post_meta in reddit_posts]
comment_counts = [post_meta["num_comments"] for post_meta in reddit_posts]

scores_min = min(like_counts)
scores_max = max(like_counts)
comment_count_min = min(comment_counts)
comment_count_max = max(comment_counts)

In [22]:
# Bounds shared by every call; computed once over the full post list.
reddit_bounds = {
    "scores_min": scores_min,
    "scores_max": scores_max,
    "comment_count_min": comment_count_min,
    "comment_count_max": comment_count_max,
}

# Store each post's popularity weight on the post dict itself.
for post in reddit_posts:
    post["popularity_scaler"] = compute_scalar_weights(post["like_count"], post["num_comments"], **reddit_bounds)

In [23]:
# post id -> popularity weight of that post
post_scaler_lookup = {
    post["id"]: post["popularity_scaler"]
    for post in reddit_posts
}

for thread in reddit_cleaned_with_ids:
    post_id = thread.get("id")
    # Default to 0 for ids without post metadata (same convention as the
    # YouTube cell); a None scaler would make the multiplication below fail.
    scaler = post_scaler_lookup.get(post_id, 0)
    for comment in thread.get("comments", []):
        comment["popularity_scaler"] = scaler
        # scale likes; abs() so the magnitude of a negative like_count is weighted
        like_count = comment.get("like_count", 0)
        comment["like_scaled"] = scaler * abs(like_count)

YouTube

In [24]:
# Normalization bounds over all loaded YouTube videos.
yt_like_counts = [meta["like_count"] for meta in yt_videos_edited]
yt_comment_counts = [meta["comment_count"] for meta in yt_videos_edited]

yt_scores_min = min(yt_like_counts)
yt_scores_max = max(yt_like_counts)
yt_comment_count_min = min(yt_comment_counts)
yt_comment_count_max = max(yt_comment_counts)

yt_bounds = {
    "scores_min": yt_scores_min,
    "scores_max": yt_scores_max,
    "comment_count_min": yt_comment_count_min,
    "comment_count_max": yt_comment_count_max,
}

# Store each video's popularity weight on the video dict itself.
for video in yt_videos_edited:
    video["popularity_scaler"] = compute_scalar_weights(video["like_count"], video["comment_count"], **yt_bounds)

# video id -> popularity weight of that video
yt_scaler_lookup = {video["id"]: video["popularity_scaler"] for video in yt_videos_edited}

for thread in youtube_cleaned_with_ids:
    vid_id = thread.get("id")
    # threads without video metadata get weight 0
    scaler = yt_scaler_lookup.get(vid_id, 0)

    for comment in thread.get("comments", []):
        like_count = comment.get("like_count", 0)
        comment.update({
            "popularity_scaler": scaler,
            # scale likes
            "like_scaled": scaler * like_count,
        })

##### Saving

In [25]:
# <three directory levels above the current working directory>/working_data,
# joined with os.path.join so the path works on any OS, not only Windows.
output_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(".")))),
    "working_data",
)
# Create the folder on first run instead of failing with FileNotFoundError.
os.makedirs(output_path, exist_ok=True)

# file name -> object to serialize
outputs_to_save = {
    "youtube_cleaned.json": youtube_cleaned_with_ids,
    "reddit_cleaned.json": reddit_cleaned_with_ids,
    "ogov_cleaned.json": ogov_cleaned_with_ids,
    "author_id_map.json": author_map,
}

for out_name, payload in outputs_to_save.items():
    with open(os.path.join(output_path, out_name), "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII (Greek) text readable in the files
        json.dump(payload, f, ensure_ascii=False, indent=2)

@misc{loukas-etal-2025-greek-nlp-toolkit,
    title={GR-NLP-TOOLKIT: An Open-Source NLP Toolkit for Modern Greek}, 
    author={Lefteris Loukas and Nikolaos Smyrnioudis and Chrysa Dikonomaki and Spyros Barbakos and Anastasios Toumazatos and John Koutsikakis and Manolis Kyriakakis and Mary Georgiou and Stavros Vassos and John Pavlopoulos and Ion Androutsopoulos},
    year={2025},
    eprint={2412.08520},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2412.08520}, 
}