### Anonymize comments & tabularize data

In [1]:
import os, json, sys
from datetime import datetime, timezone
import uuid
import pandas as pd

sys.path.append(os.path.dirname(os.path.abspath('..')))
from utils.text_analysis_functions import data_cleaning

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def parse_greek_date(date_str):
    """Parse a Greek-language date string into a month-resolution datetime.

    The input is expected to look like ``"<day> <Greek month name> <year>, ..."``.
    Everything after the first comma is ignored and the day is discarded, so
    the result always falls on the first day of the parsed month.

    Args:
        date_str: Date text with the month written as a Greek genitive name
            (e.g. ``"Μαρτίου"``).

    Returns:
        A naive ``datetime`` for the first day of the month, or ``None`` when
        the input is not a string, has fewer than three whitespace-separated
        parts, names an unknown month, or has a year ``strptime`` rejects.
    """
    greek_months = {
        "Ιανουαρίου": "01", "Φεβρουαρίου": "02", "Μαρτίου": "03", "Απριλίου": "04",
        "Μαΐου": "05", "Ιουνίου": "06", "Ιουλίου": "07", "Αυγούστου": "08",
        "Σεπτεμβρίου": "09", "Οκτωβρίου": "10", "Νοεμβρίου": "11", "Δεκεμβρίου": "12",
    }
    try:
        # split() without a separator trims the ends and collapses runs of
        # whitespace, so doubled or non-breaking spaces do not break parsing.
        _day, month_name, rest = date_str.split(None, 2)
        year = rest.split(",")[0].strip()
        # Look the month up explicitly so an unknown name fails here rather
        # than as a malformed "YYYY-None" string inside strptime.
        month = greek_months[month_name]
        return datetime.strptime(f"{year}-{month}", "%Y-%m")
    except (AttributeError, TypeError, ValueError, KeyError):
        # Only parsing failures are treated as "no date"; anything else
        # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        return None

def assign_length_bin(count):
    """Label a word count as 'short', 'medium', 'long' or 'unknown'.

    The ranges are inclusive on both ends: 0-25 is 'short', 26-90 is 'medium'
    and 91 or more is 'long'. A value that falls in none of them (negative,
    strictly between two ranges such as 25.5, or NaN) is 'unknown'.
    """
    if 0 <= count <= 25:
        return 'short'
    if 26 <= count <= 90:
        return 'medium'
    if 91 <= count:
        return 'long'
    return 'unknown'


In [3]:
## Input locations
# The cleaned per-platform JSON files live in the "working_data" directory
# three levels above the current working directory. Paths are built with
# os.path.join instead of hard-coded "\\" separators so the notebook also
# runs on non-Windows systems.
working_data_dir = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(".")))),
    "working_data",
)
## Reddit
reddit_path = os.path.join(working_data_dir, "reddit_cleaned.json")
## YouTube
youtube_path = os.path.join(working_data_dir, "youtube_cleaned.json")
## OpenGov
opengov_path = os.path.join(working_data_dir, "ogov_cleaned.json")

## YouTube
with open(youtube_path, "r", encoding="utf-8") as f:
    youtube_clean = json.load(f)
## Reddit
with open(reddit_path, "r", encoding="utf-8") as f:
    reddit_clean = json.load(f)
## OpenGov
with open(opengov_path, "r", encoding="utf-8") as f:
    ogov_clean = json.load(f)

In [4]:
def _anonymize_comments(comments, platform_code, keys_to_remove,
                        context_ids=(), default_author='None'):
    """Give each comment a fresh ``unique_comment_id`` and drop sensitive keys.

    The comment dicts are modified in place. The new ID has the form
    ``<platform_code>-<author_id>[-<context id>...]-<seq>-<8 random hex chars>``,
    where ``seq`` is the 1-based position of the comment in ``comments``.

    Args:
        comments: Iterable of comment dicts to update in place.
        platform_code: Prefix identifying the platform (e.g. ``"R"``).
        keys_to_remove: Keys to delete from every comment; missing keys are
            ignored.
        context_ids: Extra ID components inserted after the author id, such
            as the parent thread or video id.
        default_author: Value used when a comment has no ``author_id``.
    """
    for seq, comment in enumerate(comments, start=1):
        # NOTE(review): author_id is embedded in the new ID and is not removed
        # from the record; presumably it is already a pseudonymous identifier
        # produced by an earlier cleaning step - confirm.
        author = comment.get('author_id', default_author)
        # Random suffix keeps IDs unique even if the other components repeat.
        rand_suffix = uuid.uuid4().hex[:8]
        id_parts = [platform_code, author, *context_ids, seq, rand_suffix]
        comment['unique_comment_id'] = "-".join(str(part) for part in id_parts)
        # Drop sensitive fields.
        for key in keys_to_remove:
            comment.pop(key, None)


keys_to_remove_reddit = [
    'author',
    'reddit_id'
]

# Reddit: comments are nested under threads; the thread id becomes part of the ID.
for thread in reddit_clean:
    _anonymize_comments(
        thread.get('comments', []),
        'R',
        keys_to_remove_reddit,
        context_ids=(thread.get('id', 'empty'),),
    )

keys_to_remove_youtube = [
    'author',
    'comment_id'
]

# YouTube: comments are nested under videos; the video id becomes part of the ID.
for video in youtube_clean:
    _anonymize_comments(
        video.get('comments', []),
        'Y',
        keys_to_remove_youtube,
        context_ids=(video.get('id', 'empty'),),
    )

keys_to_remove_ogov = [
    'author_name',
    'URL'
]

# OpenGov: a flat list of comments, so the ID carries no parent component and
# the sequence number runs over the whole list.
_anonymize_comments(ogov_clean, 'O', keys_to_remove_ogov, default_author='anon')

In [5]:
def _min_max_scale(values):
    """Rescale a numeric Series to [0, 1] using its own minimum and maximum.

    When every value is identical the range is zero; the result is then 0.0
    for each entry instead of the NaN that a 0/0 division would produce.
    """
    low = values.min()
    span = values.max() - low
    if span == 0:
        return (values - low).astype(float)
    return (values - low) / span


records = []

## Reddit & YouTube
# Both sources are lists of containers (threads / videos) that each hold a
# "comments" list with the same fields, so one loop builds the rows for both.
for platform, containers in (("reddit", reddit_clean), ("youtube", youtube_clean)):
    for container in containers:
        for c in container.get("comments", []):
            wc = data_cleaning.word_count(c["body"])
            # Keep month resolution only: move every timestamp to day 1.
            date = datetime.fromisoformat(c["published_at"]).replace(day=1)
            records.append({
                "platform": platform,
                "date": date,
                "text": c["body"],
                "like_count": c.get("like_count", 0),
                "word_count": wc,
                "like_scaled": c.get("like_scaled", 0),
                "comment_id": c["unique_comment_id"],
            })

## OpenGov
# OpenGov comments carry no likes: each one counts as a single like and gets
# an equal share of the total. The share is the same for every row, so the
# list length is read once outside the loop.
ogov_total = len(ogov_clean)
for c in ogov_clean:
    wc = data_cleaning.word_count(c["article_text"])
    date = parse_greek_date(c["date_published"].strip())
    records.append({
        "platform": "opengov",
        "date": date,
        "text": c["article_text"],
        "like_count": 1,
        "word_count": wc,
        "like_scaled": 1 / ogov_total,
        "comment_id": c["unique_comment_id"],
    })

df = pd.DataFrame(records)

# Unparseable dates (None from parse_greek_date) become NaT.
# NOTE(review): the day=1 flooring above happens before this UTC conversion,
# so a timestamp with a non-UTC offset could land in the previous month -
# confirm the inputs are already UTC.
df['date'] = pd.to_datetime(df['date'], errors='coerce', utc=True)
df['text_length_bin'] = df['word_count'].apply(assign_length_bin)
# pd.cut builds right-closed intervals by default: dates up to 2023-11-30 are
# 'pre', dates after that up to 2024-02-29 are 'during', later ones 'post'.
df['period'] = pd.cut(
    df['date'],
    bins=[
        datetime(2000, 1, 1, tzinfo=timezone.utc),
        datetime(2023, 11, 30, tzinfo=timezone.utc),
        datetime(2024, 2, 29, tzinfo=timezone.utc),
        datetime(2100, 1, 1, tzinfo=timezone.utc)
    ],
    labels=['pre', 'during', 'post']
)
df['date_mini'] = df['date'].dt.strftime('%Y-%m')

# Normalise like_scaled within each platform. OpenGov rows all share the same
# value, which previously produced NaN through a 0/0 division; they are now 0.0.
df['like_scaled_norm'] = (
    df.groupby('platform')['like_scaled'].transform(_min_max_scale)
)

In [6]:
# Output directory: "working_data", three levels above the current working
# directory. Built with os.path.join so the separator matches the platform.
output_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(".")))),
    "working_data",
)

# Anonymized copies of the three cleaned datasets, written as UTF-8 JSON with
# non-ASCII characters kept as-is (ensure_ascii=False).
anonymized_outputs = (
    ("youtube_cleaned_anonymized.json", youtube_clean),
    ("reddit_cleaned_anonymized.json", reddit_clean),
    ("ogov_cleaned_anonymized.json", ogov_clean),
)
for filename, payload in anonymized_outputs:
    with open(os.path.join(output_path, filename), "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

# Tabular dataset; os.path.join replaces the hard-coded "\\" separator, which
# only resolves to a file inside output_path on Windows.
df.to_csv(os.path.join(output_path, "transformed_dataset.csv"))