# Post-processing

- check the number of posts per month
- check the language of each post and remove non-English posts
- check for duplicates and remove them
- random sample of 100 posts per month (each month needs at least 100 posts)


In [2]:
import os
import json
import random
from langdetect import detect
from bs4 import BeautifulSoup

## 1. Number of posts 

In [3]:
# Count how many posts each monthly JSON file contains.

folder = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/dataset/100_posts"

# Only the .json entries of the folder are inspected; anything else is skipped.
json_names = [name for name in os.listdir(folder) if name.endswith(".json")]

for fname in json_names:
    fpath = os.path.join(folder, fname)
    with open(fpath, mode="r", encoding="utf-8") as f:
        data = json.load(f)
    # The length of the decoded top-level JSON value is reported as the count.
    print(f"{fname}: {len(data)} elements")


100_post_mastodon_2024-02.json: 150 elements
100_post_mastodon_2024-03.json: 143 elements
100_post_mastodon_2024-04.json: 122 elements
100_post_mastodon_2024-05.json: 179 elements


## 2. Duplicates

In [4]:
# For every JSON file in `folder`, report how many posts repeat the "uri" of
# an earlier post in the same file. Nothing is written to disk unless the
# snippet at the bottom of the loop is uncommented.
for fname in os.listdir(folder):
    if fname.endswith(".json"):
        fpath = os.path.join(folder, fname)
        with open(fpath, "r", encoding="utf-8") as file:
            data = json.load(file)

        seen = set()        # "uri" values encountered so far in this file
        duplicates = []     # posts whose "uri" was already seen
        unique_posts = []   # first occurrence of each "uri", in file order

        for post in data:
            post_id = post.get("uri")
            already_seen = post_id in seen
            seen.add(post_id)
            # Route the post to the matching list; the first occurrence wins.
            (duplicates if already_seen else unique_posts).append(post)

        if not duplicates:
            print(f"{fname}: nessun duplicato")
        else:
            print(f"{fname}: trovati {len(duplicates)} duplicati")

        # Uncomment to overwrite files without duplicates
        # if len(unique_posts) < len(data):
        #     with open(fpath, "w", encoding="utf-8") as file:
        #         json.dump(unique_posts, file, ensure_ascii=False, indent=2)
        #     print(f"{fname}: sovrascritto con {len(unique_posts)} post unici")


100_post_mastodon_2024-02.json: nessun duplicato
100_post_mastodon_2024-03.json: nessun duplicato
100_post_mastodon_2024-04.json: nessun duplicato
100_post_mastodon_2024-05.json: nessun duplicato


## 3. Check English

In [8]:
# Cell-local import: the error langdetect raises when it cannot classify a
# text (for example an empty string).
from langdetect.lang_detect_exception import LangDetectException

# Keep only the English posts of every JSON file in `folder`.
# WARNING: each file is overwritten in place, so discarded posts are lost.
for fname in os.listdir(folder):
    # Skip anything that is not a JSON dataset file (sub-directories, notebook
    # checkpoints, ...); opening those would make open()/json.load() fail.
    if not fname.endswith(".json"):
        continue

    file_path = os.path.join(folder, fname)
    with open(file_path, encoding="utf-8") as f:
        posts = json.load(f)

    english_posts = []
    for post in posts:
        lang = post.get("language")
        if lang is not None:
            # Trust the language tag stored with the post when it is present.
            is_en = lang == "en"
        else:
            # No tag: strip the HTML markup and guess the language from the
            # plain text. `or ""` also covers a "content" key that is null.
            soup = BeautifulSoup(post.get("content") or "", "html.parser")
            try:
                is_en = detect(soup.get_text()) == "en"
            except LangDetectException:
                # Undetectable text is treated as non-English and dropped.
                is_en = False

        if is_en:
            english_posts.append(post)

    # overwrite with only English posts
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(english_posts, f, ensure_ascii=False, indent=2)

    print(f"{fname}: {len(english_posts)} English posts out of {len(posts)}")


100_post_mastodon_2024-02.json: 145 English posts out of 150
100_post_mastodon_2024-03.json: 133 English posts out of 143
100_post_mastodon_2024-04.json: 111 English posts out of 122
100_post_mastodon_2024-05.json: 169 English posts out of 179


## 4. Random sample

In [9]:
# Number of posts every monthly file is reduced to. Kept in one place so the
# threshold, the sample size and the printed messages cannot drift apart.
SAMPLE_SIZE = 100

# Replace the content of each JSON file in `folder` with a random sample of
# SAMPLE_SIZE posts. Files holding fewer posts are left untouched.
# NOTE: the random generator is not seeded here, so each run draws a
# different sample.
for fname in os.listdir(folder):
    if not fname.endswith(".json"):
        continue

    fpath = os.path.join(folder, fname)
    with open(fpath, encoding="utf-8") as f:
        posts = json.load(f)

    if len(posts) < SAMPLE_SIZE:
        # Not enough posts to draw a full sample: report and skip the file.
        print(f"{fname}: solo {len(posts)} post disponibili (meno di {SAMPLE_SIZE}, non modificato)")
        continue

    # Sampling without replacement: exactly SAMPLE_SIZE distinct list entries.
    sampled = random.sample(posts, SAMPLE_SIZE)
    with open(fpath, "w", encoding="utf-8") as f:
        json.dump(sampled, f, ensure_ascii=False, indent=2)
    print(f"{fname}: sovrascritto con {SAMPLE_SIZE} post")


100_post_mastodon_2024-02.json: sovrascritto con 100 post
100_post_mastodon_2024-03.json: sovrascritto con 100 post
100_post_mastodon_2024-04.json: sovrascritto con 100 post
100_post_mastodon_2024-05.json: sovrascritto con 100 post
