# Post-processing

- check the number of posts per month
- check that posts are in English and remove non-English text
- check for duplicates and remove them
- random sample: at least 100 posts per month


In [None]:
import os
import json
import random
from langdetect import detect
from bs4 import BeautifulSoup
import numpy as np 
from collections import Counter
import matplotlib.pyplot as plt
import random
from datetime import datetime, timedelta
import pandas as pd

## 1. Number of posts 

In [None]:
# Directory whose *.json files are read by the cells below; each file is
# loaded with json.load and its top-level entries are treated as posts.
folder = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/dataset/100_posts/100_posts"


In [None]:

# Report how many top-level elements (posts) each JSON file in `folder` holds.
for fname in os.listdir(folder):
    # Anything that is not a JSON dump is ignored.
    if not fname.endswith(".json"):
        continue

    fpath = os.path.join(folder, fname)
    with open(fpath, "r", encoding="utf-8") as f:
        data = json.load(f)

    print(f"{fname}: {len(data)} elements")


## 2. Duplicates

In [None]:
# For every JSON file in `folder`, split its posts into first occurrences and
# repeats (two posts are the same when they share the same "uri" value) and
# report how many repeats were found.
for fname in os.listdir(folder):
    if fname.endswith(".json"):
        fpath = os.path.join(folder, fname)
        with open(fpath, "r", encoding="utf-8") as file:
            data = json.load(file)

        seen = set()
        unique_posts, duplicates = [], []

        for post in data:
            post_id = post.get("uri")
            # Route the post to the right list; re-adding a known id is a no-op.
            target = duplicates if post_id in seen else unique_posts
            target.append(post)
            seen.add(post_id)

        print(
            f"{fname}: trovati {len(duplicates)} duplicati"
            if duplicates
            else f"{fname}: nessun duplicato"
        )

        # Uncomment to overwrite files without duplicates
        # if len(unique_posts) < len(data):
        #     with open(fpath, "w", encoding="utf-8") as file:
        #         json.dump(unique_posts, file, ensure_ascii=False, indent=2)
        #     print(f"{fname}: sovrascritto con {len(unique_posts)} post unici")


## 3. Check English

In [None]:
# Keep only English posts in every JSON file of `folder`.
# WARNING: each file is overwritten in place with the filtered list.
for fname in os.listdir(folder):
    # Skip anything that is not a JSON dump (sub-directories, checkpoints,
    # stray files): json.load would fail on them and abort the whole pass.
    if not fname.endswith(".json"):
        continue

    file_path = os.path.join(folder, fname)
    with open(file_path, encoding="utf-8") as f:
        posts = json.load(f)

    english_posts = []
    for post in posts:
        lang = post.get("language")
        if lang is not None:
            # Trust the language tag stored with the post when there is one.
            is_en = (lang == "en")
        else:
            # No language tag: detect it from the visible text of the HTML body.
            # `or ""` also covers a "content" key that is present but null.
            soup = BeautifulSoup(post.get("content") or "", "html.parser")
            try:
                is_en = detect(soup.get_text()) == "en"
            except Exception:
                # Detection failed (e.g. no usable text): treat the post as
                # non-English. Unlike a bare `except:`, this still lets
                # KeyboardInterrupt / SystemExit stop the cell.
                is_en = False

        if is_en:
            english_posts.append(post)

    # overwrite with only English posts
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(english_posts, f, ensure_ascii=False, indent=2)

    print(f"{fname}: {len(english_posts)} English posts out of {len(posts)}")


# Hashtag extraction

In [None]:
# Collect the "name" of every tag attached to every post in the dataset
# (one entry per occurrence, so repeated hashtags appear repeatedly) and
# store the raw list as JSON.
hashtags = []

for filename in os.listdir(folder):
    # Only JSON dumps are posts files; skip sub-directories and other entries.
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(folder, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for element in data:
        # `or []` covers both a missing "tags" key and an explicit null.
        for tag in element.get("tags") or []:
            if "name" in tag:
                hashtags.append(tag["name"])

# Absolute output path. The original wrapped it in os.path.join(folder, ...),
# which returns the absolute component unchanged, so the join is dropped.
new_file = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/100_posts/hashtag_raw.json"

with open(new_file, "w", encoding="utf-8") as f:
    json.dump(hashtags, f, ensure_ascii=False, indent=2)

                            


# Counting hashtags extracted


In [None]:
# Input: raw list of hashtag occurrences. Output: {hashtag: frequency} mapping
# computed on the lowercase-normalized hashtags.
file = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/100_posts/hashtag_raw.json"
out  = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/100_posts/hashtag_norm.json"

# load the raw hashtag occurrences
with open(file, "r", encoding="utf-8") as f:
    hashtags = json.load(f)

# lowercase normalization
hashtags_nor = [tag.lower() for tag in hashtags]

# frequencies -- counted on the normalized list, so that case variants of the
# same hashtag are merged (the original counted the raw list by mistake)
count_ba = Counter(hashtags_nor)


with open(out, "w", encoding="utf-8") as f:
    json.dump(count_ba, f, ensure_ascii=False, indent=2)



print(f"Total hashtags: {len(hashtags_nor)}")
print(f"Unique hashtags: {len(count_ba)}")
# The label now matches the number of entries actually printed.
print("Top 20:", count_ba.most_common(20))
# print("\n")
# for a in count_ba.most_common(100):
#     print(a)

In [None]:
# Bar chart of the 20 most frequent hashtags.
# zip(*pairs) turns [(tag, count), ...] into two parallel tuples.
hashtags, freqs = zip(*count_ba.most_common(20))

plt.figure(figsize=(12, 6))
plt.bar(x=hashtags, height=freqs)

# Titles and axis labels
plt.title("Top 20 Hashtags Distribution")
plt.ylabel("Frequency")
plt.xlabel("Hashtags")

# Slanted tick labels so that long hashtags do not overlap
plt.xticks(rotation=45, ha="right")

plt.tight_layout()
plt.show()

In [None]:
# Rank-frequency plot: frequencies sorted in decreasing order against their
# 1-based rank, both axes logarithmic.
freq_ba = np.array(sorted(count_ba.values(), reverse=True))
ranks_ba = np.arange(1, len(freq_ba) + 1)

plt.figure(figsize=(8,6))
# loglog already puts both axes on a log scale, so the separate
# plt.xscale("log") / plt.yscale("log") calls were redundant and are removed.
plt.loglog(ranks_ba, freq_ba, linestyle="-", linewidth=2, color="blue")  # solid line

plt.title("Hashtag Distribution (log-log scale)")
plt.xlabel("Rank")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()


# 1-random day data flow download 

The following code allows us to:
- create a list of the 100 most used hashtags from the 1.5-year dataset sample
- select a random day in the time range
- download all the posts for that day containing at least one of the 100 hashtags
- compute the cumulative distribution of those hashtags

The idea here is to retrieve the most used hashtags on a random day (because the z-score approach showed a Zipf distribution and failed to explain the data).

### 1. Creating the list of the 100 most used hashtags

In [None]:
# Read the {hashtag: frequency} mapping, keep the 100 most frequent hashtags
# and store their names (most frequent first) as a JSON list.
in_file = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/100_posts/hashtag_norm.json"
out_file = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/100_posts/top100_hashtags.json"

with open(in_file, "r", encoding="utf-8") as f:
    hashtag_counts = json.load(f)

# (hashtag, count) pairs by decreasing count; ties keep the order of the file.
top100 = Counter(hashtag_counts).most_common(100)
top100_hashtags = [name for name, _count in top100]

with open(out_file, "w", encoding="utf-8") as f:
    json.dump(top100_hashtags, f, ensure_ascii=False, indent=2)

In [None]:
# Load the {hashtag: frequency} mapping and rank it; `top100` ends up holding
# the 100 (hashtag, count) pairs with the highest counts. Nothing is written
# to `out_file` in this cell.
in_file = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/100_posts/hashtag_norm.json"
out_file = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/100_posts/top100_hashtags.json"

with open(in_file, "r", encoding="utf-8") as f:
    hashtag_counts = json.load(f)

# Decreasing count; equal counts keep the order in which they appear in the file.
top100 = Counter(hashtag_counts).most_common(100)

### 2. Selection of random day in time range

In [None]:

# Fixed seed: the day drawn below is the same on every run.
random.seed(1)


def random_day_between(start_str, end_str):
    """Return a random day between two dates, both ends included.

    Args:
        start_str: first eligible day, formatted as YYYY-MM-DD.
        end_str: last eligible day, formatted as YYYY-MM-DD.

    Returns:
        The selected day as an ISO string (YYYY-MM-DD).

    Raises:
        ValueError: if a date cannot be parsed, or if end_str is earlier
            than start_str (the day range is then empty).
    """
    day_format = "%Y-%m-%d"
    first_day = datetime.strptime(start_str, day_format).date()
    last_day = datetime.strptime(end_str, day_format).date()

    # Number of days separating the two bounds; randint includes both ends,
    # so every day from first_day to last_day can be drawn.
    span = (last_day - first_day).days
    picked = first_day + timedelta(days=random.randint(0, span))

    return picked.isoformat()


START_STR = "2024-02-06"
END_STR = "2025-07-06"
call = random_day_between(START_STR, END_STR)
print(call)

### 3. Adjustment of download code to retrieve the data flow of the random day

To be run as a Python file:

- adjust the selected random day
- adjust the output directory and the best_100_hashtag file directory
- create a .py file
- run it

In [None]:
import json
import os
from mastodon import Mastodon
from datetime import datetime, timedelta, timezone
import time
import random
from dotenv import load_dotenv

### CONFIG
# Directory receiving the output files; '' means the current working
# directory. Adjust before running.
OUTPUT_DIR  = ''
load_dotenv()
INSTANCE= 'https://mastodon.social'
HASHTAG = 'climatechange'
# Days are scanned from START (included) to END (excluded).
START= datetime(2024, 12, 8, tzinfo=timezone.utc)
END= datetime(2025, 12, 8, tzinfo=timezone.utc)
ACCESS_TOKEN = os.getenv('MASTODON_TOKEN')
# Number of distinct posts to collect for each day.
POSTS_PER_DAY = 5
# Upper bound on API calls per day: without it, a day with fewer than
# POSTS_PER_DAY matching posts would keep the loop running forever.
MAX_ATTEMPTS_PER_DAY = 50

### DIRECTORY
# os.makedirs('') raises FileNotFoundError, so only create a real directory.
if OUTPUT_DIR:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
mastodon = Mastodon(access_token=ACCESS_TOKEN, api_base_url=INSTANCE)
# Ids of the posts already written during this run (avoids duplicates).
seen_ids = set()

### EXTRACTION
current_day = START
while current_day < END:
    posts_saved = 0
    attempts = 0
    # One output file per month, opened in append mode: every day of the
    # month adds its lines to the same file.
    month_file_path = os.path.join(OUTPUT_DIR, f"{current_day.year}-{current_day.month:02d}.json")

    with open(month_file_path, 'a', encoding='utf-8') as fout:
        while posts_saved < POSTS_PER_DAY and attempts < MAX_ATTEMPTS_PER_DAY:
            attempts += 1

            # Random instant inside the current day, used as upper bound of
            # the query.
            # NOTE(review): relies on Mastodon.py accepting a datetime for
            # max_id -- confirm against the installed version.
            rand_hour = random.randint(0, 23)
            rand_minute = random.randint(0, 59)
            rand_second = random.randint(0, 59)
            random_dt = current_day.replace(hour=rand_hour, minute=rand_minute, second=rand_second)

            posts = mastodon.timeline_hashtag(
                hashtag=HASHTAG,
                max_id=random_dt,
                limit=1
            )
            time.sleep(2)  # pause between consecutive API calls
            if posts:
                s = posts[0]
                # Keep the post only if it belongs to the current day and has
                # not been stored yet.
                if current_day <= s.created_at < (current_day + timedelta(days=1)) and s.id not in seen_ids:
                    # One status object per line (JSON lines). The original
                    # dumped the one-element list `posts`, which readers that
                    # expect a dict per line cannot use.
                    fout.write(json.dumps(s, ensure_ascii=False, default=str) + "\n")
                    seen_ids.add(s.id)
                    posts_saved += 1

    if posts_saved < POSTS_PER_DAY:
        print(f"{current_day.date()}: only {posts_saved} posts saved after {attempts} attempts")
    current_day += timedelta(days=1)

print("Saved")


### 4. Hashtag retrieval + plot/statistics

Adjust in/out file paths

In [None]:
import json
from collections import Counter

# Count the hashtags found in a JSON-lines file of posts and store the
# {hashtag: frequency} mapping as JSON.
in_file  = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/dataset/100_posts/1_day/2024-12-08.jsonl"  
out_file = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/1_day/hashtags.json"

hashtags = []

with open(in_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) are not valid JSON.
            continue

        data = json.loads(line)
        # A line may hold a single post object or a list of posts; with a
        # list, the original `"tags" in data` test was silently always False.
        records = data if isinstance(data, list) else [data]
        for record in records:
            if not isinstance(record, dict):
                continue
            # `or []` covers both a missing "tags" key and an explicit null.
            for tag in record.get("tags") or []:
                if "name" in tag:
                    hashtags.append(tag["name"])

counts = Counter(hashtags)

with open(out_file, "w", encoding="utf-8") as f:
    json.dump(dict(counts), f, ensure_ascii=False, indent=2)

print(f"Saved {len(counts)} unique hashtags, {sum(counts.values())} total hashtags")


In [None]:
# Cumulative share of the day's hashtag occurrences covered by the top-100
# hashtags, plotted in the order of the top-100 list.
counts_file = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/1_day/hashtags.json"
top100_file = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/100_posts/top100_hashtags.json"
out_plot    = "/home/damn/Documents/PROJECTS/THESIS/Social-graph-miner-multi-platform-data-analysis/mastodon/code/hashtag/100_posts/top100_cumulata.png"

with open(counts_file, "r", encoding="utf-8") as f:
    all_counts = json.load(f)

with open(top100_file, "r", encoding="utf-8") as f:
    top100 = json.load(f)

# One row per top-100 hashtag, in list order; hashtags absent from the day's
# counts get a count of 0.
data = [(tag, all_counts.get(tag, 0)) for tag in top100]
df = pd.DataFrame(data, columns=["Hashtag", "Count"])

# Running total expressed as a percentage of the overall total.
total_count = df["Count"].sum()
df["Cumulata %"] = df["Count"].cumsum().div(total_count).mul(100)

# Draw the curve
plt.figure(figsize=(12, 6))
plt.plot(df["Hashtag"], df["Cumulata %"], marker="o", color="blue", linewidth=2)
plt.title("Cumulative distribution Mastodon")
plt.xlabel("Hashtag")
plt.ylabel("Cumulata %")
plt.xticks(rotation=90)
plt.tight_layout()
plt.grid(True)

# Write the figure to disk before displaying it
plt.savefig(out_plot)
plt.show()
