In [45]:
import re

# Trailing relative-time suffix found at the end of an author line, e.g.
# "Jane Doe 6 days", "Jane Doe about an hour", "Jane Doe a week".
# A single compiled pattern is shared by both helpers below so that detecting
# the suffix and removing it can never disagree.
_TIME_SUFFIX_RE = re.compile(
    r'\s+(?:about\s+an\s+hour'
    # `weeks?` is accepted after a number too ("2 weeks"), matching the
    # "a week" / "a weeks" forms handled by the last alternative.
    r'|\d+\s+(?:minutes?|hours?|days?|weeks?)'
    r'|a\s+(?:day|minute|hour|weeks?))$'
)


def clean_author_line(line):
    """Return the author name with the label and time suffix removed.

    Args:
        line: Raw author line, e.g. ``"Author: Jane Doe 6 days"``.

    Returns:
        The line with every ``'Author: '`` occurrence removed and a trailing
        relative-time expression (including the whitespace before it)
        stripped. Lines without a recognised suffix are returned otherwise
        unchanged.
    """
    line = line.replace('Author: ', '')
    return _TIME_SUFFIX_RE.sub('', line)


def get_time_from_author_line(line):
    """Extract the trailing relative-time expression from an author line.

    Args:
        line: Raw author line, e.g. ``"Jane Doe 6 days"``.

    Returns:
        The time expression without surrounding whitespace (``"6 days"``,
        ``"about an hour"``, ``"a week"``, ...), or ``None`` when the line
        does not end with a recognised expression.
    """
    match = _TIME_SUFFIX_RE.search(line)
    return match.group(0).strip() if match else None

In [46]:
import os
import json

# Load every comments_*.json file under `comments_dir`, split each comment's
# author line into a clean name plus a relative-time string, and collect:
#   posts    - one dict per file: {"url", "message", "comments"}
#   comments - flat list of every comment dict across all files
posts = []
comments = []
comments_dir = "comments"

# Each file is expected to have the following structure:
#   {
#       "url": "https://www.facebook.com/<page>/posts/<post id>",
#       "comments": [
#           {"name": "Ecaterina Gusoaie 6 days", "text": "Nu ."}
#       ],
#       "message": "Post message"
#   }
for root, dirs, files in os.walk(comments_dir):
    # os.walk yields entries in arbitrary filesystem order; sorting (dirs in
    # place, so the traversal itself is ordered) makes `posts` / `comments`
    # come out in the same order on every run.
    dirs.sort()
    for file in sorted(files):
        # Only comments_*.json files are of interest.
        if not (file.startswith("comments_") and file.endswith(".json")):
            continue

        # Keep the file open only while parsing it.
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Comment dicts are updated in place, so the same objects are shared
        # between `comments` and the post's own "comments" list.
        for comment in data["comments"]:
            author_line = comment["name"]
            posted_ago = get_time_from_author_line(author_line)
            if posted_ago:
                author_line = clean_author_line(author_line)
            comment["name"] = author_line
            comment["time"] = posted_ago  # None when no time suffix was found
            comments.append(comment)

        posts.append({
            "url": data["url"],
            "message": data["message"],
            "comments": data["comments"],
        })


            

In [47]:
from collections import defaultdict

# Group authors by exact (whitespace-stripped) comment text.
comment_to_authors = defaultdict(list)

for post in posts:
    for comment in post["comments"]:
        comment_text = comment["text"].strip()
        author = comment["name"].strip()
        comment_to_authors[comment_text].append(author)

# Generic greetings / one-word reactions that many people post independently;
# they are excluded from the report.
seasonal_greetings = ["Amin", "Adevărat", "Felicitări!", "Doamne ajuta amin", "Vot Iasmina", "Felicitări", "La mulți ani!","Doamne ajuta", "Follow", "Felicitari", "Amin!",
"Vot Patricia și Moldo", "Vot Patricia", "MEDIA3.GIPHY.COM", "Vot Moldo și Patricia", "Adevarat a inviat!", "La mulți ani", "La multi ani!", "La multi ani",
"Respect","Vot iasmina","MEDIA1.GIPHY.COM", "MEDIA2.GIPHY.COM","Doamne ajută!","Nu", "MEDIA4.GIPHY.COM","Adevarat a Inviat!",
]
# Compare whole comments against the list. A substring test
# (`greeting in comment_text`) would also discard every longer comment that
# merely contains "Nu", "Follow", "Respect", "Amin", ... anywhere in its text.
ignored_comments = set(seasonal_greetings)

# Keep only comment texts posted more than once that are not on the ignore list.
# NOTE(review): `authors` keeps one entry per posted comment, so the same
# author repeating a comment is counted each time - confirm this is intended.
result = [
    {
        "comment": text,
        "authors": authors,
        "authors_count": len(authors)
    }
    for text, authors in comment_to_authors.items()
    if len(authors) > 1 and text not in ignored_comments
]

# Order by authors count, descending (stable sort: ties keep grouping order).
result.sort(key=lambda x: x["authors_count"], reverse=True)

# Save the result to a JSON file; ensure_ascii=False keeps diacritics readable.
output_file = "comments_with_multiple_authors.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(result, f, ensure_ascii=False, indent=4)

