In [25]:
import glob
import json
import unicodedata
import re
import hashlib

# Collect every comment dump matching the pattern. glob returns paths in a
# filesystem-dependent order, so they are sorted to make the processing order
# (and therefore first-seen texts and tie-breaking in the rankings) reproducible.
file_paths = sorted(glob.glob("../comments/comments_*.json"))

def hash_comment(comment):
    """
    Build an identifier for a comment from its text.

    The value of ``comment['text']`` is reduced to a canonical form before
    hashing: it is NFKD-decomposed, every non-ASCII character is dropped,
    the result is lower-cased, and only the characters a-z and 0-9 are kept.
    Texts that differ only in case, spacing, punctuation or dropped characters
    therefore produce the same identifier (a text with no ASCII letters or
    digits hashes like the empty string).

    Returns the SHA-256 digest of the canonical form as an upper-case hex string.
    """
    # Decompose first so accented letters split into base letter + combining mark;
    # the ASCII round-trip then discards the marks and anything else non-ASCII.
    decomposed = unicodedata.normalize('NFKD', comment['text'])
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    # Keep lowercase letters and digits only.
    canonical = re.sub(r'[^a-z0-9]', '', ascii_only.lower())

    digest = hashlib.sha256(canonical.encode('utf-8'))
    return digest.hexdigest().upper()

def normalize_name(name):
    """
    Normalize the name by removing `Reply by` and `to ....`. Keep original unicode characters.

    Steps, in order:
      1. every occurrence of the literal ``Reply by `` is removed;
      2. every standalone word ``to`` followed by a space and one
         non-space token is removed;
      3. the last two whitespace-separated words are dropped
         (presumably a trailing age/timestamp such as "5 d" - TODO confirm
         against the scraped data).

    Returns the remaining words joined by single spaces. A name that has
    two words or fewer after steps 1-2 yields an empty string.
    """
    # Remove the reply marker, keeping the replier's own name.
    name = re.sub(r'Reply by ', '', name)
    # Remove "to <recipient>". The \b anchor stops the pattern from firing
    # inside a word that merely ends in "to": without it "Roberto Silva"
    # was truncated to "Rober".
    name = re.sub(r'\bto [^ ]+', '', name)

    # Remove the last two words from the name.
    name = ' '.join(name.split()[:-2])
    return name.strip()
    

# Aggregates filled in while scanning every comment file.
hash_map = {}       # comment hash -> {'text': first text seen, 'count': occurrences}
authors = {}        # normalized author name -> {'count': entries seen, replies included}
long_comments = {}  # comment hash -> {'text', 'author', 'length'} for texts over 100 characters

for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # A missing 'comments' key and an empty/falsy value are both reported and skipped.
    if not json_data.get('comments'):
        print(f"File {file_path} does not contain 'comments' key.")
        continue

    for comment in json_data['comments']:
        name = normalize_name(comment['name'])

        # Every entry, replies included, counts towards its author.
        authors.setdefault(name, {'count': 0})['count'] += 1

        # Replies are left out of the duplicate and length statistics.
        if comment['name'].startswith('Reply'):
            continue

        comment_hash = hash_comment(comment)
        text = comment['text']

        # Count duplicates; the text kept is the one from the first occurrence.
        entry = hash_map.get(comment_hash)
        if entry is None:
            hash_map[comment_hash] = {
                'text': text,
                'count': 1,
            }
        else:
            entry['count'] += 1

        # Track comments longer than 100 characters; a later comment with the
        # same hash overwrites the earlier record.
        text_length = len(text)
        if text_length > 100:
            long_comments[comment_hash] = {
                'text': text,
                'author': name,
                'length': text_length,
            }

# Keep the 25 most frequent comments, most frequent first.
ranked_hashes = list(hash_map.items())
ranked_hashes.sort(key=lambda item: item[1]['count'], reverse=True)
output_hash_map = dict(ranked_hashes[:25])

# Report the most repeated comments.
for comment_hash, data in output_hash_map.items():
    print(f"Hash: {comment_hash}, Count: {data['count']}, Text: {data['text']}")

# Report the 10 most active authors.
ranked_authors = list(authors.items())
ranked_authors.sort(key=lambda item: item[1]['count'], reverse=True)
for author, data in ranked_authors[:10]:
    print(f"Author: {author}, Count: {data['count']}")

# Report the 10 longest comments.
ranked_long = list(long_comments.items())
ranked_long.sort(key=lambda item: item[1]['length'], reverse=True)
for comment_hash, data in ranked_long[:10]:
    print(f"Long Comment: author: {data['author']}, Length: {data['length']}, Text: {data['text']}")


File ../comments\comments_00CA47DEDEB9C336087FBEE226B2AC0CD6C0880BEAF17AAB13B291B5B454B83A.json does not contain 'comments' key.
File ../comments\comments_1910204C2B92FC42D5E7CEF8ED3E9BB182DDF55D92CA2F0179154C3A75ED0F6F.json does not contain 'comments' key.
File ../comments\comments_2F0A174CF2FE1390345FD12064967B8EAE2F452138117A50065AB585DA61BB78.json does not contain 'comments' key.
File ../comments\comments_68F95E31756BDEACDECB07C3145BADA0209E8F35FDF82D2681FA22D58550E727.json does not contain 'comments' key.
File ../comments\comments_6A758F3043DFF8048497CEB2262DD84CB03EE17C96E6DA7AF2A684FAF27BA35F.json does not contain 'comments' key.
File ../comments\comments_706E0003DF27CFC4C5C251706C485918B01AE762AB3BA55B2585B9F3A1C1A213.json does not contain 'comments' key.
File ../comments\comments_8024FF88D29F82BAC34C896F54CD3F04A636D995A92500125A8589673A990F4D.json does not contain 'comments' key.
File ../comments\comments_82115101511843C159337DB1E169E92E9CE5C0F317075CF8EF338723D9FFB8C6.json d