## Convert the Input JSON structure 

In [48]:
import json

def convert_to_parsed_structure(input_json):
    """Flatten a raw export mapping into the parsed-message structure.

    Args:
        input_json: Mapping read through ``.get``. Its ``"id"`` entry
            (default ``""``) and ``"messages"`` entry (default ``[]``) are
            used; every message is itself read through ``.get``.

    Returns:
        A dict with the export ``"id"`` and a ``"parsed_messages"`` list.
        Each parsed message carries ``"id"``, ``"text"``, ``"date"`` and an
        empty ``"labels"`` list.
    """

    def join_segments(segments):
        # A segment contributes either its own value (plain string) or its
        # "text" entry (dict); any other kind of segment is ignored.
        pieces = []
        for part in segments:
            if isinstance(part, dict) and "text" in part:
                pieces.append(part["text"])
            elif isinstance(part, str):
                pieces.append(part)
        return "".join(pieces)

    collected = []
    for raw_message in input_json.get("messages", []):
        collected.append({
            "id": raw_message.get("id", ""),
            # Messages without a "text" entry end up with an empty string.
            "text": join_segments(raw_message["text"]) if "text" in raw_message else "",
            "date": raw_message.get("date", ""),
            # Placeholder: nothing is read from the message for labels yet.
            "labels": [],
        })

    return {
        "id": input_json.get("id", ""),
        "parsed_messages": collected,
    }


## Read the JSON file from the raw folder

In [None]:
# if __name__ == "__main__":
# A forward slash is used as the separator: the previous "raw\Tikvah.json"
# literal relied on the invalid escape sequence "\T" (which newer Python
# versions warn about) and only resolved to raw/Tikvah.json on Windows.
with open("raw/Tikvah.json", "r", encoding="utf-8") as file:
    data = json.load(file)
    # The loaded document is passed on as-is; no wrapper key is unwrapped.
    input_json = data

# Flatten the export and pretty-print it, keeping non-ASCII characters readable.
parsed_structure = convert_to_parsed_structure(input_json)
json_output = json.dumps(parsed_structure, ensure_ascii=False, indent=2)
print("parsed:", json_output)

## Remove null values, new lines and extra spaces

In [51]:
# Drop messages whose text is empty, and collapse new lines ("\n") and extra spaces
def filter_empty_messages(parsed_structure):
    """Return a copy of the structure without blank messages.

    Messages whose text is empty or whitespace-only are dropped. For the
    remaining ones every run of whitespace (including new lines) in the text
    is collapsed to a single space and leading/trailing whitespace is removed.

    Args:
        parsed_structure: Mapping with an optional ``"id"`` and an optional
            ``"parsed_messages"`` list; each message must provide ``"id"``,
            ``"text"``, ``"date"`` and ``"labels"``.

    Returns:
        A new dict with the same ``"id"`` and the cleaned messages. The
        input structure and its message dicts are not modified.
    """
    structure_id = parsed_structure.get("id", "")

    kept = []
    for entry in parsed_structure.get("parsed_messages", []):
        raw_text = entry["text"]
        if not raw_text.strip():
            # Nothing but whitespace: skip the message entirely.
            continue
        kept.append({
            "id": entry["id"],
            # split() without arguments splits on any whitespace run.
            "text": ' '.join(raw_text.split()),
            "date": entry["date"],
            "labels": entry["labels"],
        })

    return {"id": structure_id, "parsed_messages": kept}

In [None]:
# Drop blank messages and normalise whitespace in the remaining ones.
filtered_structure = filter_empty_messages(parsed_structure)

# Pretty-print the result, keeping non-ASCII characters readable.
json_output = json.dumps(filtered_structure, indent=2, ensure_ascii=False)
print("parsed:", json_output)

## Extract and remove symbols from messages

In [53]:
def extract_and_remove_symbols_from_messages(filtered_structure):
    """Strip symbols from the text of every parsed message, in place.

    Args:
        filtered_structure: Mapping with an optional ``"parsed_messages"``
            list of message dicts. A message without a ``"text"`` entry is
            treated as having empty text.

    Returns:
        The same ``filtered_structure`` object, with each message's
        ``"text"`` replaced by the result of ``extract_and_remove_symbols``.
    """
    messages = filtered_structure.get("parsed_messages", [])
    for entry in messages:
        current_text = entry.get("text", "")
        entry["text"] = extract_and_remove_symbols(current_text)

    # The input is mutated and handed back rather than copied.
    return filtered_structure

In [55]:
import re  # needed by the patterns below; the cells above only import json

# Compiled once at definition time instead of on every call.
_HASHTAG_PATTERN = re.compile(r'#\w+')
# Any character outside the Basic Multilingual Plane; assumes emojis are in this range.
_EMOJI_PATTERN = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
_LINK_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_MENTION_PATTERN = re.compile(r'@(\w+)')
# Everything that is neither a word character nor whitespace.
_OTHER_SYMBOL_PATTERN = re.compile(r'[^\w\s]')

# Order matters: the catch-all symbol pattern runs last, because it would
# otherwise delete the '#', '@' and '://' that the earlier patterns anchor on.
_REMOVAL_PATTERNS = (
    _HASHTAG_PATTERN,
    _EMOJI_PATTERN,
    _LINK_PATTERN,
    _MENTION_PATTERN,
    _OTHER_SYMBOL_PATTERN,
)


def extract_and_remove_symbols(text):
    """Remove hashtags, emojis, links, mentions and other symbols from text.

    Each match is deleted outright (not replaced by a space), so the
    whitespace that surrounded a removed token is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The text with every match of the removal patterns deleted.
    """
    for pattern in _REMOVAL_PATTERNS:
        text = pattern.sub('', text)
    return text

In [None]:
# Filter again first: filter_empty_messages builds new message dicts, so the
# in-place symbol removal below works on that fresh structure.
filtered_structure = filter_empty_messages(filtered_structure)

# Strip symbols; the returned object is the (mutated) filtered_structure itself.
processed_structure = extract_and_remove_symbols_from_messages(filtered_structure)

json_output = json.dumps(processed_structure, indent=2, ensure_ascii=False)
print("Processed Structure:", json_output)

## Normalize homophonic Ge'ez characters

In [57]:
import json

# Homophone series to merge, as (source start, target start, length) code
# points. Within a series the vowel orders sit on consecutive code points, so
# deriving the pairs from offsets keeps every order aligned with its
# counterpart (a hand-written table is easy to shift by one column).
_GEEZ_HOMOPHONE_SERIES = (
    (0x1210, 0x1200, 7),  # ሐ ሑ ሒ ሓ ሔ ሕ ሖ    -> ሀ ሁ ሂ ሃ ሄ ህ ሆ
    (0x1280, 0x1200, 7),  # ኀ ኁ ኂ ኃ ኄ ኅ ኆ    -> ሀ ሁ ሂ ሃ ሄ ህ ሆ
    (0x1220, 0x1230, 8),  # ሠ ሡ ሢ ሣ ሤ ሥ ሦ ሧ -> ሰ ሱ ሲ ሳ ሴ ስ ሶ ሷ
    (0x12D0, 0x12A0, 7),  # ዐ ዑ ዒ ዓ ዔ ዕ ዖ    -> አ ኡ ኢ ኣ ኤ እ ኦ
    (0x1338, 0x1340, 7),  # ጸ ጹ ጺ ጻ ጼ ጽ ጾ    -> ፀ ፁ ፂ ፃ ፄ ፅ ፆ
)


def _build_geez_replacement_rules():
    """Expand the homophone series into a character -> character dict."""
    return {
        chr(source_start + offset): chr(target_start + offset)
        for source_start, target_start, length in _GEEZ_HOMOPHONE_SERIES
        for offset in range(length)
    }


def replace_values_in_text(processed_structure):
    """Normalize homophonic Ge'ez characters in every message text, in place.

    Each character of the ሐ/ኀ, ሠ, ዐ and ጸ series is replaced by the
    character of the same vowel order in the ሀ, ሰ, አ and ፀ series.

    Args:
        processed_structure: Mapping with an optional ``"parsed_messages"``
            list of message dicts. A message without a ``"text"`` entry is
            treated as having empty text.

    Returns:
        The same ``processed_structure`` object with updated message texts.
    """
    replacement_rules = _build_geez_replacement_rules()

    # Iterate over parsed messages in the processed structure
    for message in processed_structure.get("parsed_messages", []):
        # Replace values in the text using the defined rules
        message["text"] = replace_values(message.get("text", ""), replacement_rules)

    return processed_structure


def replace_values(text, replacement_rules):
    """Apply every ``old -> new`` substring replacement to text.

    Args:
        text: The string to transform.
        replacement_rules: Mapping of substring to replacement. Rules are
            applied one after another in the mapping's iteration order.

    Returns:
        The text after all replacements.
    """
    for old_value, new_value in replacement_rules.items():
        text = text.replace(old_value, new_value)
    return text

In [None]:
# Filter once more: symbol removal may have reduced some texts to nothing.
filtered_structure = filter_empty_messages(processed_structure)

# Strip symbols again on the freshly filtered structure.
processed_structure = extract_and_remove_symbols_from_messages(filtered_structure)

# Last step: apply the character replacements to every message text.
final_structure = replace_values_in_text(processed_structure)

json_output = json.dumps(final_structure, indent=2, ensure_ascii=False)
print("Final Structure:", json_output)