In [1]:
pip install bs4

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import os
import json
from bs4 import BeautifulSoup
from collections import Counter

# Function to merge related annotations
def merge_related_labels(annotations, relations):
    merged_annotations = {}
    for annotation in annotations:
        if annotation['type'] == 'hypertextlabels':
            ann_id = annotation['id']
            text = annotation['value']['text']
            label = annotation['value']['hypertextlabels'][0]
            start_offset = annotation['value']['globalOffsets']['start']
            end_offset = annotation['value']['globalOffsets']['end']
            if ann_id not in merged_annotations:
                merged_annotations[ann_id] = {'text': text, 'label': label, 'start': start_offset, 'end': end_offset}
            else:
                merged_annotations[ann_id]['text'] += ' ' + text
                merged_annotations[ann_id]['end'] = end_offset

    for from_id, to_id in relations.items():
        if from_id in merged_annotations and to_id in merged_annotations:
            if merged_annotations[from_id]['label'] == merged_annotations[to_id]['label']:
                merged_annotations[from_id]['text'] += ' ' + merged_annotations[to_id]['text']
                merged_annotations[from_id]['end'] = merged_annotations[to_id]['end']
                del merged_annotations[to_id]

    return merged_annotations

# Function to extract role and sub-role pairs, normalized to lowercase
def extract_role_subrole_pairs(merged_annotations, relations):
    role_subrole_pairs = []
    
    for from_id, to_id in relations.items():
        if from_id in merged_annotations and to_id in merged_annotations:
            from_label = merged_annotations[from_id]['label']
            to_label = merged_annotations[to_id]['label']

            # Filter for "Organization Role" and "Organization Sub-Role" pairs
            if from_label == "Organization Role" and to_label == "Organization Sub-Role":
                role = merged_annotations[from_id]['text'].strip().lower()
                subrole = merged_annotations[to_id]['text'].strip().lower()
                role_subrole_pairs.append((role, subrole))

    return role_subrole_pairs

# Function to process each JSON file in the folder
def process_json_folder(folder_path):
    role_subrole_count = Counter()

    # Loop through all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)

            # Open and load the JSON file with UTF-8 encoding
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
            except UnicodeDecodeError as e:
                print(f"Error decoding {filename}: {e}")
                continue  # Skip this file and move to the next

            # Process each document in the JSON data
            for item in data:
                if 'data' in item and 'html' in item['data']:
                    html_content = item['data']['html']
                    soup = BeautifulSoup(html_content, 'lxml')
                    document_text = soup.get_text()

                    if 'annotations' in item:
                        for annotation_set in item['annotations']:
                            annotations = annotation_set['result']
                            relations = {rel['from_id']: rel['to_id'] for rel in annotations if rel.get('type') == 'relation'}
                            
                            # Merge related labels
                            merged_annotations = merge_related_labels(annotations, relations)

                            # Extract role-subrole pairs
                            role_subrole_pairs = extract_role_subrole_pairs(merged_annotations, relations)
                            
                            # Update count of role-subrole pairs
                            role_subrole_count.update(role_subrole_pairs)

    return role_subrole_count

# Provide the folder path here
folder_path = r'C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information'

# Process all JSON files in the folder
role_subrole_count = process_json_folder(folder_path)

merged_pairs = []

# Sort the role-subrole pairs by occurrence count in descending order
sorted_role_subrole_count = sorted(role_subrole_count.items(), key=lambda x: x[1], reverse=True)

# Print the number of occurrences of each role-subrole pair
print("Role-SubRole Pairs and their occurrences (case-insensitive, sorted by occurrences):")
for pair, count in sorted_role_subrole_count:
    print(f"Role: {pair[0].capitalize()} Sub-Role: {pair[1].capitalize()} Occurrences: {count}")
    merged_pairs.append(f"Role: {pair[0].capitalize()} Sub-Role: {pair[1].capitalize()} Occurrences: {count}")

# Calculate and print the total number of occurrences
total_occurrences = sum(role_subrole_count.values())
print(f"\nTotal occurrences of all role-subrole pairs: {total_occurrences}")


Role-SubRole Pairs and their occurrences (case-insensitive, sorted by occurrences):
Role: Bookrunners Sub-Role: Joint Occurrences: 34
Role: Agent Sub-Role: Administrative Occurrences: 32
Role: Arranger Sub-Role: Lead Occurrences: 23
Role: Arrangers Sub-Role: Joint lead Occurrences: 21
Role: Lender Sub-Role: Swingline Occurrences: 16
Role: Lead arrangers Sub-Role: Joint Occurrences: 14
Role: Lender Sub-Role: Swing line Occurrences: 12
Role: Guarantor Sub-Role: Domestic Occurrences: 12
Role: Arrangers Sub-Role: Lead joint Occurrences: 11
Role: Lender Sub-Role: U.s. swing Occurrences: 11
Role: Lender Sub-Role: Issuing Occurrences: 10
Role: Agent Sub-Role: Syndication Occurrences: 9
Role: Issuer Sub-Role: L/c Occurrences: 9
Role: Bookrunner Sub-Role: Sole Occurrences: 9
Role: Lender Sub-Role: Revolving Occurrences: 8
Role: Agent Sub-Role: Collateral Occurrences: 7
Role: Agency Sub-Role: Rating Occurrences: 7
Role: Arranger Sub-Role: Sole lead Occurrences: 7
Role: Arranger Sub-Role: Joint l

In [13]:
!pip install inflect

Defaulting to user installation because normal site-packages is not writeable
Collecting inflect
  Downloading inflect-7.4.0-py3-none-any.whl.metadata (21 kB)
Collecting more-itertools>=8.5.0 (from inflect)
  Downloading more_itertools-10.5.0-py3-none-any.whl.metadata (36 kB)
Collecting typeguard>=4.0.1 (from inflect)
  Downloading typeguard-4.3.0-py3-none-any.whl.metadata (3.7 kB)
Downloading inflect-7.4.0-py3-none-any.whl (34 kB)
Downloading more_itertools-10.5.0-py3-none-any.whl (60 kB)
   ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
   ---------------------------------------- 61.0/61.0 kB 3.2 MB/s eta 0:00:00
Downloading typeguard-4.3.0-py3-none-any.whl (35 kB)
Installing collected packages: typeguard, more-itertools, inflect
Successfully installed inflect-7.4.0 more-itertools-10.5.0 typeguard-4.3.0



[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
import os
import json
from bs4 import BeautifulSoup
from collections import Counter
import inflect

# Initialize the inflect engine for singular/plural handling
p = inflect.engine()

# Function to merge related annotations
def merge_related_labels(annotations, relations):
    merged_annotations = {}
    for annotation in annotations:
        if annotation['type'] == 'hypertextlabels':
            ann_id = annotation['id']
            text = annotation['value']['text']
            label = annotation['value']['hypertextlabels'][0]
            start_offset = annotation['value']['globalOffsets']['start']
            end_offset = annotation['value']['globalOffsets']['end']
            if ann_id not in merged_annotations:
                merged_annotations[ann_id] = {'text': text, 'label': label, 'start': start_offset, 'end': end_offset}
            else:
                merged_annotations[ann_id]['text'] += ' ' + text
                merged_annotations[ann_id]['end'] = end_offset

    for from_id, to_id in relations.items():
        if from_id in merged_annotations and to_id in merged_annotations:
            if merged_annotations[from_id]['label'] == merged_annotations[to_id]['label']:
                merged_annotations[from_id]['text'] += ' ' + merged_annotations[to_id]['text']
                merged_annotations[from_id]['end'] = merged_annotations[to_id]['end']
                del merged_annotations[to_id]

    return merged_annotations

# Function to extract role and sub-role pairs, normalized to lowercase and singular form
def extract_role_subrole_pairs(merged_annotations, relations):
    role_subrole_pairs = []
    
    for from_id, to_id in relations.items():
        if from_id in merged_annotations and to_id in merged_annotations:
            from_label = merged_annotations[from_id]['label']
            to_label = merged_annotations[to_id]['label']

            # Filter for "Organization Role" and "Organization Sub-Role" pairs
            if from_label == "Organization Role" and to_label == "Organization Sub-Role":
                role = merged_annotations[from_id]['text'].strip().lower()
                subrole = merged_annotations[to_id]['text'].strip().lower()

                # Convert the role to its singular form
                singular_role = p.singular_noun(role) if p.singular_noun(role) else role

                # Add the singular form of the role along with the subrole
                role_subrole_pairs.append((singular_role, subrole))

    return role_subrole_pairs

# Function to process each JSON file in the folder
def process_json_folder(folder_path):
    role_subrole_count = Counter()

    # Loop through all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)

            # Open and load the JSON file with UTF-8 encoding
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
            except UnicodeDecodeError as e:
                print(f"Error decoding {filename}: {e}")
                continue  # Skip this file and move to the next

            # Process each document in the JSON data
            for item in data:
                if 'data' in item and 'html' in item['data']:
                    html_content = item['data']['html']
                    soup = BeautifulSoup(html_content, 'lxml')
                    document_text = soup.get_text()

                    if 'annotations' in item:
                        for annotation_set in item['annotations']:
                            annotations = annotation_set['result']
                            relations = {rel['from_id']: rel['to_id'] for rel in annotations if rel.get('type') == 'relation'}
                            
                            # Merge related labels
                            merged_annotations = merge_related_labels(annotations, relations)

                            # Extract role-subrole pairs
                            role_subrole_pairs = extract_role_subrole_pairs(merged_annotations, relations)
                            
                            # Update count of role-subrole pairs
                            role_subrole_count.update(role_subrole_pairs)

    return role_subrole_count

# Provide the folder path here
folder_path = r'C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information'

# Process all JSON files in the folder
role_subrole_count = process_json_folder(folder_path)

merged_pairs = []

# Sort the role-subrole pairs by occurrence count in descending order
sorted_role_subrole_count = sorted(role_subrole_count.items(), key=lambda x: x[1], reverse=True)

# Print the number of occurrences of each role-subrole pair
print("Role-SubRole Pairs and their occurrences (case-insensitive, singular form, sorted by occurrences):")
for pair, count in sorted_role_subrole_count:
    print(f"Role: {pair[0].capitalize()} Sub-Role: {pair[1].capitalize()} Occurrences: {count}")
    merged_pairs.append(f"Role: {pair[0].capitalize()} Sub-Role: {pair[1].capitalize()} Occurrences: {count}")

# Calculate and print the total number of occurrences
total_occurrences = sum(role_subrole_count.values())
print(f"\nTotal occurrences of all role-subrole pairs: {total_occurrences}")


Role-SubRole Pairs and their occurrences (case-insensitive, singular form, sorted by occurrences):
Role: Bookrunner Sub-Role: Joint Occurrences: 40
Role: Agent Sub-Role: Administrative Occurrences: 32
Role: Arranger Sub-Role: Lead Occurrences: 29
Role: Arranger Sub-Role: Joint lead Occurrences: 28
Role: Lender Sub-Role: Swingline Occurrences: 16
Role: Lead arranger Sub-Role: Joint Occurrences: 16
Role: Arranger Sub-Role: Lead joint Occurrences: 14
Role: Lender Sub-Role: Swing line Occurrences: 12
Role: Guarantor Sub-Role: Domestic Occurrences: 12
Role: Agent Sub-Role: Syndication Occurrences: 11
Role: Lender Sub-Role: U.s. swing Occurrences: 11
Role: Lender Sub-Role: Issuing Occurrences: 10
Role: Issuer Sub-Role: L/c Occurrences: 9
Role: Bookrunner Sub-Role: Sole Occurrences: 9
Role: Lender Sub-Role: Revolving Occurrences: 8
Role: Agent Sub-Role: Collateral Occurrences: 7
Role: Agency Sub-Role: Rating Occurrences: 7
Role: Arranger Sub-Role: Sole lead Occurrences: 7
Role: Agent Sub-Role

In [23]:
import os
import json
from bs4 import BeautifulSoup
from collections import Counter
import inflect

# Initialize the inflect engine for singular/plural handling
p = inflect.engine()

# Function to merge related annotations
def merge_related_labels(annotations, relations):
    merged_annotations = {}
    for annotation in annotations:
        if annotation['type'] == 'hypertextlabels':
            ann_id = annotation['id']
            text = annotation['value']['text']
            label = annotation['value']['hypertextlabels'][0]
            start_offset = annotation['value']['globalOffsets']['start']
            end_offset = annotation['value']['globalOffsets']['end']
            if ann_id not in merged_annotations:
                merged_annotations[ann_id] = {'text': text, 'label': label, 'start': start_offset, 'end': end_offset}
            else:
                merged_annotations[ann_id]['text'] += ' ' + text
                merged_annotations[ann_id]['end'] = end_offset

    for from_id, to_id in relations.items():
        if from_id in merged_annotations and to_id in merged_annotations:
            if merged_annotations[from_id]['label'] == merged_annotations[to_id]['label']:
                merged_annotations[from_id]['text'] += ' ' + merged_annotations[to_id]['text']
                merged_annotations[from_id]['end'] = merged_annotations[to_id]['end']
                del merged_annotations[to_id]

    return merged_annotations

# Function to extract role and sub-role pairs, normalized to lowercase, singular form, and no whitespaces
def extract_role_subrole_pairs(merged_annotations, relations):
    role_subrole_pairs = []
    
    for from_id, to_id in relations.items():
        if from_id in merged_annotations and to_id in merged_annotations:
            from_label = merged_annotations[from_id]['label']
            to_label = merged_annotations[to_id]['label']

            # Filter for "Organization Role" and "Organization Sub-Role" pairs
            if from_label == "Organization Role" and to_label == "Organization Sub-Role":
                role = merged_annotations[from_id]['text'].strip().lower().replace(" ", "")
                subrole = merged_annotations[to_id]['text'].strip().lower().replace(" ", "")

                # Convert the role to its singular form
                singular_role = p.singular_noun(role) if p.singular_noun(role) else role

                # Add the singular form of the role along with the subrole
                role_subrole_pairs.append((singular_role, subrole))

    return role_subrole_pairs

# Function to process each JSON file in the folder
def process_json_folder(folder_path):
    role_subrole_count = Counter()

    # Loop through all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)

            # Open and load the JSON file with UTF-8 encoding
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
            except UnicodeDecodeError as e:
                print(f"Error decoding {filename}: {e}")
                continue  # Skip this file and move to the next

            # Process each document in the JSON data
            for item in data:
                if 'data' in item and 'html' in item['data']:
                    html_content = item['data']['html']
                    soup = BeautifulSoup(html_content, 'lxml')
                    document_text = soup.get_text()

                    if 'annotations' in item:
                        for annotation_set in item['annotations']:
                            annotations = annotation_set['result']
                            relations = {rel['from_id']: rel['to_id'] for rel in annotations if rel.get('type') == 'relation'}
                            
                            # Merge related labels
                            merged_annotations = merge_related_labels(annotations, relations)

                            # Extract role-subrole pairs
                            role_subrole_pairs = extract_role_subrole_pairs(merged_annotations, relations)
                            
                            # Update count of role-subrole pairs
                            role_subrole_count.update(role_subrole_pairs)

    return role_subrole_count

# Provide the folder path here
folder_path = r'C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information'

# Process all JSON files in the folder
role_subrole_count = process_json_folder(folder_path)

# Sort the role-subrole pairs by occurrence count in descending order
sorted_role_subrole_count = sorted(role_subrole_count.items(), key=lambda x: x[1], reverse=True)

# Print the role-subrole pairs in the desired format: "rolesub-role Occurrences"
print("RoleSubRole and their occurrences (case-insensitive, singular form, sorted by occurrences):")
for pair, count in sorted_role_subrole_count:
    print(f"{pair[1]}{pair[0]} {count}")

# Calculate and print the total number of occurrences
total_occurrences = sum(role_subrole_count.values())
print(f"\nTotal occurrences of all role-subrole pairs: {total_occurrences}")


RoleSubRole and their occurrences (case-insensitive, singular form, sorted by occurrences):
jointbookrunner 43
administrativeagent 32
leadarranger 29
swinglinelender 28
jointleadarranger 28
jointleadarranger 16
leadjointarranger 14
domesticguarantor 12
syndicationagent 11
u.s.swinglender 11
solebookrunner 10
issuinglender 10
l/cissuer 9
revolvinglender 8
collateralagent 7
ratingagency 7
soleleadarranger 7
co-syndicationagent 6
collateraltrustee 5
documentationagent 5
co-documentationagent 4
issuingbank 4
ratingsagency 4
swinglinebank 4
designatedborrower 3
additionalcreditparty 3
administrativeborrower 3
jointarranger 2
leadbookrunner 2
lcissuer 2
subsidiaryborrower 2
co-bookrunner 2
solebookrunner 2
termlender 2
soleleadarranger 2
swinglinelenderlender 2
co-bookmanagersinitiallender 2
subordinatedborrower 1
subordinatedlender 1
intermediateholding 1
intercreditoragent 1
existingmanager 1
approvedmanager 1
issuinginitialbank 1
malteseguarantor 1
dutchborrower 1
managingsenioragent 1
so

In [27]:
import os
import json
from bs4 import BeautifulSoup
from collections import Counter
import inflect

# Initialize the inflect engine for singular/plural handling
p = inflect.engine()

# List of valid role-subrole labels
valid_labels = [
    'jointbookrunner', 'administrativeagent', 'leadarranger', 'swinglinelender', 
    'jointleadarranger', 'leadjointarranger', 'domesticguarantor', 'syndicationagent', 
    'u.s.swinglender', 'solebookrunner', 'issuinglender'
]

# Function to merge related annotations
def merge_related_labels(annotations, relations):
    merged_annotations = {}
    for annotation in annotations:
        if annotation['type'] == 'hypertextlabels':
            ann_id = annotation['id']
            text = annotation['value']['text']
            label = annotation['value']['hypertextlabels'][0]
            start_offset = annotation['value']['globalOffsets']['start']
            end_offset = annotation['value']['globalOffsets']['end']
            if ann_id not in merged_annotations:
                merged_annotations[ann_id] = {'text': text, 'label': label, 'start': start_offset, 'end': end_offset}
            else:
                merged_annotations[ann_id]['text'] += ' ' + text
                merged_annotations[ann_id]['end'] = end_offset

    for from_id, to_id in relations.items():
        if from_id in merged_annotations and to_id in merged_annotations:
            if merged_annotations[from_id]['label'] == merged_annotations[to_id]['label']:
                merged_annotations[from_id]['text'] += ' ' + merged_annotations[to_id]['text']
                merged_annotations[from_id]['end'] = merged_annotations[to_id]['end']
                del merged_annotations[to_id]

    return merged_annotations

# Function to process and check links for valid labels, replacing the Organization Name label
def replace_orgname_with_valid_labels(merged_annotations, relations, valid_labels):
    replaced_labels = []

    for from_id, to_id in relations.items():
        if from_id in merged_annotations and to_id in merged_annotations:
            from_label = merged_annotations[from_id]['label']
            to_label = merged_annotations[to_id]['label']

            # Check for "Organization Name" label and its relation
            if from_label == "Organization Name":
                linked_text = merged_annotations[to_id]['text'].strip().lower().replace(" ", "")

                # If linked text matches any valid role-subrole label, replace the Organization Name label
                if linked_text in valid_labels:
                    merged_annotations[from_id]['label'] = linked_text
                    replaced_labels.append(linked_text)
                else:
                    # If no match, replace Organization Role and Organization Sub-Role with "other-role"
                    if from_label == "Organization Role" or from_label == "Organization Sub-Role":
                        merged_annotations[from_id]['label'] = "other-role"
                        replaced_labels.append("other-role")

    return replaced_labels

# Function to save the updated annotations to a new JSON file compatible with Label Studio
def save_updated_annotations(original_file_path, updated_data):
    # Create a new filename by appending "_modified" to the original filename
    base, ext = os.path.splitext(original_file_path)
    new_file_path = f"{base}_modified{ext}"

    with open(new_file_path, 'w', encoding='utf-8') as file:
        json.dump(updated_data, file, ensure_ascii=False, indent=4)

    return new_file_path

# Function to process each JSON file in the folder
def process_json_folder(folder_path):
    total_replaced_labels = []

    # Loop through all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)

            # Open and load the JSON file with UTF-8 encoding
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
            except UnicodeDecodeError as e:
                print(f"Error decoding {filename}: {e}")
                continue  # Skip this file and move to the next

            # Process each document in the JSON data
            for item in data:
                if 'data' in item and 'html' in item['data']:
                    html_content = item['data']['html']
                    soup = BeautifulSoup(html_content, 'lxml')
                    document_text = soup.get_text()

                    if 'annotations' in item:
                        for annotation_set in item['annotations']:
                            annotations = annotation_set['result']
                            relations = {rel['from_id']: rel['to_id'] for rel in annotations if rel.get('type') == 'relation'}
                            
                            # Merge related labels
                            merged_annotations = merge_related_labels(annotations, relations)

                            # Replace Organization Name with valid role-subrole labels or "other-role"
                            replaced_labels = replace_orgname_with_valid_labels(merged_annotations, relations, valid_labels)
                            total_replaced_labels.extend(replaced_labels)

                            # Update the original annotations with modified labels
                            for ann_id in merged_annotations:
                                for annotation_set in item['annotations']:
                                    for annotation in annotation_set['result']:
                                        if annotation['id'] == ann_id:
                                            annotation['value']['hypertextlabels'][0] = merged_annotations[ann_id]['label']
                                            break  # Exit loop after updating the label

            # Save updated annotations back to a new file
            new_file_path = save_updated_annotations(file_path, data)
            print(f"Saved modified annotations to: {new_file_path}")

    return total_replaced_labels

# Provide the folder path here
folder_path = r'C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information'

# Process all JSON files in the folder and capture the replaced labels
replaced_labels = process_json_folder(folder_path)

# Print the replaced labels and give a success message
if replaced_labels:
    print("Labels replaced successfully:")
    for label in replaced_labels:
        print(f"Replaced with: {label}")
    print("\nSuccess!")
else:
    print("No labels were replaced.")


Saved modified annotations to: C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information\arvind.kr.200202@gmail.com_modified.json
Saved modified annotations to: C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information\cs5221652@iitd.ac.in_modified.json
Saved modified annotations to: C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information\luthraaditya283@gmail.com_modified.json
Saved modified annotations to: C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information\nilesh147k@gmail.com_modified.json
Saved modified annotations to: C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information\piyush.ofcl@gmail.com_modified.json
Saved modified annotations to: C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information\vidhyakshaya.k-26@scds.saiuniversity.edu.in_modified.json
Labels replaced successfully:
Replaced with: administrativeagent
Replaced with: administrativeagent
Replaced with: administrativeagent
Replaced with: