In [None]:
from tqdm import tqdm
import pandas as pd
import re
import hashlib
from address import islocation
import bson
from pymongo import MongoClient

# Module-level MongoDB handles shared by the rest of the notebook.
# Per the PyMongo documentation, serverSelectionTimeoutMS bounds how long
# (in milliseconds) server selection may take, and retryWrites enables
# retrying of supported write operations.
client = MongoClient(
    "mongodb://localhost:27017",
    serverSelectionTimeoutMS=5000,
    retryWrites=True,
)

# Database "singularity" and its "location_de" collection.
database = client["singularity"]
collection = database["location_de"]


def read_bson(file_path: str) -> list:
    """Read a BSON file and decode every document stored in it.

    The complete file content is read into memory and handed to
    ``bson.decode_all`` in a single call.

    Args:
        file_path (str): Path of the BSON file to read.

    Returns:
        list: The decoded documents, one entry per document in the file.

    Examples:
        >>> documents = read_bson("data/cx_process_raw.bson")  # doctest: +SKIP
    """
    with open(file_path, 'rb') as bson_file:
        return bson.decode_all(bson_file.read())


def write_documents(document_list, file_path):
    """Encode documents as BSON and store them back-to-back in one file.

    Args:
        document_list: Iterable of documents; each one is passed to
            ``bson.BSON.encode``.
        file_path: Destination path. The file is opened with mode ``'wb+'``,
            so any existing content is truncated.
    """
    with open(file_path, 'wb+') as out_file:
        # The generator keeps encoding lazy: one document is encoded and
        # written at a time, in input order.
        out_file.writelines(bson.BSON.encode(doc) for doc in document_list)

# German legal-form tokens that are stripped from a company name before it is
# hashed. The order is kept as-is because it is the regex alternation order.
_GERMAN_LEGAL_FORMS = (
    'gmbh', 'ggmbh', 'ag', 'mbh', 'gbg', 'eg', 'ug', 'kg', 'ohg', 'partg',
    'se', 'ev', 'ek', 'haftungsbeschränkt', 'gbr', 'co', 'kgaa', 'bs', 'cokg',
    'gmbhcokg', 'mbhco', 'g', 'gmbhco',
    'gesellschaft mit beschränkter haftung',
)

# Compiled once at import time: cid() is meant to be applied row by row, so
# rebuilding the list and the patterns on every call is wasted work.
_LEGAL_FORM_RE = re.compile(r'\b(?:' + '|'.join(_GERMAN_LEGAL_FORMS) + r')\b')
_DISALLOWED_CHARS_RE = re.compile(r'[^a-zA-Z\s0-9äöüÄÖÜß]')


def cid(row):
    """Derive a deterministic id from the company name in ``row['name']``.

    Normalization steps, in order:
      1. Split the name on whitespace and drop every token for which
         ``islocation`` returns a truthy value.
      2. Remove every character that is not an ASCII letter, a digit,
         whitespace, a German umlaut or ``ß``.
      3. Collapse whitespace runs to single spaces and lowercase the text.
      4. Remove whole-word occurrences of the German legal forms.
      5. Remove the remaining spaces.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a string stored
            under the key ``'name'``.

    Returns:
        str: Hex-encoded SHA-256 digest of the normalized name.

    Note:
        Names that normalize to the empty string (for example a name made up
        only of legal-form tokens) all produce the same digest.
    """
    kept_words = [word for word in row['name'].split() if not islocation(word)]
    cleaned = _DISALLOWED_CHARS_RE.sub('', ' '.join(kept_words))

    # Lowercase exactly once. The legal-form pattern is all lowercase, so no
    # case-insensitive flag and no second lower() are needed afterwards.
    collapsed = ' '.join(cleaned.split()).lower()

    without_legal_forms = _LEGAL_FORM_RE.sub('', collapsed)
    normalized_name = without_legal_forms.replace(' ', '')

    return hashlib.sha256(normalized_name.encode()).hexdigest()
   

# bson_data = read_bson(r"C:\Users\besff\OneDrive\Desktop\DE Company\data.bson")
# df = pd.DataFrame(bson_data)
# df = df.drop(columns=['_id'])
# tqdm.pandas(desc="HashID",colour='yellow')
# df["_id"] = df.progress_apply(cid, axis=1)
# write_documents(df.to_dict('records'),r"C:\Users\besff\OneDrive\Documents\GitHub\singularity\singularity\data")
    # chunk.to_json(r"C:\Users\besff\OneDrive\Documents\GitHub\singularity\singularity\data\data.json",orient='records',lines=True, mode='a', index=False)
# tqdm.pandas(desc="HashID",colour='yellow')
# df["_id"] = df.progress_apply(cid, axis=1)
# df.to_csv(r"C:\Users\besff\OneDrive\Documents\GitHub\singularity\singularity\c1", index=False)
# df.head()

In [2]:
# Two distinct dict objects holding identical key/value pairs.
dict1 = dict(key1="value1", key2="value2")
dict2 = dict(dict1)

# Dicts are unhashable; a frozenset of their items is hashable and compares
# by content.
frozen_dict1, frozen_dict2 = (frozenset(d.items()) for d in (dict1, dict2))

# Equal frozensets collapse into a single set member.
my_set = {frozen_dict1, frozen_dict2}

# Show what ended up in the set.
print(my_set)

{frozenset({('key1', 'value1'), ('key2', 'value2')})}
