In [None]:
import unicodedata
import re

def remove_accents(input_str):
    """Return ``input_str`` with its diacritical marks stripped.

    The text is decomposed with NFKD, which splits each accented character
    into a base character followed by combining marks; the combining marks
    are then discarded and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    return u"".join(ch for ch in decomposed if not unicodedata.combining(ch))

def tokenize_file(filename):
    """Read a UTF-8 text file and return its comma/newline separated tokens.

    The whole file is lower-cased and stripped of accents before it is split
    on commas and newlines. Each piece is trimmed of surrounding whitespace
    and empty pieces are dropped. Duplicates are kept and file order is
    preserved.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    normalized = remove_accents(raw_text.lower())

    tokens = []
    for piece in re.split(r'[,\n]', normalized):
        cleaned = piece.strip()
        if cleaned:
            tokens.append(cleaned)
    return tokens

# Read and tokenize both files
names = tokenize_file('name_segregated_tokens.txt')
places = tokenize_file('place_segregated_tokens.txt')

# Tokens that appear in both files are treated as names only.
common_tokens = set(names) & set(places)

# De-duplicate and sort. The shared tokens are already part of `names`, so
# they only have to be removed from `places`; appending them to `names` again
# would reintroduce duplicates and inflate the "unique names" count below.
names = sorted(set(names))
places = sorted(set(places) - common_tokens)

print(f"Number of unique names: {len(names)}")
print(f"Number of unique places: {len(places)}")

# Optional: Print the first few elements of each list to verify
print("\nFirst 10 names:")
print(names[:10])
print("\nFirst 10 places:")
print(places[:10])

In [None]:
import numpy as np
import pandas as pd
import re
from collections import Counter
import unicodedata
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple, Optional

# Load the dataset of units. The path is relative to the current working
# directory. Later code in this cell reads the 'nomeUnidade' column.
df = pd.read_csv('../unidades.csv')

# Preview the first rows to verify the data.
# NOTE(review): this is not the last expression of the cell, so a notebook
# will not render its result; wrap it in display()/print() if the preview is
# actually wanted.
df.head()

# Strip diacritics from a string (same behaviour as the helper in the first cell).
def remove_accents(input_str):
    """Return ``input_str`` without accents.

    NFKD normalization separates base characters from their combining marks;
    every combining mark is skipped and the rest is reassembled.
    """
    base_chars = []
    for char in unicodedata.normalize('NFKD', input_str):
        if unicodedata.combining(char):
            continue
        base_chars.append(char)
    return "".join(base_chars)

# Single-token synonyms: maps an abbreviation or variant (key) to the canonical
# token (value). Lookups are done by replace_synonyms_and_multi_tokens() with
# tokens that tokenize_name() has already lower-cased and stripped of accents,
# so every key must be lower-case and unaccented to be reachable.
#
# NOTE(review): 'v', 'c' and 'des' are also in tokenize_name()'s stopword list,
# so those tokens are dropped before this lookup happens. 'gab.' cannot occur
# either, because '.' is one of the split delimiters. They are kept here
# unchanged in case the stopword list or the splitting rules change.
synonyms = {
    'gab': 'gabinete',
    'gab.': 'gabinete',
    'presidencia': 'presidencia',
    'v': 'vara',
    'var': 'vara',
    'vio': 'violencia',
    'c': 'circunscricao',
    'juiza': 'juiz',
    'substituta': 'substituto',
    'dra': 'dr',
    'faz': 'fazenda',
    'fam': 'familia',
    'exma': 'exmo',
    'reg': 'registros',
    'pub': 'publico',
    'publ': 'publico',
    'publica': 'publico',
    'juv': 'juventude',
    'inf': 'infancia',
    'crim': 'criminal',
    # Was 'DEECRIM': an upper-case key can never match a lower-cased token.
    'deecrim': 'criminal',
    'adj': 'adjunto',
    'cons': 'consumo',
    'jef': 'federal',
    'jud': 'judiciario',
    'desembargadora': 'desembargador',
    'des': 'desembargador',
    'desa': 'desembargador',
    'desemb': 'desembargador',
    'j': 'juizado',
    'jui': 'juizado',
    'civ': 'civel',
    'civeis': 'civel',
    'civil': 'civel',
    'esp': 'especial',
    'especiais': 'especial',
    'educativa': 'educacional',
    'contadoria/tesouraria': 'contadoria',
    'c/mulher': 'mulher',
    'calculos': 'calculo',
    'calc': 'calculo',
    'mulh': 'mulher',
    'adm': 'administracao',
    'amb': 'ambiental',
    'acomp': 'acompanhamento',
    'aten': 'atencao',
    'atend': 'atendimento',
    'aux': 'auxiliar',
    'aval': 'avaliacao',
    'compet': 'competencia',
    'conf': 'conflito',
    'confl': 'conflito',
    'coord': 'coordenacao',
    'cump': 'cumprimento',
    'def': 'defensoria',
    'dep': 'departamento',
    'dist': 'distribuicao',
    'distr': 'distribuicao',
    'exec': 'execucao',
    'fisc': 'fiscal',
    'gest': 'gestao',
    'inform': 'informacao',
    'inq': 'inquerito',
    'jurid': 'juridico',
    'med': 'mediacao',
    'mun': 'municipal',
    'munic': 'municipal',
    'org': 'organizacao',
    'pres': 'presidencia',
    'proc': 'processo',
    'prog': 'programa',
    'proj': 'projeto',
    'prot': 'protocolo',
    'rec': 'recurso',
    'rel': 'relator',
    'sec': 'secretaria',
    'serv': 'servico',
    'sist': 'sistema',
    'tec': 'tecnico',
    'trib': 'tribunal',
    'unid': 'unidade',
    'fg': 'fig',
    'gm': 'gmf',
    '[microrregiao': 'microrregiao',

    # Add more synonyms as needed (keys lower-case, without accents)
}

# Acronyms that expand into several tokens: each key is replaced by the whole
# list on its right, in that order.
multi_token_replacements = {
    'vt': ['vara', 'trabalho'],
    'cejusc': ['centro', 'judicial', 'solucao', 'conflitos', 'cidadania'],
    'tr': ['turma', 'recursal'],
    'jit': ['juizado', 'especial', 'civel'],
    'gades': ['gabinete', 'desembargador'],
    'jesp': ['juizado', 'especial', 'criminal'],
    'jec': ['juizado', 'especial', 'civel'],
    'saf': ['servico', 'anexo', 'fazendas'],
    # Further acronym expansions go here
}

# Expand one token through the multi-token table first, then the synonym table.
def replace_synonyms_and_multi_tokens(token):
    """Return the list of tokens that ``token`` stands for.

    A key of ``multi_token_replacements`` yields that table's list (the stored
    list object itself, not a copy). Any other token yields a one-element
    list holding its entry in ``synonyms``, or the token unchanged when it
    has no synonym.
    """
    expansion = multi_token_replacements.get(token)
    if expansion is not None:
        return expansion
    return [synonyms.get(token, token)]


# Tokens that are always discarded: Portuguese connectives, single letters,
# stray punctuation, a few noise words and the lower-case Roman numerals
# i..c (1-100). Kept as a frozenset so membership tests are O(1).
_BASE_STOPWORDS = frozenset([
    'de', 'da', 'do', 'das', 'dos',
    'e', 'a', 'o', 'i', 'u', 'b', 'as', 'ao',
    '"', 'em', 'des', 'com', 'n', 'g', 'ap', 'sr', 'sra', '/',
    'hora', 'solteira', 'villa', 'zz', '°', '¿',
    'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x',
    'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx',
    'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx',
    'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl',
    'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l',
    'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx',
    'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx',
    'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx',
    'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc',
    'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix', 'c',
])

# Leading words that are glued to the word that follows them.
_COMBINING_PREFIXES = [
    'sao', 'santa', 'santo', 'nova', 'novo', 'bom', 'boa',
    'alto', 'alta', 'baixo', 'baixa', 'porto', 'campos',
    'rio', 'foz', 'barra', 'passa', 'entre'
]
_COMBINING_PREFIX_RE = re.compile(
    r'\b(' + '|'.join(_COMBINING_PREFIXES) + r')[\s-]+(\w+)'
)

# Delimiters between tokens: whitespace, comma, period, parentheses, hyphen.
_TOKEN_SPLIT_RE = re.compile(r'\s|,|\.|\(|\)|-')


def _combine_prefixed_words(name, stopwords):
    """Join each combining prefix with its following word using '_' unless that word is a stopword."""
    def replace_word(match):
        word1 = match.group(1)
        word2 = match.group(2)
        if word2.lower() not in stopwords:
            return f'{word1}_{word2}'
        # Not combined: the separator is still normalized to a single space.
        return f'{word1} {word2}'

    return _COMBINING_PREFIX_RE.sub(replace_word, name)


# Tokenization function
def tokenize_name(name, additional_stopwords=None):
    """Split a unit name into normalized, synonym-expanded tokens.

    Steps, in order:
      1. strip accents and lower-case the name;
      2. rewrite 'cj' / 'c j' as 'circunscricao';
      3. drop numbers and number-letter combinations;
      4. glue '<word> grau', 'vice <word>' and '<prefix> <word>' pairs into a
         single token (joined with '_' until the end);
      5. split on whitespace, comma, period, parentheses and hyphen;
      6. discard stopwords, which include every entry of the module-level
         ``names`` list;
      7. expand each token through replace_synonyms_and_multi_tokens();
      8. turn the '_' joiners back into spaces.

    Args:
        name: The raw unit name. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV cell) yields ``[]``.
        additional_stopwords: Optional iterable of extra tokens to discard.

    Returns:
        The list of processed tokens, in order of appearance.
    """
    # Missing values cannot be normalized; treat them as a name with no tokens.
    if not isinstance(name, str):
        return []

    name = remove_accents(name).lower()

    # Replace 'cj' or 'c j' with 'circunscricao'
    name = re.sub(r'\bc\s*j\b', 'circunscricao', name)

    # Remove numbers and number-letter combinations, but keep the preceding word
    name = re.sub(r'\b(\d+\w*)\b', '', name)

    # Combine 'grau' with the preceding word
    name = re.sub(r'(\b\w+\b)\s+grau', r'\1_grau', name)

    # Combine 'vice' with the following word (with space or hyphen) into a single token
    name = re.sub(r'\b(vice)[-\s]+(\w+)', r'\1_\2', name)

    stopwords = set(_BASE_STOPWORDS)
    if additional_stopwords:
        stopwords.update(additional_stopwords)

    # The prefix-combining step deliberately runs before the personal names
    # are added, so a prefix is still glued to a following word that happens
    # to be a name.
    name = _combine_prefixed_words(name, stopwords)

    # Personal names are noise for classification: drop them as well.
    stopwords.update(known_name.lower() for known_name in names)

    tokens = [
        token.strip()
        for token in _TOKEN_SPLIT_RE.split(name)
        if token.strip() and token not in stopwords
    ]

    # Apply synonyms and multi-token replacements. No special handling of a
    # 'c' followed by 'j' is needed here: 'c' is a stopword and the pair was
    # already rewritten by the first substitution above.
    processed_tokens = []
    for token in tokens:
        processed_tokens.extend(replace_synonyms_and_multi_tokens(token))

    # Replace underscores with spaces in preserved conjoined expressions
    return [token.replace('_', ' ') for token in processed_tokens]

# Tokenize every unit name and keep an untouched copy of the source column.
df['tokens'] = df['nomeUnidade'].apply(tokenize_name)
df['original_name'] = df['nomeUnidade']

# Pool the tokens of all rows into one flat list.
all_tokens = []
for row_tokens in df['tokens']:
    all_tokens.extend(row_tokens)

# Count occurrences; most_common() orders them from most to least frequent.
token_counts = Counter(all_tokens)
common_tokens = token_counts.most_common()

# Tabulate the (token, count) pairs for inspection.
token_df = pd.DataFrame(common_tokens, columns=['token', 'count'])

# A Counter has one key per distinct token.
print(f"Number of disparate tokens: {len(token_counts)}")

# Show the most frequent tokens.
token_df.head(212)

In [None]:
import numpy as np
import pandas as pd
from typing import List, Tuple
from collections import Counter

# Assume 'places' is defined earlier in the code
# places = ['sao paulo', 'rio de janeiro', 'belo horizonte', ...]  # Example, replace with actual places

def get_frequent_tokens(df: pd.DataFrame, min_frequency: int = 9) -> set:
    """Return the tokens that occur at least ``min_frequency`` times.

    Occurrences are counted over every token list in ``df['tokens']``; a
    token repeated inside one row counts once per repetition.
    """
    tally = Counter()
    for row_tokens in df['tokens']:
        for tok in row_tokens:
            tally[tok] += 1

    frequent = set()
    for tok, seen in tally.items():
        if seen >= min_frequency:
            frequent.add(tok)
    return frequent

def assign_positions(tokens: List[str], places: List[str]) -> List[str]:
    """Suffix each non-place token with a Greek-letter position label.

    Tokens found in ``places`` are moved to the end of the result, unlabeled
    and in their original relative order. The remaining tokens become
    ``"<token>_<letter>"`` where the letter is taken by position (alpha,
    beta, ...). Only 24 labels exist, so non-place tokens beyond the 24th
    are left out of the result.
    """
    greek = ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa',
             'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma', 'tau', 'upsilon',
             'phi', 'chi', 'psi', 'omega']

    # Single pass: route every token to the place bucket or the regular bucket.
    located, regular = [], []
    for tok in tokens:
        if tok in places:
            located.append(tok)
        else:
            regular.append(tok)

    # zip() stops at the shorter input, which caps the labels at len(greek).
    labelled = [f"{tok}_{slot}" for tok, slot in zip(regular, greek)]

    return labelled + located

def initialize_entries(df: pd.DataFrame, places: List[str]) -> List[np.ndarray]:
    """Convert every DataFrame row into a fixed-width entry.

    For each row of ``df`` (which must have a ``'tokens'`` column of token
    lists):
      * tokens found in ``places`` are collapsed into a single ``'[cidade]'``
        marker placed at the end, without a position suffix;
      * of the remaining tokens only the frequent ones (as decided by
        ``get_frequent_tokens`` with its default threshold) are kept, and
        each receives a position suffix from ``assign_positions``;
      * the list is padded with ``'null'`` up to a common width;
      * a row left with nothing but ``'null'`` gets ``'alien_alpha'`` as its
        first token.

    Args:
        df: DataFrame with a ``'tokens'`` column.
        places: Place tokens; membership is tested by equality.

    Returns:
        One 2-element object array per row: ``[row index, token array]``.
        All token arrays have the same length. An empty DataFrame yields
        ``[]``.
    """
    frequent_tokens = get_frequent_tokens(df)
    place_lookup = set(places)  # O(1) membership instead of a list scan per token

    # First pass: build the positioned token list of every row.
    built = []
    for index, row in df.iterrows():
        kept = []
        has_place = False
        for token in row['tokens']:
            if token in place_lookup:
                has_place = True
            elif token in frequent_tokens:
                kept.append(token)

        # Places were handled above, so none are passed on.
        tokens_with_position = assign_positions(kept, [])
        if has_place:
            tokens_with_position.append('[cidade]')
        built.append((index, tokens_with_position))

    # Common width. The per-row count of frequent tokens is kept as a lower
    # bound so that widths do not shrink for data that was already padded
    # consistently, but it is not sufficient on its own: a row whose place
    # token is infrequent gains '[cidade]' without being counted, which used
    # to produce entries longer than the rest. The width is also at least 1
    # so the 'alien_alpha' marker always has a slot.
    frequent_per_row = max(
        (sum(token in frequent_tokens for token in row_tokens)
         for row_tokens in df['tokens']),
        default=0,
    )
    longest_built = max((len(tokens) for _, tokens in built), default=0)
    max_tokens = max(frequent_per_row, longest_built, 1)

    # Second pass: pad to the common width and wrap in the entry format.
    entries = []
    for index, tokens_with_position in built:
        normalized_tokens = tokens_with_position + ['null'] * (max_tokens - len(tokens_with_position))

        # A row with no usable token at all is flagged as 'alien'.
        if all(token == 'null' for token in normalized_tokens):
            normalized_tokens[0] = 'alien_alpha'

        entry = np.array([index, np.array(normalized_tokens, dtype=object)], dtype=object)
        entries.append(entry)

    return entries

class ClassificationStructure:
    """Two-way index between entries and their classification labels.

    Entries are identified by their insertion position. Removing an entry
    leaves a ``None`` placeholder in ``entries`` so the positions of the
    other entries never change.
    """

    def __init__(self):
        self.classifications = {}  # classification -> set of entry indices carrying it
        self.entries = []  # all entries, by insertion position (None once removed)
        self.entry_classifications = {}  # entry index -> set of its classifications
        self.weights = {}  # classification -> number of entries carrying it

    def add_entry(self, entry: np.ndarray, classifications: List[str]):
        """Store ``entry`` and index it under each of ``classifications``.

        Empty labels and the padding label ``'null'`` are ignored. A label
        repeated for the same entry is counted once, so a weight always
        equals the number of entries carrying the label and stays in step
        with what ``remove_entry`` subtracts.
        """
        entry_index = len(self.entries)
        self.entries.append(entry)
        own_labels = self.entry_classifications[entry_index] = set()

        for classification in classifications:
            if not classification or classification == 'null':
                continue
            if classification in own_labels:
                continue  # duplicate label on this entry: already counted

            self.classifications.setdefault(classification, set()).add(entry_index)
            own_labels.add(classification)
            self.weights[classification] = self.weights.get(classification, 0) + 1

    def remove_entry(self, entry_index: int):
        """Remove the entry at ``entry_index``; unknown indices are ignored.

        A classification that loses its last entry is dropped together with
        its weight.
        """
        if entry_index not in self.entry_classifications:
            return

        for classification in self.entry_classifications[entry_index]:
            members = self.classifications[classification]
            members.remove(entry_index)
            self.weights[classification] -= 1
            if not members:
                del self.classifications[classification]
                del self.weights[classification]

        del self.entry_classifications[entry_index]
        self.entries[entry_index] = None  # keep the other indices stable

    def get_entries_with_classifications(self, classifications: List[str]) -> List[np.ndarray]:
        """Return the entries that carry ALL of ``classifications``.

        A label that no entry carries makes the result empty. Classifications
        without entries are deleted from the index, so skipping unknown
        labels would make the answer depend on removal history. Results are
        ordered by entry index.
        """
        if not classifications:
            return []

        member_sets = []
        for classification in classifications:
            members = self.classifications.get(classification)
            if not members:
                return []
            member_sets.append(members)

        entry_indices = set.intersection(*member_sets)
        return [self.entries[i] for i in sorted(entry_indices) if self.entries[i] is not None]

    def get_classifications_for_entry(self, entry_index: int) -> set:
        """Return the classifications of an entry (empty set if unknown or removed)."""
        return self.entry_classifications.get(entry_index, set())

    def get_weight(self, classification: str) -> int:
        """Return how many entries carry ``classification`` (0 if none)."""
        return self.weights.get(classification, 0)

    def get_overlaps(self) -> dict:
        """Group the entries that carry more than one classification.

        Returns:
            A dict mapping each distinct combination of labels (as a
            ``frozenset``) to the set of entry indices that have exactly
            that combination.
        """
        overlaps = {}
        for entry_index, classifications in self.entry_classifications.items():
            if len(classifications) > 1:
                overlaps.setdefault(frozenset(classifications), set()).add(entry_index)
        return overlaps



In [None]:
# Example usage (assuming df is your original DataFrame)
cs = ClassificationStructure()

# Create Entry representations
entries = initialize_entries(df, places)

print(f"Total number of entries: {len(entries)}")
print("Sample entry:", entries[0])

# Populate ClassificationStructure: each entry is [row index, token array] and
# its tokens double as its classifications.
for entry in entries:
    index, tokens = entry
    cs.add_entry(entry, tokens)

# Example queries
print("\nEntries with 'vara_alpha' (if exists):", cs.get_entries_with_classifications(['vara_alpha']))
print("\nClassifications for entry 0:", cs.get_classifications_for_entry(0))
print("\nWeight of 'juizado_beta' (if exists):", cs.get_weight('juizado_beta'))
# Place names are never stored as such: initialize_entries() collapses them
# into the '[cidade]' marker (and every other token carries a position
# suffix), so a literal place name such as 'sao paulo' could never match.
print("\nEntries with '[cidade]' (if exists):", cs.get_entries_with_classifications(['[cidade]']))

In [None]:
import numpy as np
from collections import defaultdict
from typing import List, Tuple

def hierarchical_sort(entries: List[np.ndarray]) -> List[np.ndarray]:
    """Order entries column by column, largest group first.

    Each entry is indexable as ``entry[1][k]`` for the token in column ``k``.
    Entries are grouped by their first token; groups are ordered by
    decreasing size and, for equal sizes, alphabetically by token, with
    ``'null'`` ranked as if it were spelled ``'zzz'``. Every group is then
    ordered the same way on the next column, and so on until the columns of
    the group's first entry run out.
    """
    def group_rank(item):
        # Sort key of one (token, members) pair: bigger groups first, then by label.
        token, members = item
        label = 'zzz' if token == 'null' else token
        return (-len(members), label)

    def sort_level(level_entries, token_index):
        if not level_entries or token_index >= len(level_entries[0][1]):
            return level_entries

        # Bucket the entries by the token found in the current column;
        # a plain dict keeps first-seen order, which decides ties.
        buckets = {}
        for entry in level_entries:
            buckets.setdefault(entry[1][token_index], []).append(entry)

        # Emit the buckets in rank order, each one sorted on the next column.
        ordered = []
        for _, members in sorted(buckets.items(), key=group_rank):
            ordered += sort_level(members, token_index + 1)
        return ordered

    return sort_level(entries, 0)

# Sort the entries built in the earlier cell.
sorted_entries = hierarchical_sort(entries)

# Print every sorted entry (row index and token array) to verify the order.
for entry in sorted_entries:
    row_index = entry[0]
    row_tokens = entry[1]
    print(f"Index: {row_index}, Tokens: {row_tokens}")

In [None]:
from typing import List, Dict, Any
from collections import defaultdict

class TaxonomicNode:
    """One node of the taxonomic tree.

    A node has a label (``value``), its child nodes keyed by their labels
    (``children``) and a list of entry indices attached to it (``entries``).
    Both containers start empty and are unique to each instance.
    """

    def __init__(self, value: str):
        self.entries: List[int] = []
        self.children: Dict[str, "TaxonomicNode"] = {}
        self.value = value

def build_taxonomic_tree(sorted_entries: List[np.ndarray]) -> TaxonomicNode:
    """Build a prefix tree from the token sequences of ``sorted_entries``.

    Each entry is an ``(entry_index, tokens)`` pair. Its tokens are followed
    from the root, creating nodes as needed, until the first ``'null'``
    token; the entry index is recorded on the node reached. The finished
    tree is passed to ``normalize_tree_depth`` with the token count of the
    first entry as target depth (all entries are assumed to have the same
    length).

    Args:
        sorted_entries: Entries in the order in which children should be
            created.

    Returns:
        The root node, labeled ``"root"``. For an empty input this is a
        bare root with no children.
    """
    root = TaxonomicNode("root")
    if len(sorted_entries) == 0:
        # Nothing to insert and no first entry to take the depth from.
        return root

    max_depth = len(sorted_entries[0][1])  # Assuming all entries have the same length

    for entry in sorted_entries:
        entry_index, tokens = entry
        current_node = root

        for token in tokens:
            if token == 'null':
                break  # 'null' is trailing padding: the path ends here
            if token not in current_node.children:
                current_node.children[token] = TaxonomicNode(token)
            current_node = current_node.children[token]

        current_node.entries.append(entry_index)

    normalize_tree_depth(root, max_depth)
    return root

def normalize_tree_depth(node: TaxonomicNode, target_depth: int, current_depth: int = 0):
    """Extend short branches with placeholder nodes, in place.

    The tree is walked from ``node``; nodes at depth ``target_depth - 1``
    are left alone and nothing below them is visited. A childless node found
    above that depth gets a single chain of placeholder descendants reaching
    depth ``target_depth - 1``. Each placeholder is labeled with its
    parent's label plus ``'_a'``, ``'_b'``, ... by position in the chain.
    Entry indices stay on the node they were attached to.

    NOTE(review): padding stops at depth ``target_depth - 1``, whereas
    build_taxonomic_tree() places an entry without any 'null' token at depth
    ``target_depth`` (the root being depth 0) -- confirm that the one-level
    difference is intended.
    """
    last_level = target_depth - 1
    if current_depth == last_level:
        return

    if node.children:
        for child in node.children.values():
            normalize_tree_depth(child, target_depth, current_depth + 1)
        return

    # Leaf above the last level: hang the placeholder chain below it.
    tail = node
    for offset in range(last_level - current_depth):
        label = f"{tail.value}_{'abcdefghijklmnopqrstuvwxyz'[offset]}"
        tail.children[label] = TaxonomicNode(label)
        tail = tail.children[label]

def print_tree(node: TaxonomicNode, depth: int = 0):
    """Print the subtree rooted at ``node``, one label per line.

    Each level is indented by two spaces. When a node has entry indices
    attached, they are printed on an extra line one level deeper, before the
    node's children.
    """
    pad = "  "
    print(pad * depth + node.value)

    below = depth + 1
    if node.entries:
        print(pad * below + f"Entries: {node.entries}")

    for child in node.children.values():
        print_tree(child, below)

# Build the tree from `sorted_entries`, the hierarchically sorted entry list
# produced by the previous cell.
taxonomic_tree = build_taxonomic_tree(sorted_entries)

# Print the whole tree (one node per line, indented by depth) to verify it.
print_tree(taxonomic_tree)

In [None]:
import csv

# Export `sorted_entries` (the final, sorted list of entries) to CSV: one row
# per entry with its index and its tokens joined into a single cell.
# The encoding is set explicitly, matching the UTF-8 reads elsewhere in this
# notebook, so the output does not depend on the platform's default locale.
# newline='' lets the csv module control line endings itself.
with open('tree_output.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Index", "Tokens"])  # Header row
    writer.writerows(
        [entry[0], ", ".join(entry[1])]  # Token array -> one comma-separated string
        for entry in sorted_entries
    )
