In [10]:
import numpy as np
import pandas as pd
import re
from collections import defaultdict, Counter
import unicodedata
import csv

# Load the dataset (only the 'nomeUnidade' column)
df = pd.read_csv('unidades.csv', usecols=['nomeUnidade'])

# NOTE: a bare expression is only rendered when it is the last statement of a
# cell, so this call does not display anything from the middle of the cell.
df.head()

# Load hierarchical groups from CSV.
# Result: {level name: [tokens...]}, with levels and tokens kept in file order.
hierarchical_groups = {}
# newline='' is what the csv module requires of the file object so that it can
# handle line endings (and newlines embedded in quoted fields) itself.
with open('hierarchical_groups.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
    for row in reader:
        if not row:
            # csv.reader yields an empty list for blank lines (e.g. a trailing
            # newline at the end of the file); unpacking it would raise ValueError.
            continue
        # Rows with a column count other than two still fail loudly here.
        level, token = row
        hierarchical_groups.setdefault(level, []).append(token)

# Display the hierarchical groups to verify
for name, group in hierarchical_groups.items():
    print(f"{name}: {group}")


Domain: ['jurisprudencia', 'extraordinaria', 'vice presidencia', 'camaras', 'orgao', 'protocolo', 'centro', 'cartorio', 'juiz', 'judiciaria', 'controle', 'vara', 'processamento', 'apoio', 'unificada', 'consumidor', 'juizado', 'prudente']
Kingdom: ['coman', 'tutelar', 'ativa', 'operacional', 'mista', 'vinculada', 'pericias', 'gabinete', 'proger', 'diretoria', 'processual', 'distribuicao', 'servico', 'judiciario', 'civel', 'turmas', 'virtual', 'contra', 'pg3', 'execucao', 'justica', 'criminais', 'civeis', 'sucessoes', 'publicos']
Phylum: ['dtr', 'trf', 'substituto', 'camara', 'unica', 'auxiliar', 'assessoria', 'secao', 'sede', 'nucleo', 'recursal', 'itinerante', 'divisao', 'consumo', 'violencia', 'pre', 'central', 'tribunal', 'uniformizacao', 'alto', 'inativo', 'fazenda', 'posto', 'coordenadoria', 'fazendas', 'governador', 'infancia', 'contagem', 'penais', 'penal']
Class: ['cumulativa', 'plena', 'bacenjud', 'ssj', 'dcp', 'microrregiao', 'titular', 'custodia', 'auditoria', 'eleitoral', 'c

In [11]:
# Strip diacritics from a string
def remove_accents(input_str):
    """Return ``input_str`` with every combining mark removed.

    The text is first decomposed with NFKD normalization; each character
    that ``unicodedata.combining`` reports as a combining mark is then
    dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_characters = filter(lambda ch: not unicodedata.combining(ch), decomposed)
    return "".join(base_characters)

# Define synonyms: maps an abbreviated or variant token to its canonical token.
# Lookups are done on tokens produced by tokenize_name, which lower-cases the
# name and splits it on '.', so keys must be lower-case to ever match.
synonyms = {
    'gab': 'gabinete',
    'gab.': 'gabinete',  # never produced by tokenize_name ('.' is a split character)
    'presidencia': 'presidencia',  # identity mapping
    'v': 'vara',
    'vio': 'violencia',
    'c': 'circunscricao',
    'juiza': 'juiz',
    'substituta': 'substituto',
    'faz': 'fazenda',
    'fam': 'familia',
    'exma': 'exmo',
    'reg': 'registros',
    'pub': 'publico',
    'juv': 'juventude',
    'inf': 'infancia',
    'crim': 'criminal',
    # Lower-case key added: the original upper-case key could not match a
    # lower-cased token. The upper-case key is kept for backward compatibility.
    'deecrim': 'criminal',
    'DEECRIM': 'criminal',
    'adj': 'adjunto',
    'cons': 'consumo',
    'jef': 'federal',
    'jud': 'judiciario',
    'desembargadora': 'desembargador',
    'des': 'desembargador',
    'desa': 'desembargador',
    'desemb': 'desembargador',
    'j': 'juizado',
    'jui': 'juizado',
    'civ': 'civel',
    'esp': 'especial',
    'especiais': 'especial'
    # Add more synonyms as needed
}

# Abbreviations that expand into several tokens: abbreviation -> list of tokens.
# Each expansion is written as a space-separated phrase and split into its tokens.
multi_token_replacements = {
    abbreviation: expansion.split()
    for abbreviation, expansion in (
        ('vt', 'vara trabalho'),
        ('cejusc', 'centro judicial solucao conflitos cidadania'),
        ('tr', 'turma recursal'),
        ('jit', 'juizado especial civel'),
        ('gades', 'gabinete desembargador'),
        ('jesp', 'juizado especial criminal'),
        ('jec', 'juizado especial civel'),
        ('saf', 'servico anexo fazendas'),
        # Add more multi-token replacements as needed
    )
}

# Expand one token into the list of tokens that should replace it
def replace_synonyms_and_multi_tokens(token):
    """Return the replacement token list for ``token``.

    A token listed in ``multi_token_replacements`` yields that table's list
    (the stored list object itself, not a copy). Any other token yields a
    one-element list holding its entry in ``synonyms``, or the token itself
    when it has no synonym.
    """
    if token not in multi_token_replacements:
        canonical = synonyms.get(token, token)
        return [canonical]
    return multi_token_replacements[token]

# Base stopwords for tokenize_name. Built once as a frozenset so that the
# collection is not recreated on every call and membership tests are hash
# lookups instead of linear scans over a long list.
_TOKENIZER_STOPWORDS = frozenset([
    'de', 'da', 'do', 'das', 'dos',
    'e', 'a', 'o', 'i', 'u', 'b', 'as', 'ao',
    '"', 'em', 'des', 'com', 'n',
    'rio', 'paulo', 'sao', 'bom', 'monte', 'montes', 'jesus', 'boa', 'ponta',
    'joao', 'jose', 'maria', 'santa','ferreira', 'martins', 'alves','antonio','luis', 'santos',
    'porto', 'belo', 'nova', 'sul', 'campo', 'oliveira','luiz','fortaleza', 'goiania', 'curitiba', 'silvia',
    'carlos','grande','silva','francisco', 'pedro', 'lucas','ana', 'francisca', 'antonia','juliana', 'julia','fernando',
    'fernanda','marcos','gabriel','adriana','marcia','souza','sousa','rodrigues','aves','pereira','lima','costa',
    'dr','dra', 'janeiro','vitoria','salvador','brasilia','fortaleza','horizonte','manaus','curitiba','recife','goiania',
    'santo','campos','filho','natal', 'primeira', 'segunda','iguacu','santana', 'preto','barra','funda','alegre',
    'alegre/rs','norte','cruz','sr', 'sra', 'cuiaba','belem','paulista','serra','ribeirao','neto','piaui','eduardo','roberto',
    'caxias','campinas','terceira','sul/rs','branco','almeida','novo','miguel','andre','carvalho','londrina','guarulhos',
    'goncalo', 'aparecida', 'sebastiao','teresina','/','ribeiro','gomes','campina','augusto','verde', 'ii','i','iii',
    'ricardo', 'bernardo','niteroi', 'vila','maringa','rocha','alberto','sorocaba','uberlandia','dois','tres','oeste','leste',
    'ap','amaro','cesar','machado','jorge','castro', 'marcelo', 'minas','henrique', 'cabo','vista','guimaraes', 'alexandre',
    'aracaju', 'cascavel', 'vicente', 'assis', 'lopes', 'velho', 'moraes', 'goncalves', 'conceicao', 'foz','franca','neves',
    'mendes','marques', 'cruzes', 'mogi', "em grau", 'grossa','feira', 'mesquita', 'rosa', 'junior','teixeira', 'jardim',
    'lagoas', 'fernandes', 'varzea', 'pinheiro', 'duque', 'aguas', 'goias', 'araujo', 'barbosa', 'nogueira', 'cristina',
    'guararapes', 'taguatinga', 'dias', 'helena', 'bauru', 'cunha', 'quarta','penas', 'sergio', 'soares', 'joinville',
    'claro','andrade', 'macapa', 'gama', 'pinto', 'mt', 'juazeiro', 'batista', 'jaboatao', 'ms', 'freitas', 'vieira', 'palmas',
    'mossoro', 'osasco', 'piracicaba', 'joaquim', 'patos', 'vida', 'pinhais', 'petropolis', 'marco', 'saude', 'coelho', 'reis',
    'lucia', 'fonseca', 'branca', 'olinda', 'anapolis', 'parnaiba', 'df', 'aracatuba', 'toledo', 'claudio', 'leite', 'fora',
])

# Tokenization function
def tokenize_name(name, additional_stopwords=None):
    """Split a unit name into a list of normalized tokens.

    Steps, in order:
      1. strip accents (``remove_accents``) and lower-case the name;
      2. rewrite 'cj' / 'c j' as 'circunscricao';
      3. delete words that start with a digit;
      4. glue 'grau' to the word before it and 'vice' to the word after it,
         so each pair survives splitting as one token;
      5. split on whitespace, ',', '.', '(', ')' and '-';
      6. drop empty tokens and stopwords, and expand the rest through
         ``replace_synonyms_and_multi_tokens``;
      7. turn the glue underscores back into spaces.

    Args:
        name: the raw unit name (a string).
        additional_stopwords: optional iterable of extra tokens to drop, in
            addition to the built-in stopwords.

    Returns:
        A list of token strings, in order of appearance.
    """
    name = remove_accents(name).lower()

    # REPLACE 'CJ' OR 'C J' WITH 'CIRCUNSCRICAO'
    name = re.sub(r'\bc\s*j\b', 'circunscricao', name)

    # REMOVE NUMBERS AND NUMBER-LETTER COMBINATIONS, BUT KEEP THE PRECEDING WORD
    name = re.sub(r'\b(\d+\w*)\b', '', name)

    # COMBINE 'GRAU' WITH THE PRECEDING WORD
    name = re.sub(r'(\b\w+\b)\s+grau', r'\1_grau', name)

    # COMBINE 'VICE' WITH THE FOLLOWING WORD (WITH SPACE OR HYPHEN) INTO A SINGLE TOKEN
    name = re.sub(r'\b(vice)[-\s]+(\w+)', r'\1_\2', name)

    tokens = re.split(r'\s|,|\.|\(|\)|-', name)

    stopwords = _TOKENIZER_STOPWORDS
    if additional_stopwords:
        # union() builds a new set, so the shared base set is never mutated
        stopwords = stopwords.union(additional_stopwords)

    # Process each token, applying synonyms and multi-token replacements
    processed_tokens = []
    skip_next = False
    for i, token in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue
        if not token:
            continue
        # Glued expressions still carry '_' at this point, while multi-word
        # stopwords such as "em grau" are written with a space. Check both
        # spellings, otherwise those stopwords can never match.
        if token in stopwords or token.replace('_', ' ') in stopwords:
            continue
        # Handle case where 'C' is followed by 'J' as separate tokens (e.g. 'c.j')
        if token == 'c' and i + 1 < len(tokens) and tokens[i + 1] == 'j':
            processed_tokens.append('circunscricao')
            skip_next = True
        else:
            processed_tokens.extend(replace_synonyms_and_multi_tokens(token))

    # Replace underscores with spaces in preserved conjoined expressions
    return [token.replace('_', ' ') for token in processed_tokens]

# Tokenize every unit name; each cell of 'tokens' holds the list returned by tokenize_name
df['tokens'] = df['nomeUnidade'].apply(tokenize_name)


In [12]:
# Look up which hierarchical level a token belongs to
def categorize_token_by_hierarchy(token, hierarchical_groups):
    """Return the name of the first level whose group contains ``token``.

    Levels are examined in the iteration order of ``hierarchical_groups``
    (a mapping of level name -> collection of tokens). When no group
    contains the token, the string 'other' is returned.
    """
    matching_levels = (
        level
        for level, members in hierarchical_groups.items()
        if token in members
    )
    return next(matching_levels, 'other')

# Expressivity ratio of a token with respect to one group of tokens
def calculate_expressivity_ratio(token, group, token_stats):
    """Return 1 minus the share of ``token``'s count that co-occurs with ``group``.

    ``token_stats[token]`` must provide 'count' and a 'cooccurrences'
    mapping. The co-occurrence counts of every member of ``group`` are
    summed (members missing from the mapping count as 0) and divided by
    the token's count. When the count is not greater than zero, 0 is returned.
    """
    stats = token_stats[token]
    occurrences = stats['count']
    shared = 0
    for member in group:
        shared += stats['cooccurrences'].get(member, 0)
    if occurrences > 0:
        return 1 - (shared / occurrences)
    return 0

# Calculate token statistics.
# token_stats maps each distinct token to:
#   'count'         - how many times the token appears across all rows
#   'cooccurrences' - for every other token, how many times it was seen in the
#                     same row (counted once per occurrence of either token)
token_stats = {
    token: {'count': count, 'cooccurrences': defaultdict(int)}
    for token, count in Counter(
        token for tokens_row in df['tokens'] for token in tokens_row
    ).items()
}

# Every token met here already has an entry, because token_stats was built
# from these same rows.
for tokens_row in df['tokens']:
    for token in tokens_row:
        for co_token in tokens_row:
            if co_token == token:
                continue  # a token does not co-occur with itself
            token_stats[token]['cooccurrences'][co_token] += 1

# Build the classification string for one entry and choose its species token
def classify_entry(tokens, hierarchical_groups, token_stats):
    """Return the combined classification notation for one tokenized entry.

    Tokens that belong to a hierarchical level are grouped by that level.
    Within a level they are sorted and joined with ','; non-empty levels are
    joined with ':' in the order of ``hierarchical_groups``. Among the tokens
    that belong to no level, the one with the lowest mean expressivity ratio
    over all groups (the first such token on ties) is appended as
    ' | <token>'.
    """
    tokens_by_level = {}
    leftovers = []
    for token in tokens:
        level = categorize_token_by_hierarchy(token, hierarchical_groups)
        if level == 'other':
            leftovers.append(token)
        else:
            tokens_by_level.setdefault(level, []).append(token)

    # One sorted token list per level, following the hierarchy's own order
    ordered_levels = {}
    for level in hierarchical_groups.keys():
        ordered_levels[level] = sorted(tokens_by_level.get(level, []))

    # Species: the unclassified token with the lowest average expressivity ratio
    species_token = None
    best_score = float('inf')
    for candidate in leftovers:
        ratios = [
            calculate_expressivity_ratio(candidate, members, token_stats)
            for members in hierarchical_groups.values()
        ]
        score = np.mean(ratios)
        if score < best_score:
            best_score, species_token = score, candidate

    # Combined notation: levels without tokens are left out entirely
    segments = [",".join(members) for members in ordered_levels.values() if members]
    combined_classification = ":".join(segments)
    if species_token:
        combined_classification += f" | {species_token}"

    return combined_classification

# Classify every tokenized entry; the groups and statistics are passed as extra positional arguments
df['classification'] = df['tokens'].apply(classify_entry, args=(hierarchical_groups, token_stats))

# Show the first classified entries
df.head()


Unnamed: 0,nomeUnidade,tokens,classification
0,GAB. JUIZ JOSE CARLOS COELHO E SOUZA,"[gabinete, juiz]",juiz:gabinete
1,CENTRAL DE MANDADOS DE BLUMENAU,"[central, mandados, blumenau]",central:mandados | blumenau
2,2ª VARA FEDERAL DE BLUMENAU,"[vara, federal, blumenau]",vara:federal | blumenau
3,3ª VARA FEDERAL DE BLUMENAU,"[vara, federal, blumenau]",vara:federal | blumenau
4,4ª VARA FEDERAL DE BLUMENAU,"[vara, federal, blumenau]",vara:federal | blumenau


In [13]:
# Split classification into hierarchical levels and species
# The pattern '[:|]' splits on either ':' or '|'. classify_entry joins levels
# with ':' and appends the species as ' | <token>', so pieces next to a '|'
# keep their surrounding spaces.
# NOTE(review): classify_entry leaves out levels that have no tokens, so the
# pieces are positional: the first piece lands in 'Domain' whatever level it
# came from (e.g. 'vara:federal | blumenau' fills Domain, Kingdom, Phylum).
# NOTE(review): presumably this assignment only works when the split yields
# exactly eight columns for this data set - TODO confirm.
df[['Domain', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']] = df['classification'].str.split('[:|]', expand=True)

# Initial alphabetical sort for all entries
df = df.sort_values(by=['classification'])

# Ground-up alphabetical ordering for each level
hierarchical_levels = ['Species', 'Genus', 'Family', 'Order', 'Class', 'Phylum', 'Kingdom', 'Domain']

# Each pass re-sorts the whole frame by a growing prefix of hierarchical_levels
# (['Species'], then ['Species', 'Genus'], ...); the last pass sorts by all
# eight columns with 'Species' as the primary key.
# NOTE(review): every pass reassigns df, so the earlier passes can only affect
# the order of rows that tie on all eight keys - confirm whether a single sort
# was intended.
for level in hierarchical_levels:
    # All eight columns were created by the split above, so this guard holds here.
    if level in df.columns:
        df = df.sort_values(by=hierarchical_levels[:hierarchical_levels.index(level) + 1])

# Combine classification back into a single string
# Pieces are joined with ' | ' in hierarchical_levels order (the 'Species'
# column first), skipping missing values. This overwrites the ':'-separated
# notation produced by classify_entry.
df['classification'] = df[hierarchical_levels].apply(lambda x: ' | '.join(x.dropna()), axis=1)

# Drop the temporary hierarchical columns
df = df.drop(columns=hierarchical_levels)

# Export classified entries to a new CSV
df[['nomeUnidade', 'classification']].to_csv('classified_unidades.csv', index=False)

# Display the final sorted DataFrame
df.head()


Unnamed: 0,nomeUnidade,tokens,classification
19617,"0POSSE - 1ª VARA (CÍVEL, CRIMINAL %u2013 CRIME...","[vara, civel, criminal, %u2013, crimes, geral,...",%u2013 | criminal | juventude | geral | crim...
19868,0 CEJUSC - Abaeté (Pré-Processual),"[centro, judicial, solucao, conflitos, cidadan...",abaete | solucao | judicial | conflitos | ci...
11769,CEJUSC - PRE-PROCESSUAL - POSTO SAUDE-ABRAMGE,"[centro, judicial, solucao, conflitos, cidadan...",abramge | solucao | judicial | conflitos | c...
11755,CEJUSC - PRE-PROCESSUAL - ACE,"[centro, judicial, solucao, conflitos, cidadan...",ace | solucao | judicial | conflitos | cidad...
22225,VARA DOS FEITOS RELATIVOS ÀS RELAÇÕES DE CONSU...,"[vara, feitos, relativos, relacoes, consumo, c...","acidente | registro,relativos | feitos,publi..."
