In [5]:
import unicodedata
import re

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])

def tokenize_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read().lower()
    # Remove accents
    content = remove_accents(content)
    # Split by comma or newline, then strip whitespace
    tokens = [token.strip() for token in re.split(r'[,\n]', content) if token.strip()]
    return tokens

# Read and tokenize both files
names = tokenize_file('name_segregated_tokens.txt')
places = tokenize_file('place_segregated_tokens.txt')

# Remove duplicates
names = list(set(names))
places = list(set(places))

# Move common tokens from places to names
common_tokens = set(names) & set(places)
names.extend(common_tokens)
places = [place for place in places if place not in common_tokens]

# Sort the lists
names.sort()
places.sort()

print(f"Number of unique names: {len(names)}")
print(f"Number of unique places: {len(places)}")

# Optional: Print the first few elements of each list to verify
print("\nFirst 10 names:")
print(names[:10])
print("\nFirst 10 places:")
print(places[:10])

Number of unique names: 2934
Number of unique places: 2279

First 10 names:
['abaetetuba', 'abaetetuba', 'abel', 'abelardo', 'abelha', 'abensur', 'abraham', 'abreu', 'acacio', 'acarau']

First 10 places:
['abadia/go', 'abadiania', 'abaete', 'abare', 'aberto', 'abre campo/mg', 'acailandia', 'acailandia/ma', 'acajutiba', 'acara']


In [9]:
import numpy as np
import pandas as pd
import re
from collections import Counter
import unicodedata
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple, Optional

# Load the dataset
df = pd.read_csv('unidades.csv')

# Display the first few rows to verify data
df.head()

# Define a function to remove accents
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return "".join([c for c in nfkd_form if not unicodedata.combining(c)])

# Define synonyms
synonyms = {
    'gab': 'gabinete',
    'gab.': 'gabinete',
    'presidencia': 'presidencia',
    'v': 'vara',
    'var': 'vara',
    'vio': 'violencia',
    'c': 'circunscricao',
    'juiza': 'juiz',
    'substituta': 'substituto',
    'dra': 'dr',
    'faz': 'fazenda',
    'fam': 'familia',
    'exma': 'exmo',
    'reg': 'registros',
    'pub': 'publico',
    'publ': 'publico',
    'publica': 'publico',
    'juv': 'juventude',
    'inf': 'infancia',
    'crim': 'criminal',
    'DEECRIM': 'criminal',
    'adj': 'adjunto',
    'cons': 'consumo',
    'jef': 'federal',
    'jud': 'judiciario',
    'desembargadora': 'desembargador',
    'des': 'desembargador',
    'desa': 'desembargador',
    'desemb': 'desembargador',
    'j': 'juizado',
    'jui': 'juizado',
    'civ': 'civel',
    'civeis': 'civel',
    'civil': 'civel',
    'esp': 'especial',
    'especiais': 'especial',
    'educativa': 'educacional',
    'contadoria/tesouraria': 'contadoria',
    'c/mulher': 'mulher',
    'calculos': 'calculo',
    'calc': 'calculo',
    'mulh': 'mulher',
    'adm': 'administracao',
    'amb': 'ambiental',
    'acomp': 'acompanhamento',
    'aten': 'atencao',
    'atend': 'atendimento',
    'aux': 'auxiliar',
    'aval': 'avaliacao',
    'compet': 'competencia',
    'conf': 'conflito',
    'confl': 'conflito',
    'coord': 'coordenacao',
    'cump': 'cumprimento',
    'def': 'defensoria',
    'dep': 'departamento',
    'dist': 'distribuicao',
    'distr': 'distribuicao',
    'exec': 'execucao',
    'fisc': 'fiscal',
    'gest': 'gestao',
    'inform': 'informacao',
    'inq': 'inquerito',
    'jurid': 'juridico',
    'med': 'mediacao',
    'mun': 'municipal',
    'munic': 'municipal',
    'org': 'organizacao',
    'pres': 'presidencia',
    'proc': 'processo',
    'prog': 'programa',
    'proj': 'projeto',
    'prot': 'protocolo',
    'rec': 'recurso',
    'rel': 'relator',
    'sec': 'secretaria',
    'serv': 'servico',
    'sist': 'sistema',
    'tec': 'tecnico',
    'trib': 'tribunal',
    'unid': 'unidade',
    'fg' : 'fig',
    'gm' : 'gmf',
    '[microrregiao' : 'microrregiao',


    # Add more synonyms as needed
}

# Define multi-token replacements
multi_token_replacements = {
    'vt': ['vara', 'trabalho'],
    'cejusc' :['centro','judicial','solucao','conflitos','cidadania'],
    'tr': ['turma', 'recursal'],
    'jit': ['juizado', 'especial','civel'],
    'gades': ['gabinete', 'desembargador'],
    'jesp': ['juizado', 'especial','criminal'],
    'jec': ['juizado', 'especial','civel'],
    'saf': ['servico', 'anexo','fazendas'],
    # Add more multi-token replacements as needed
}

# Function to replace synonyms and multi-token replacements
def replace_synonyms_and_multi_tokens(token):
    if token in multi_token_replacements:
        return multi_token_replacements[token]
    else:
        return [synonyms.get(token, token)]


# Tokenization function
def tokenize_name(name, additional_stopwords=None):
    name = remove_accents(name).lower()
    
    # REPLACE 'CJ' OR 'C J' WITH 'CIRCUNSCRICAO JUDICIAL'
    name = re.sub(r'\bc\s*j\b', 'circunscricao', name)
    
    # REMOVE NUMBERS AND NUMBER-LETTER COMBINATIONS, BUT KEEP THE PRECEDING WORD
    name = re.sub(r'\b(\d+\w*)\b', '', name)
    
    # COMBINE 'GRAU' WITH THE PRECEDING WORD
    name = re.sub(r'(\b\w+\b)\s+grau', r'\1_grau', name)
    
    # COMBINE 'VICE' WITH THE FOLLOWING WORD (WITH SPACE OR HYPHEN) INTO A SINGLE TOKEN
    name = re.sub(r'\b(vice)[-\s]+(\w+)', r'\1_\2', name)

    
    tokens = re.split(r'\s|,|\.|\(|\)|-', name)
    tokens = [token.strip() for token in tokens if token.strip()]

    stopwords = ['de', 'da', 'do', 'das', 'dos', 
                 'e', 'a', 'o', 'i', 'u', 'b', 'as', 'ao',
                 '"', 'em', 'des', 'com', 'n', 'g', 'ap', 'sr', 'sra','/', '\\' ,'?', '\'', '\\\'', '\"gabinete', 'ou',
                'hora', 'solteira', 'villa','zz', '°', '¿',
                'i','ii','iii','iv','v','vi','vii','viii','ix','x',
                'xi','xii','xiii','xiv','xv','xvi','xvii','xviii','xix','xx',
                'xxi','xxii','xxiii','xxiv','xxv','xxvi','xxvii','xxviii','xxix','xxx',
                'xxxi','xxxii','xxxiii','xxxiv','xxxv','xxxvi','xxxvii','xxxviii','xxxix','xl',
                'xli','xlii','xliii','xliv','xlv','xlvi','xlvii','xlviii','xlix','l',
                'li','lii','liii','liv','lv','lvi','lvii','lviii','lix','lx',
                'lxi','lxii','lxiii','lxiv','lxv','lxvi','lxvii','lxviii','lxix','lxx',
                'lxxi','lxxii','lxxiii','lxxiv','lxxv','lxxvi','lxxvii','lxxviii','lxxix','lxxx',
                'lxxxi','lxxxii','lxxxiii','lxxxiv','lxxxv','lxxxvi','lxxxvii','lxxxviii','lxxxix','xc',
                'xci','xcii','xciii','xciv','xcv','xcvi','xcvii','xcviii','xcix','c','sanclerlandia','goianapolis','?',':','ci','cii','varao', '\""']

    if additional_stopwords:
        stopwords.extend(additional_stopwords)
    

    def combine_words(name, stopwords):
        # Define the words to be combined
        words_to_combine = [
            'sao', 'santa', 'santo', 'nova', 'novo', 'bom', 'boa', 
            'alto', 'alta', 'baixo', 'baixa', 'porto', 'campos', 
            'rio', 'foz', 'barra', 'passa', 'entre'
        ]

        # Function to replace word and its following non-stopword
        def replace_word(match):
            word1 = match.group(1)
            word2 = match.group(2)
            if word2.lower() not in stopwords:
                return f'{word1}_{word2}'
            return f'{word1} {word2}'

        # Create a combined regex pattern for all words to be combined
        pattern = r'\b(' + '|'.join(words_to_combine) + r')[\s-]+(\w+)'

        # Apply the replacement
        name = re.sub(pattern, replace_word, name)

        return name
    
    name = combine_words(name, stopwords)

    stopwords.extend([name.lower() for name in names])

    if additional_stopwords:
        stopwords.extend(additional_stopwords)

    tokens = re.split(r'\s|,|\.|\(|\)|-', name)
    tokens = [token.strip() for token in tokens if token.strip() and token not in stopwords]

    
    # PROCESS EACH TOKEN, APPLYING SYNONYMS AND MULTI-TOKEN REPLACEMENTS
    processed_tokens = []
    skip_next = False
    for i, token in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue
        if token and token not in stopwords:
            # HANDLE CASE WHERE 'C' IS FOLLOWED BY 'J'
            if token == 'c' and i + 1 < len(tokens) and tokens[i + 1] == 'j':
                processed_tokens.append('circunscricao')
                skip_next = True
            else:
                processed_tokens.extend(replace_synonyms_and_multi_tokens(token))
    
    # REPLACE UNDERSCORES WITH SPACES IN PRESERVED CONJOINED EXPRESSIONS
    processed_tokens = [token.replace('_', ' ') for token in processed_tokens]
    return processed_tokens

# Apply tokenization and store original names
df['tokens'] = df['nomeUnidade'].apply(lambda x: tokenize_name(x))
df['original_name'] = df['nomeUnidade']

# Apply tokenization and store original names
df['tokens'] = df['nomeUnidade'].apply(lambda x: tokenize_name(x))
df['original_name'] = df['nomeUnidade']

# Flatten list of tokens
all_tokens = [token for sublist in df['tokens'] for token in sublist]

# Calculate frequency of each token
token_counts = Counter(all_tokens)

# Filter tokens with frequency 2 or more
frequent_tokens = [token for token, count in token_counts.items() if count >= 2]

# Combine places and names into a single set for faster lookup
exclude_set = []

# Filter out tokens that appear in places or names lists
filtered_tokens = [token for token in frequent_tokens if token.lower() not in exclude_set]

# Sort filtered tokens alphabetically
filtered_tokens.sort()

# Write filtered tokens to file
with open('tokens_processed.txt', 'w', encoding='utf-8') as f:
    f.write(', '.join(filtered_tokens))

print(f"Number of tokens with frequency 2 or more before filtering: {len(frequent_tokens)}")
print(f"Number of tokens after filtering out places and names: {len(filtered_tokens)}")
print("Filtered tokens have been written to 'tokens_processed.txt'")

Number of tokens with frequency 2 or more before filtering: 2767
Number of tokens after filtering out places and names: 2767
Filtered tokens have been written to 'tokens_processed.txt'
