# In [36]:
from fileinput import filename
import pandas as pd
import os
import glob
import re

# Global set of words that are dropped from an address before it is parsed.
# Entries are grouped by prefix and sorted for easier maintenance; the
# contents of the set are unchanged.
# NOTE(review): the lookup in extract_number_and_complement tests single,
# upper-cased tokens, so the multi-word and mixed-case entries below can never
# match as written -- confirm whether they are still needed.
palavras_ignoradas = {
    # "AND..." / "AP..." entries
    "ANDARAI", "ANDORRA", "ANDRADAS", "ANDRADE", "ANDRADINA", "ANDRE",
    "ANDREA", "ANDREAZZA", "ANDRELINA", "ANDRÉ", "Andorra", "Andrade",
    "APOIO", "APRIGIO",
    # "SAL..." entries
    "SALES", "SALGADO", "SALLA", "SALLE", "SALLES", "SALMÃO", "SALTINHO",
    "SALUSTIANO", "SALVADOR", "SALVATORE", "SALVESTRINI",
    # Multi-word entries
    "N BRCAO", "N CH", "N DIBBI", "N ETN", "N GALPAO", "N GLEBA", "N KM",
    "N KM430", "N KM6", "NA APULCRO", "NS APARECIDA",
    # "NA..." entries
    "NABUCO", "NACOES", "NADER", "NAGANO", "NAGASHIMA", "NAGIB", "NAKATA",
    "NAKAZATO", "NANUQUE", "NARCISO", "NASCENTE", "NASCIMENTO", "NATAL",
    "NATANAEL", "NATINGUI", "NAVES", "NAZARENO", "NAZARETH", "NAZARÉ",
    "NAZEAZENO", "NAÇOES",
    # "NE..." entries
    "NE", "NEBLINA", "NEGRAS", "NEGRI", "NEGRO", "NELSIA", "NELSINA",
    "NELSON", "NEREU", "NERY", "NETO", "NEVES",
    # "NI..." entries
    "NICANOR", "NICOLA", "NICOLAU", "NIEMEYER", "NILO", "Niemeyer",
    # "NO..." entries
    "NOBORU", "NOBRE", "NOBREGA", "NOGUEIRA", "NONATO", "NONOAI",
    "NORDESTINA", "NORONHA", "NORTE", "NOSSA", "NOTARI", "NOTICIAS",
    "NOTÍCIAS", "NOVA", "NOVAES", "NOVE", "NOVELINO", "NOVEMBRO", "NOVICKI",
    "NOVIS", "Noticias",
    # Remaining "N..." entries
    "NCT", "NH", "NS", "NSA", "NU1700", "NUNES", "NY", "NÉBIAS", "NÚMERO",
    "Nébias",
    # Other
    "BLUMENAU",
}

# Patterns used by extract_number_and_complement, compiled once at import time.
# Street type prefix ("AV", "R", "EST", "ROD", ...) followed by non-digit text.
_LOGRADOURO_RE = re.compile(
    r"^(AV|R|EST|ROD|DT|QUADRA|UNIDADE|SÃO|SANTA|VIA|PRAÇA|AL|TRAVESSA|RUA|ALAMEDA|PARQUE|LARGO)[^\d]+"
)
# First run of digits (or the literal "S/N") and everything that follows it.
_NUMERO_COMPLEMENTO_RE = re.compile(r"(\d+|S/N)(.*)")


# Function to extract street components (logradouro, number, complement)
def extract_number_and_complement(lessee_street):
    """Split a raw address string into street, number and complement.

    Words listed in ``palavras_ignoradas`` (compared upper-cased, one
    whitespace-separated token at a time) are removed before parsing.

    Args:
        lessee_street: Raw address text. ``None``/NaN (e.g. an empty
            spreadsheet cell) yields three empty strings; any other
            non-string value is converted with ``str()`` first.

    Returns:
        A ``(logradouro, numero, complemento)`` tuple of strings:

        * ``logradouro`` -- the leading street-type prefix plus the non-digit
          text after it, or ``''`` when the cleaned address does not start
          with a known prefix.
        * ``numero`` -- the first run of digits or ``"S/N"``, or ``''`` when
          none is found.
        * ``complemento`` -- the text after the number with commas removed
          and whitespace normalised, or ``''``.
    """
    if not isinstance(lessee_street, str):
        # Missing cells arrive as None/NaN and have no .split(); treat them
        # as an empty address instead of raising AttributeError.
        if pd.isna(lessee_street):
            return '', '', ''
        lessee_street = str(lessee_street)

    # Drop ignored words and rebuild the address with single spaces.
    tokens = [
        token for token in lessee_street.split()
        if token.upper() not in palavras_ignoradas
    ]
    cleaned_address = " ".join(tokens)

    # Street: must be anchored at the start of the cleaned address.
    street_match = _LOGRADOURO_RE.match(cleaned_address)
    street = street_match.group(0).strip() if street_match else ''

    # Number and complement: first numeric token (or "S/N") anywhere.
    number_match = _NUMERO_COMPLEMENTO_RE.search(cleaned_address)
    if number_match is None:
        return street, '', ''

    number = number_match.group(1).strip()
    # Remove commas, then normalise whitespace. Stripping only before the
    # comma removal would leave a leading space for inputs like ", APTO 5",
    # and removing a comma between two spaces leaves a double space.
    complement = " ".join(number_match.group(2).replace(",", "").split())

    return street, number, complement

# Function to load a DataFrame from an Excel file
def load_excel(file_path, sheet_name=None):
    """Read an Excel workbook through ``pandas.read_excel``.

    Args:
        file_path: Location of the workbook, forwarded unchanged.
        sheet_name: Sheet selector, forwarded unchanged. Per the pandas
            documentation, the default ``None`` makes ``read_excel`` return
            a dict of all sheets rather than a single DataFrame.

    Returns:
        Whatever ``pandas.read_excel`` returns for the given arguments.
    """
    loaded = pd.read_excel(file_path, sheet_name=sheet_name)
    return loaded


# Define paths and filenames
DATA_PATH = 'C:\\Users\\edinocencio\\DataClenupCmsCrm\\DataSources\\'
FINALIZED_PATH = os.path.join(DATA_PATH, 'finalizados')
MASTER_FILE = os.path.join(DATA_PATH, 'PMD-CMS.xlsx')
BASE_CEP_FILE = os.path.join(FINALIZED_PATH, 'base_cep_consolidada.xlsx')
OUTPUT_FILE = os.path.join(FINALIZED_PATH, 'enderecos_processados.xlsx')
SHEET_NAME = 'after synch'

# Load the master sheet
df_master = load_excel(MASTER_FILE, sheet_name=SHEET_NAME)

# The first data row is used as the header row; it is then dropped and only
# the first 13 columns are kept. The copy makes the slice an independent
# frame so that the column assignments below do not act on a view.
HEADERS = df_master.iloc[0, :].tolist()
df_master.columns = HEADERS
df_master = df_master.iloc[1:, 0:13].copy()

# Split each address into street, number and complement.
# extract_number_and_complement returns (street, number, complement), so the
# target columns must be listed in that same order; otherwise every value
# ends up in the wrong column.
df_master[['LesseeStreet', 'Numero', 'Complemento']] = df_master['LesseeStreet'].apply(
    lambda x: pd.Series(extract_number_and_complement(x))
)

# Keep only the columns needed downstream
SELECTED_COLUMNS = [
    'CompanyNumber', 'Branch',
    'ContractNumber', 'Bearbeitungsstand',
    'Rueckstand', 'LesseeStreet', 'Numero',
    'Complemento', 'LesseePostCode', 'LesseeCity',
    'LesseCountry', 'LesseeCounty',
    'LesseeState', 'LesseeName',
]

df_master = df_master.loc[:, SELECTED_COLUMNS]

# Load the CEP mapping sheet and attach it via the old post code
SHEET_NAME = 'CEPS'
cep_mapping = load_excel(MASTER_FILE, sheet_name=SHEET_NAME)
df_master = df_master.merge(
    cep_mapping, left_on='LesseePostCode', right_on='CEP_OLD', how='left'
)

# Duplicate the parsed street into NewLesseeStreet and reorder the columns
FINAL_COLUMNS = [
    'CompanyNumber', 'Branch',
    'ContractNumber', 'Bearbeitungsstand',
    'Rueckstand', 'NewLesseeStreet',
    'LesseeStreet', 'CEP_NEW', 'Numero',
    'Complemento', 'LesseePostCode', 'LesseeCity',
    'LesseCountry', 'LesseeCounty',
    'LesseeState', 'LesseeName',
]
df_master["NewLesseeStreet"] = df_master["LesseeStreet"]
df_master = df_master.loc[:, FINAL_COLUMNS]

# Load the consolidated CEP base and keep only the columns used in the merge
BASE_CEP_API = pd.read_excel(BASE_CEP_FILE)
CEP_COLUMNS = ['cep_validado', 'logradouro', 'bairro', 'cidade', 'uf', 'Estado']
BASE_CEP_API = BASE_CEP_API.loc[:, CEP_COLUMNS]

# Attach the consolidated CEP data via the new post code
df_master = df_master.merge(
    BASE_CEP_API, how='left', left_on='CEP_NEW', right_on='cep_validado'
)
FINAL_COLUMNS = [
    'CompanyNumber', 'LesseeName', 'Branch',
    'ContractNumber', 'Bearbeitungsstand',
    'Rueckstand', 'CEP_NEW', 'NewLesseeStreet', 'logradouro',
    'Numero', 'Complemento', 'bairro', 'cidade',
    'uf', 'Estado',
]

# Save final DataFrame to Excel
df_master = df_master.loc[:, FINAL_COLUMNS]
df_master.to_excel(OUTPUT_FILE, index=False)
