In [1]:
!python3.10 -m pip install guardrails-ai

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!python3.10 -m pip install urllib3

Defaulting to user installation because normal site-packages is not writeable


In [3]:
!guardrails hub install hub://guardrails/detect_pii

Installing hub:[35m/[0m[35m/guardrails/[0m[95mdetect_pii...[0m
[2K[32m[=   ][0m Fetching manifestst
[2K[32m[==  ][0m Downloading dependenciespendencies
[2K[32m[=   ][0m Running post-install setuptall setup
[1A[2K✅Successfully installed guardrails/detect_pii version [1;36m0.0[0m.[1;36m5[0m!


[1mImport validator:[0m
from guardrails.hub import DetectPII

[1mGet more info:[0m
[4;94mhttps://hub.guardrailsai.com/validator/guardrails/detect_pii[0m



In [4]:
!guardrails hub install hub://guardrails/teste --quiet

Installing hub:[35m/[0m[35m/guardrails/[0m[95mteste...[0m
ERROR:guardrails-cli:404
ERROR:guardrails-cli:Not Found
ERROR:guardrails-cli:Failed to install hub://guardrails/teste


In [5]:
from guardrails.validators import (register_validator, Validator, FailResult, ValidationResult, PassResult)
from guardrails import Guard
from guardrails.hub import DetectPII
from typing import Any, Dict, Optional, Callable, List

from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult, Pattern
import re

from transformers import BertTokenizer, BertModel
import torch
import os
import numpy as np
import datetime as dt

In [6]:
if os.path.isdir("./bert_tokenizer") and os.path.isdir("./bert_model"):
    tokenizer = BertTokenizer.from_pretrained("./bert_tokenizer")
    model = BertModel.from_pretrained("./bert_model")
else:
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")
    
    tokenizer.save_pretrained("./bert_tokenizer")
    model.save_pretrained("./bert_model")

In [7]:
"""
Retorna os embeddings do texto passado como parâmetro da função.

returns: Embeddings das palavras.
"""
def generate_embeddings(text: str) -> List[List]:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
    with torch.no_grad():
        outputs = model(**inputs)
        
    embeddings = outputs.last_hidden_state
    sentence_embedding = embeddings.mean(dim=1)
    
    return sentence_embedding


"""
Calcula a similaridae cosseno entre dois textos.
Gera-se os embeddings para cada texto e em seguida, fazemos a similarida usando os embeddings. 

returns: Grau de similaridade em um intervalo fechado entre 0 e 1.
"""
def cosine_similarity(text1: str, text2: str) -> float:
    emb_text1 = generate_embeddings(text1)
    emb_text2 = generate_embeddings(text2)
    
    emb_text1 = emb_text1.cpu().numpy()
    emb_text2 = emb_text2.cpu().numpy()
    
    dot_product = np.dot(emb_text1, emb_text2.T)
    text1Norm = np.linalg.norm(emb_text1)
    text2Norm = np.linalg.norm(emb_text2)
    
    return dot_product / (text1Norm * text2Norm)


"""
Divide o texto em alguns pedaços de frases.

returns: Lista de frases.
"""
def split_text(text, max_length: int = 10) -> List:
    tokens = tokenizer.tokenize(text)
    chunks = [tokenizer.convert_tokens_to_string(tokens[i:i + max_length]) for i in range(0, len(tokens), max_length)]
    
    return chunks


"""
Calcula a similaridade entre dois textos.

returns: Similaridade cosseno.
"""
def similarity_between_texts(text1, text2) -> float:
    split_text1 = split_text(text1)
    split_text2 = split_text(text2)
    
    similarities = []
    
    if len(split_text1) == 0 or len(split_text2) == 0:
        return 0
        
    for text1 in split_text1:
        for text2 in split_text2:
            similarities.append(cosine_similarity(text1, text2)[0][0])

    return np.mean(similarities)


@register_validator(name="guardrails/teste", data_type="string")
class ValidadorDeSimilaridade(Validator):
    def __init__(self, texto1: str, texto2: str, match_type: Optional[str] = None, on_fail: Optional[Callable] = None):        
        super().__init__(on_fail=on_fail, match_type=match_type)
        
        self.texto1 = texto1
        self.texto2 = texto2
        
    def validate(self, value: Any, metadata: Dict = {}) -> ValidationResult:
        similarity = similarity_between_texts(self.texto1, self.texto2)
        
        if similarity < 0.5:
            print(f"{value}: Similaridade baixa (menor que 0.5)")
            return FailResult(error_message="Erro")
        
        print(f"{value}: Similaridade alta (igual ou acima de 0.5)")
        return PassResult()

guard = Guard().use(
    ValidadorDeSimilaridade(texto1="texto1", texto2="texjjjjje  dfqweffqe wqto2")
)

try:
    guard.parse("Agente Inteligente").model_validate
    print("Passou no teste de similaridade cosseno!")
except Exception as e:
    print("Ocorreu um erro: ", e)
        



Agente Inteligente: Similaridade alta (igual ou acima de 0.5)
Passou no teste de similaridade cosseno!


In [8]:
!guardrails hub install hub://guardrails/pii --quiet

Installing hub:[35m/[0m[35m/guardrails/[0m[95mpii...[0m
ERROR:guardrails-cli:404
ERROR:guardrails-cli:Not Found
ERROR:guardrails-cli:Failed to install hub://guardrails/pii


In [12]:
@register_validator(name="guardrails/enrollment", data_type="string")
class PIIValidator(Validator):
    def __init__(self, on_match: Optional[str] = None, on_fail: Optional[Callable] = None):
        super().__init__(on_match=on_match, on_fail=on_fail)
        
    def validate(self, value: str, metadata: Dict = {}) -> ValidationResult:
        year = str(dt.datetime.now().year)[2:]
        
        enrollment = r"\b\d{1}[1-" + year[0] + r"]" + r"[0-" + year[1] + r"]" + r"[12]\d{5}\b"
        enrollment_pattern = Pattern(name="Enrollment_Pattern", regex=enrollment, score=0.6)
        
        enrollment_recognizer = PatternRecognizer(name="MATRICULA", patterns=[enrollment_pattern], supported_entity="Enrollment_Pattern", supported_language="en")
        
        analyzer = AnalyzerEngine()
        analyzer.registry.add_recognizer(enrollment_recognizer)
        
        results = analyzer.analyze(text=value, entities=["Enrollment_Pattern"], language="en")
        
        if results:
            for result in results:
                start, end = result.start, result.end
                value = value[:start] + "<MATRICULA>" + value[end:]
            return FailResult(error_message=value)
        
        return PassResult()



guardas_eureca = Guard().use(PIIValidator)

try:
    guardas_eureca.parse("A minha matricula é o seguinte: 221199999, busque informaões sobre ela").model_validate
except Exception as e:
    print(e)



Validation failed for field with errors: A minha matricula é o seguinte: <MATRICULA>, busque informaões sobre ela


In [10]:

@register_validator(name="guardrails/cpf", data_type="string")
class PIIValidatorCPF(Validator):
    def __init__(self, on_match: Optional[str] = None, on_fail: Optional[Callable] = None):
        super().__init__(on_match=on_match, on_fail=on_fail)
        
    def validate(self, value: str, metadata: Dict = {}) -> ValidationResult:
        year = str(dt.datetime.now().year)[2:]
        
        cpf_regex = r"\b\d{3}\.*\d{3}\.*\d{3}-*\d{2}\b"
        cpf_pattern = Pattern(name="CPF_Pattern", regex=cpf_regex, score=0.6)
        
        cpf_recognizer = PatternRecognizer(name="MATRICULA", patterns=[cpf_pattern], supported_entity="CPF_Pattern", supported_language="en")
        
        analyzer = AnalyzerEngine()
        analyzer.registry.add_recognizer(cpf_recognizer)
        
        results = analyzer.analyze(text=value, entities=["CPF_Pattern"], language="en")
        
        if results:
            for result in results:
                start, end = result.start, result.end
                value = value[:start] + "<CPF>" + value[end:]
            return FailResult(error_message=value)
        
        return PassResult()



guardas_eureca = Guard().use(PIIValidatorCPF)

try:
    result = guardas_eureca.parse("O meu CPF é o seguinte: 99999999999, busque informaões sobre ele").model_validate
except Exception as e:
    print(e)



Validation failed for field with errors: O meu CPF é o seguinte: <CPF>, busque informaões sobre ele


In [11]:
guarda_email = Guard().use(DetectPII(pii_entities="pii", on_fail="fix"))

text = """
Meus emails são demo@lol.com ou dominio@gmail.com ou dominio@hotmail.com e dominio@hotmail.com.br; 
e os meus números de telefones são esses (99) 999999999 ou (99)999999999 ou 99999999999 e 99999999999, 
busque no google.com.br ou uol.com. 
"""
output = guarda_email.parse(
    llm_output=text,
    metadata={"pii_entities": ["EMAIL_ADDRESS", "URL", "PHONE_NUMBER"]},
)

print(output.validated_output)


Meus emails são <EMAIL_ADDRESS> ou <EMAIL_ADDRESS> ou <EMAIL_ADDRESS> e <EMAIL_ADDRESS>; 
e os meus números de telefones são esses <PHONE_NUMBER> ou <PHONE_NUMBER> ou <PHONE_NUMBER> e <PHONE_NUMBER>, 
busque no <URL> ou <URL>. 

