In [1]:
import pandas as pd 
import numpy as np 
import torch
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
from presidio_analyzer import AnalyzerEngine
from typing import List
from presidio_anonymizer import AnonymizerEngine

from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts
from transformers import pipeline

In [None]:
##Uses mistral 7B model

tokenizer = AutoTokenizer.from_pretrained("/Users/dkim1998/Documents/Mistral_7b_v2") #Note it seems using 13B use so much memory
model = AutoModelForCausalLM.from_pretrained("/Users/dkim1998/Documents/Mistral_7b_v2")

# Encode some input text
inputs = tokenizer("who has the strongest military in the world in 2023", return_tensors="pt")

# Generate a response
outputs = model.generate(**inputs, max_length=50) #uses up to 30 gigabyte of ram just to run it
generated_text = tokenizer.decode(outputs[0])
print(generated_text)

##Importatn info about mistral 7b
#Loading checkpoint shards: 100%|██████████| 3/3 [00:14<00:00,  4.94s/it]
#The model 'MistralForCausalLM' is not supported for token-classification. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassification', 'GPTBigCodeForTokenClassification', 'GPTNeoForTokenClassification', 'GPTNeoXForTokenClassification', 'IBertForTokenClassification', 'LayoutLMForTokenClassification', 'LayoutLMv2ForTokenClassification', 'LayoutLMv3ForTokenClassification', 'LiltForTokenClassification', 'LongformerForTokenClassification', 'LukeForTokenClassification', 'MarkupLMForTokenClassification', 'MegaForTokenClassification', 'MegatronBertForTokenClassification', 'MobileBertForTokenClassification', 'MPNetForTokenClassification', 'MptForTokenClassification', 'MraForTokenClassification', 'NezhaForTokenClassification', 'NystromformerForTokenClassification', 'PhiForTokenClassification', 'QDQBertForTokenClassification', 'RemBertForTokenClassification', 'RobertaForTokenClassification', 'RobertaPreLayerNormForTokenClassification', 'RoCBertForTokenClassification', 'RoFormerForTokenClassification', 'SqueezeBertForTokenClassification', 'XLMForTokenClassification', 'XLMRobertaForTokenClassification', 'XLMRobertaXLForTokenClassification', 'XLNetForTokenClassification', 'XmodForTokenClassification', 'YosoForTokenClassification'].
#Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

In [None]:
##Lets combine the LLM model with presidio 
####Comparing it to Microsoft Presidio to identifiy PII info (01/07/2024)
from presidio_analyzer import AnalyzerEngine

# Specify the file path
file_path = "/Users/dkim1998/Downloads/LLM_PII_Project_Code/fake_pii_data_unlabeled_training.txt"

# Read the content of the file
with open(file_path, 'r') as file:
    text_data = file.read()

# Create an instance of the AnalyzerEngine
analyzer = AnalyzerEngine()

# Analyze the text to detect PII
results = analyzer.analyze(text=text_data, language='en')

# Print the results
print("Detected PII:")
for result in results:
    pii_type = result.entity_type
    start_index = result.start
    end_index = result.end
    pii_text = text_data[start_index:end_index]
    print(f"Detected {pii_type}: {pii_text}")

##It works pretty well, much faster but not as perfect

In [None]:
#This is from the  documents: https://www.philschmid.de/pii-huggingface-sagemaker
class TransformersRecognizer(EntityRecognizer):
    def __init__(self,model_id_or_path=None,aggregation_strategy="average",supported_language="en",ignore_labels=["O","MISC"]):
      # inits transformers pipeline for given mode or path
      self.pipeline = pipeline("token-classification",model="/Users/dkim1998/Documents/Mistral_7b_v2",aggregation_strategy="average",ignore_labels=ignore_labels)
      # map labels to presidio labels
      self.label2presidio={
            "PER": "PERSON",
            "LOC": "LOCATION",
            "ORG": "ORGANIZATION",
            "ADR": "ADDRESS",
            "JOB": "JOB TITLE",
      }

      # passes entities from model into parent class
      super().__init__(supported_entities=list(self.label2presidio.values()),supported_language=supported_language)

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self, text: str, entities: List[str]=None, nlp_artifacts: NlpArtifacts=None
    ) -> List[RecognizerResult]:
        """
        Extracts entities using Transformers pipeline
        """
        results = []

        # keep max sequence length in mind
        predicted_entities = self.pipeline(text)
        if len(predicted_entities) >0:
          for e in predicted_entities:
            converted_entity = self.label2presidio[e["entity_group"]]
            if converted_entity in entities or entities is None:
              results.append(
                  RecognizerResult(
                      entity_type=converted_entity,
                      start=e["start"],
                      end=e["end"],
                      score=e["score"]
                      )
                  )
        return results


In [None]:
        "name": "Dr. Valerie Walker",
        "address": "PSC 3017, Box 4694\nAPO AA 82933",
        "email": "brownshannon@example.com",
        "phone": "857.533.0228x341",
        "ssn": "767-28-1557",
        "driver_license": "LV3 6414",
        "passport_number": "AGPS21459078",
        "credit_card": "American Express\nJanet Kaiser\n348336575359269 06/27\nCID: 1739\n",
        "bank_account": "PWEB40659151519708",
        "company": "Gilbert Ltd",
        "job_title": "Surveyor, land/geomatics",
        "username": "jaclynreed",
        "password": "cuB2ZAkto!",
        "url": "https://wright-day.com/",
        "date_of_birth": "2010-05-25",
        "medicine_name": "Throughout data.",
        "disease_name": "Number sort.",
        "latitude_longitude": "59.6460745, -150.368905",
        "random_text": "Report market place whatever strong. Will law water life heart.",
        "isbn": "978-0-12-463747-4"

In [7]:
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine
from typing import List

from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts
from transformers import pipeline

# load spacy model -> workaround
#import os
#os.system("spacy download en_core_web_lg")

# list of entities: https://microsoft.github.io/presidio/supported_entities/#list-of-supported-entities
DEFAULT_ANOYNM_ENTITIES = [
    "CREDIT_CARD",
    "CRYPTO",
    "DATE_TIME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "MEDICAL_LICENSE",
    "URL",
    "ORGANIZATION",
    "US_SSN"
]

# init anonymize engine
engine = AnonymizerEngine()

class HFTransformersRecognizer(EntityRecognizer):
    def __init__(
        self,
        model_id_or_path=None,
        aggregation_strategy="simple",
        supported_language="en",
        ignore_labels=["O", "MISC"],
    ):
        # inits transformers pipeline for given mode or path
        self.pipeline = pipeline(
            "text-classification", model="/Users/dkim1998/Documents/Mistral_7b_v2", ignore_labels=ignore_labels
        )
        # map labels to presidio labels
        self.label2presidio = {
            "PER": "PERSON",
            "LOC": "LOCATION",
            "ORG": "ORGANIZATION",
            "ADR": "ADDRESS",
            "JOB": "JOB TITLE",
            
            
        }

        # passes entities from model into parent class
        super().__init__(supported_entities=list(self.label2presidio.values()), supported_language=supported_language)

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
    ) -> List[RecognizerResult]:
        """
        Extracts entities using Transformers pipeline
        """
        results = []

        # keep max sequence length in mind
        predicted_entities = self.pipeline(text)
        if len(predicted_entities) > 0:
            for e in predicted_entities:
                converted_entity = self.label2presidio[e["entity_group"]]
                if converted_entity in entities or entities is None:
                    results.append(
                        RecognizerResult(
                            entity_type=converted_entity, start=e["start"], end=e["end"], score=e["score"]
                        )
                    )
        return results


def model_fn(model_dir):
    transformers_recognizer = HFTransformersRecognizer(model_dir)
    # Set up the engine, loads the NLP module (spaCy model by default) and other PII recognizers
    analyzer = AnalyzerEngine()
    analyzer.registry.add_recognizer(transformers_recognizer)
    return analyzer


def predict_fn(data, analyzer):
    sentences = data.pop("inputs", data)
    if "parameters" in data:
        anonymization_entities = data["parameters"].get("entities", DEFAULT_ANOYNM_ENTITIES)
        anonymize_text = data["parameters"].get("anonymize", False)
    else:
        anonymization_entities = DEFAULT_ANOYNM_ENTITIES
        anonymize_text = False

    # identify entities
    results = analyzer.analyze(text=sentences, entities=anonymization_entities, language="en")
    # anonymize text
    if anonymize_text:
        result = engine.anonymize(text=sentences, analyzer_results=results)
        return {"anonymized": result.text}

    return {"found": [entity.to_dict() for entity in results]}


In [8]:
# Initialize the HFTransformersRecognizer and AnalyzerEngine
analyzer = model_fn("/Users/dkim1998/Documents/Mistral_7b_v2")  # Replace with your model path or identifier

# Specify the file path for your text data
file_path = "/Users/dkim1998/Downloads/LLM_PII_Project_Code/fake_pii_data_unlabeled_training.txt"

# Read the content of the file
with open(file_path, 'r') as file:
    text_data = file.read()

# Prepare the data for processing
data_to_process = {
    "inputs": text_data,
    "parameters": {
        "entities": DEFAULT_ANOYNM_ENTITIES,
        "anonymize": False  # Change to True if anonymization is needed
    }
}

# Process the text data
processed_data = predict_fn(data_to_process, analyzer)

# Output the results
print(processed_data)

Loading checkpoint shards: 100%|██████████| 3/3 [00:22<00:00,  7.40s/it]
Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at /Users/dkim1998/Documents/Mistral_7b_v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'ignore_labels'

In [3]:
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine
from typing import List

from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts
from transformers import pipeline

DEFAULT_ANOYNM_ENTITIES = [
    "CREDIT_CARD",
    "CRYPTO",
    "DATE_TIME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "MEDICAL_LICENSE",
    "URL",
    "ORGANIZATION",
    "US_SSN"
]

class HFTransformersRecognizer(EntityRecognizer):
    def __init__(
        self,
        model_id_or_path=None,
        aggregation_strategy="simple",
        supported_language="en",
        ignore_labels=["O", "MISC"],
    ):
        # inits transformers pipeline for given mode or path
        self.pipeline = pipeline(
            "text-classification", model=model_id_or_path, ignore_labels=ignore_labels, aggregation_strategy=aggregation_strategy
        )
        # map labels to presidio labels
        self.label2presidio = {
            "PER": "PERSON",
            "LOC": "LOCATION",
            "ORG": "ORGANIZATION",
            "ADR": "ADDRESS",
            "JOB": "JOB TITLE",
            
            
        }

        # passes entities from model into parent class
        super().__init__(supported_entities=list(self.label2presidio.values()), supported_language=supported_language)

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
    ) -> List[RecognizerResult]:
        """
        Extracts entities using Transformers pipeline
        """
        results = []

        # keep max sequence length in mind
        predicted_entities = self.pipeline(text)
        if len(predicted_entities) > 0:
            for e in predicted_entities:
                converted_entity = self.label2presidio[e["entity_group"]]
                if converted_entity in entities or entities is None:
                    results.append(
                        RecognizerResult(
                            entity_type=converted_entity, start=e["start"], end=e["end"], score=e["score"]
                        )
                    )
        return results


# Function to initialize the model with your LLM
def model_fn(model_dir):
    transformers_recognizer = HFTransformersRecognizer(model_dir)
    analyzer = AnalyzerEngine()
    analyzer.registry.add_recognizer(transformers_recognizer)
    return analyzer

# Function for prediction
def predict_fn(data, analyzer):
    sentences = data.pop("inputs", data)
    if "parameters" in data:
        anonymization_entities = data["parameters"].get("entities", DEFAULT_ANOYNM_ENTITIES)
        anonymize_text = data["parameters"].get("anonymize", False)
    else:
        anonymization_entities = DEFAULT_ANOYNM_ENTITIES
        anonymize_text = False

    # identify entities
    results = analyzer.analyze(text=sentences, entities=anonymization_entities, language="en")
    # anonymize text
    if anonymize_text:
        result = engine.anonymize(text=sentences, analyzer_results=results)
        return {"anonymized": result.text}

    return {"found": [entity.to_dict() for entity in results]}

# Initialize the Analyzer with your LLM model path
analyzer = model_fn("/Users/dkim1998/Documents/llama7b_Jan_2024_hf_only")

# Reading text file
file_path = "/Users/dkim1998/Downloads/LLM_PII_Project_Code/fake_pii_data_unlabeled_training.txt"
with open(file_path, 'r') as file:
    text_data = file.read()

# Prepare the data for processing
data_to_process = {
    "inputs": text_data,
    "parameters": {
        "entities": DEFAULT_ANOYNM_ENTITIES,
        "anonymize": False
    }
}

# Process the text data
processed_data = predict_fn(data_to_process, analyzer)

# Output the results
print(processed_data)


Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.21s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /Users/dkim1998/Documents/llama7b_Jan_2024_hf_only and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'ignore_labels'

In [3]:
from presidio_analyzer import EntityRecognizer, RecognizerResult
from transformers import pipeline
from typing import List

class LargeLanguageModelRecognizer(EntityRecognizer):
    def __init__(self, model_name: str, supported_language: str = "en"):
        supported_entities = ["PERSON", "LOCATION", "ORGANIZATION","US_SNN","EMAIL_ADDRESS"]  # Add other entities as needed
        super().__init__(supported_language=supported_language,supported_entities=supported_entities)
        self.model = pipeline("ner", model=model_name)
        self.name = "Large Language Model Recognizer"
    
    def analyze(self, text: str, entities: List[str] = None, language: str = None) -> List[RecognizerResult]:
        transformer_results = self.model(text)
        presidio_results = []

        for entity in transformer_results:
            if entity['entity'] in self.supported_entities:
                presidio_results.append(RecognizerResult(entity_type=entity['entity'], start=entity['start'], end=entity['end'], score=entity['score']))

        return presidio_results

# Usage
model_name = "/Users/dkim1998/Documents/Mistral_7b_v2"
custom_recognizer = LargeLanguageModelRecognizer(model_name)


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.05s/it]
The model 'MistralForCausalLM' is not supported for ner. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassification', 'GPTBigCodeForTokenClassification', 'GPTNeoForTokenClassification', 'GPTNeoXForTokenClassification', 'IB

In [4]:
from presidio_analyzer import AnalyzerEngine, RecognizerResult
#from custom_recognizer import LargeLanguageModelRecognizer  # Import your custom recognizer

# Initialize your custom recognizer (adjust the model name/path as needed)
#llm_recognizer = LargeLanguageModelRecognizer(model_name="path_or_name_of_mistral_7b")

# Initialize Presidio's AnalyzerEngine with your custom recognizer
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(custom_recognizer)

# Specify the file path and read the file
file_path = "/Users/dkim1998/Downloads/LLM_PII_Project_Code/fake_pii_data_unlabeled_training.txt"
with open(file_path, 'r') as file:
    text_data = file.read()

# Analyze the text data for PII using your custom recognizer
results = analyzer.analyze(text=text_data, language='en')

# Output the results
for result in results:
    print(f"Type: {result.entity_type}, Start: {result.start}, End: {result.end}, Score: {result.score}")


TypeError: LargeLanguageModelRecognizer.analyze() got an unexpected keyword argument 'nlp_artifacts'

In [5]:
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts
from transformers import pipeline
from typing import List

# Define the custom recognizer class
class LargeLanguageModelRecognizer(EntityRecognizer):
    def __init__(self, model_name: str, supported_language: str = "en"):
        super().__init__(supported_entities=["PERSON", "LOCATION", "ORGANIZATION", "US_SSN", "EMAIL_ADDRESS"], supported_language=supported_language)
        self.model = pipeline("ner", model=model_name)

    def analyze(self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None) -> List[RecognizerResult]:
        transformer_results = self.model(text)
        presidio_results = []

        for entity in transformer_results:
            if entity['entity_group'] in self.supported_entities:
                presidio_results.append(
                    RecognizerResult(
                        entity_type=entity['entity_group'], 
                        start=entity['start'], 
                        end=entity['end'], 
                        score=entity['score']
                    )
                )
        return presidio_results

# Initialize your custom recognizer
custom_recognizer = LargeLanguageModelRecognizer(model_name="/Users/dkim1998/Documents/Mistral_7b_v2")

# Initialize Presidio's AnalyzerEngine with your custom recognizer
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(custom_recognizer)

# Specify the file path and read the file
file_path = "/Users/dkim1998/Downloads/LLM_PII_Project_Code/fake_pii_data_unlabeled_training.txt"
with open(file_path, 'r') as file:
    text_data = file.read()

# Analyze the text data for PII using your custom recognizer
results = analyzer.analyze(text=text_data, language='en')

# Output the results
for result in results:
    print(f"Type: {result.entity_type}, Start: {result.start}, End: {result.end}, Score: {result.score}")
##Uses way too much memory

Loading checkpoint shards: 100%|██████████| 3/3 [00:22<00:00,  7.63s/it]
The model 'MistralForCausalLM' is not supported for ner. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassification', 'GPTBigCodeForTokenClassification', 'GPTNeoForTokenClassification', 'GPTNeoXForTokenClassification', 'IB

: 

In [None]:
#Main issue: I'm having trouble setting up LLM in this coding to test it out
##update: It looks like it is possible
##Update 2: It looks like adding in the model to the presidio uses too much memory
#Maybe I'll need to train the model in LoRA then create a new model from it and then I can add that model to this coding combination
#worth a shot