<a href="https://colab.research.google.com/github/AlbertoRamirez1976/Awesome-GPT-Agents/blob/main/MCS680_Capstone_Anonymous_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src = "https://i.postimg.cc/DzShCQsb/a-cartoon-robot-in-a-school-wearing-a-guy-fawkes-m-W0sfvgch-Roe-Nmrlx-Fg-F4v-A-z6o-P9-I5-OR3q-Gkf-I8u-PCr-Q.png">

*a demo by Jérémie Rostan*

https://www.linkedin.com/in/jeremie-rostan/


---

<br>

Sources

[Microsoft Presidio](https://microsoft.github.io/presidio/)

[Langchain integration](https://python.langchain.com/v0.1/docs/guides/productionization/safety/presidio_data_anonymization/qa_privacy_protection/)


In [16]:
# @title AI prompt cell

import ipywidgets as widgets
from IPython.display import display, HTML, Markdown,clear_output
from google.colab import ai

dropdown = widgets.Dropdown(
    options=[],
    layout={'width': 'auto'}
)

def update_model_list(new_options):
    dropdown.options = new_options
update_model_list(ai.list_models())

text_input = widgets.Textarea(
    placeholder='Ask me anything....',
    layout={'width': 'auto', 'height': '100px'},
)

button = widgets.Button(
    description='Submit Text',
    disabled=False,
    tooltip='Click to submit the text',
    icon='check'
)

output_area = widgets.Output(
     layout={'width': 'auto', 'max_height': '300px','overflow_y': 'scroll'}
)

def on_button_clicked(b):
    with output_area:
        output_area.clear_output(wait=False)
        accumulated_content = ""
        for new_chunk in ai.generate_text(prompt=text_input.value, model_name=dropdown.value, stream=True):
            if new_chunk is None:
                continue
            accumulated_content += new_chunk
            clear_output(wait=True)
            display(Markdown(accumulated_content))

button.on_click(on_button_clicked)
vbox = widgets.GridBox([dropdown, text_input, button, output_area])

display(HTML("""
<style>
.widget-dropdown select {
    font-size: 18px;
    font-family: "Arial", sans-serif;
}
.widget-textarea textarea {
    font-size: 18px;
    font-family: "Arial", sans-serif;
}
</style>
"""))
display(vbox)


GridBox(children=(Dropdown(layout=Layout(width='auto'), options=('google/gemini-2.0-flash', 'google/gemini-2.0…

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from google.colab import userdata
userdata.get('OPEN_AI_LAB')

In [19]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [20]:
# Install requirements

!pip install --upgrade --quiet presidio-analyzer presidio-anonymizer Faker langchain_core langchain_community langchain_openai faiss-cpu --no-deps

In [None]:
# Set the local model that will be used to anonymize and de-anonymize your data on your computer

!python -m spacy download en_core_web_lg

import spacy
nlp = spacy.load("en_core_web_lg")

Add `%load_ext cudf.pandas` before importing pandas to speed up operations using GPU

In [31]:
%load_ext cudf.pandas
import pandas as pd
import numpy as np

# Randomly generated dataset of parking violations-
# Define the number of rows
num_rows = 1000000

states = ["NY", "NJ", "CA", "TX"]
violations = ["Double Parking", "Expired Meter", "No Parking",
              "Fire Hydrant", "Bus Stop"]
vehicle_types = ["SUBN", "SDN"]

# Create a date range
start_date = "2022-01-01"
end_date = "2022-12-31"
dates = pd.date_range(start=start_date, end=end_date, freq='D')

# Generate random data
data = {
    "Registration State": np.random.choice(states, size=num_rows),
    "Violation Description": np.random.choice(violations, size=num_rows),
    "Vehicle Body Type": np.random.choice(vehicle_types, size=num_rows),
    "Issue Date": np.random.choice(dates, size=num_rows),
    "Ticket Number": np.random.randint(1000000000, 9999999999, size=num_rows)
}

# Create a DataFrame
df = pd.DataFrame(data)

# Which parking violation is most commonly committed by vehicles from various U.S states?

(df[["Registration State", "Violation Description"]]  # get only these two columns
 .value_counts()  # get the count of offences per state and per type of offence
 .groupby("Registration State")  # group by state
 .head(1)  # get the first row in each group (the type of offence with the largest count)
 .sort_index()  # sort by state name
 .reset_index()
)



Unnamed: 0,Registration State,Violation Description,count
0,CA,Expired Meter,50174
1,NJ,Double Parking,50319
2,NY,Expired Meter,50332
3,TX,Expired Meter,50228


# Step 2 - Input your data and detect personally identifiable information (PII)

In [22]:
# Import the necessary Presidio dependencies
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine, OperatorConfig
from presidio_anonymizer.entities import OperatorResult

# Set the document to process
document_content = """
**Green Valley High School**
**Disciplinary Incident Report**
**Date:** July 15, 2024
**Incident Number:** GVHS-2024-072
**Location:** Cafeteria
---
### **Incident Report**
On July 14, 2024, at approximately 12:30 PM, an altercation occurred in the cafeteria between John Doe and Sarah Smith.
John Doe, an 11th-grade student with Student ID 123456, was born on March 3, 2006.
He resides at 789 Oak Street, Green Valley, CA 90210, with his parent, Jane Doe, who can be reached at (555) 123-4567.
Sarah Smith, a 10th-grade student with Student ID 654321, was born on April 14, 2007, and lives at 456 Pine Avenue, Green Valley, CA 90210.
Her parent, John Smith, can be contacted at (555) 987-6543.
The incident began when John Doe accidentally spilled his drink on Sarah Smith.
This led to a verbal argument between the two students, which quickly escalated to physical aggression, including pushing and shoving.
The confrontation was witnessed by two other students, Emily Brown and Michael Johnson.
Emily Brown, an 11th grader, provided her account of the incident and can be reached at (555) 321-7654.
Michael Johnson, a 10th grader, also witnessed the event and his contact number is (555) 432-1765.
School staff intervened promptly to separate John and Sarah and de-escalate the situation.
Both students were then escorted to the principal’s office for further questioning.
Parents of both students were notified immediately following the incident.
After reviewing the circumstances, it was determined that John Doe would receive a one-day suspension for initiating the physical contact.
Similarly, Sarah Smith received a one-day suspension for retaliating physically.
To address the incident and prevent future occurrences, both students are required to attend a conflict resolution workshop
scheduled for July 21, 2024. Additionally, a meeting with the parents of both students is scheduled for July 18, 2024,
to discuss the incident and preventive measures.
This report was prepared by Ms. Laura Stevens, Vice Principal, and reviewed by Mr. James Anderson, Principal.
Contact: principal.office@greenvalleyhighschool.com
"""

# Set the registry to identify PII using predefined recognizers
registry = RecognizerRegistry()
registry.load_predefined_recognizers()

# Call analyzer to get results
analyzer = AnalyzerEngine(registry=registry)
analysis_results = analyzer.analyze(text=document_content,
                                    entities=["PHONE_NUMBER", "EMAIL_ADDRESS", "PERSON", "LOCATION", "DATE_TIME"],
                                    language='en')





# Step 3 - Check and improve PII detection

In [23]:
# Check identified PII
def highlight_pii(text, results):
    results = sorted(results, key=lambda r: r.start, reverse=True)
    for result in results:
        start = result.start
        end = result.end
        entity_text = text[start:end]
        highlighted_entity = f"\033[1m{entity_text}\033[0m"
        text = text[:start] + highlighted_entity + text[end:]
    return text

highlighted_text = highlight_pii(document_content, analysis_results)
print(highlighted_text)


**Green Valley High School**
**Disciplinary Incident Report**
**Date:** [1mJuly 15, 2024[0m
**Incident Number:** GVHS-2024-072
**Location:** Cafeteria
---
### **Incident Report**
On [1mJuly 14, 2024[0m, at [1mapproximately 12:30 PM[0m, an altercation occurred in the cafeteria between [1mJohn Doe[0m and [1mSarah Smith[0m.
[1mJohn Doe[0m, an 11th-grade student with Student ID 123456, was born on [1mMarch 3, 2006[0m.
He resides at [1m789 Oak Street[0m, [1mGreen Valley[0m, [1mCA 90210[0m, with his parent, [1mJane Doe[0m, who can be reached at [1m(555) 123-4567[0m.
[1mSarah Smith[0m, a 10th-grade student with Student ID 654321, was born on [1mApril 14, 2007[0m, and lives at [1m456 Pine Avenue[0m, [1mGreen Valley[0m, [1mCA 90210[0m.
Her parent, [1mJohn Smith[0m, can be contacted at [1m(555) 987-6543[0m.
The incident began when [1mJohn Doe[0m accidentally spilled his drink on [1mSarah Smith[0m.
This led to a verbal argument between the two students, 

In [24]:
# Define custom patterns to identify specific PII types not covered by built-in recognizers
student_id_pattern = Pattern(
    name="student_id_pattern",
    regex=r"\bStudent ID \d{6}\b",
    score=0.9
)

address_pattern = Pattern(
    name="address_pattern",
    regex=r"\b\d+\s[A-Z][a-zA-Z]*\s(?:Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl|Square|Sq|Loop|Lp|Trail|Trl|Parkway|Pkwy|Commons|Cmns|Terrace|Ter|Circle|Cir|Way)\b",
    score=0.9
)

school_name_pattern = Pattern(
    name="school_name_pattern",
    regex=r"\b(?:[A-Z][a-z]+(?:\s|$)){1,3}(?:High School|Elementary School|University|College|Academy|Institute)(?:\s[A-Z][a-z]+)*\b",
    score=0.9
)

# Create pattern recognizers for these custom PII types
student_id_recognizer = PatternRecognizer(supported_entity="STUDENT_ID", patterns=[student_id_pattern])
address_recognizer = PatternRecognizer(supported_entity="ADDRESS", patterns=[address_pattern])
school_name_recognizer = PatternRecognizer(supported_entity="SCHOOL_NAME", patterns=[school_name_pattern])

# Add the custom recognizers to the registry
registry.add_recognizer(student_id_recognizer)
registry.add_recognizer(address_recognizer)
registry.add_recognizer(school_name_recognizer)

# Call the analyzer with the updted registry
analyzer = AnalyzerEngine(registry=registry)
analysis_results = analyzer.analyze(text=document_content,
                                    entities=["PHONE_NUMBER", "EMAIL_ADDRESS", "PERSON", "LOCATION", "DATE_TIME", "STUDENT_ID", "ADDRESS", "SCHOOL_NAME"],
                                    language='en')


In [25]:
# Check identified PII again
def highlight_pii(text, results):
    results = sorted(results, key=lambda r: r.start, reverse=True)
    for result in results:
        start = result.start
        end = result.end
        entity_text = text[start:end]
        highlighted_entity = f"\033[1m{entity_text}\033[0m"
        text = text[:start] + highlighted_entity + text[end:]
    return text

highlighted_text = highlight_pii(document_content, analysis_results)
print(highlighted_text)


**[1mGreen Valley High School[0m**
**Disciplinary Incident Report**
**Date:** [1mJuly 15, 2024[0m
**Incident Number:** GVHS-2024-072
**Location:** Cafeteria
---
### **Incident Report**
On [1mJuly 14, 2024[0m, at [1mapproximately 12:30 PM[0m, an altercation occurred in the cafeteria between [1mJohn Doe[0m and [1mSarah Smith[0m.
[1mJohn Doe[0m, an 11th-grade student with [1mStudent ID 123456[0m, was born on [1mMarch 3, 2006[0m.
He resides at [1m[1m789 Oak St[0mreet[0m, [1mGreen Valley[0m, [1mCA 90210[0m, with his parent, [1mJane Doe[0m, who can be reached at [1m(555) 123-4567[0m.
[1mSarah Smith[0m, a 10th-grade student with [1mStudent ID 654321[0m, was born on [1mApril 14, 2007[0m, and lives at [1m[1m456 Pine Av[0menue[0m, [1mGreen Valley[0m, [1mCA 90210[0m.
Her parent, [1mJohn Smith[0m, can be contacted at [1m(555) 987-6543[0m.
The incident began when [1mJohn Doe[0m accidentally spilled his drink on [1mSarah Smith[0m.
This led to a ve

# Step 4 - Replace PII with fake data



In [26]:
# Map PII entities to fake values
from faker import Faker
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerRegistry

fake = Faker()
fake_map = {}
reverse_map = {}

def generate_fake(entity_type, original_value):
    if original_value not in fake_map:
        if entity_type == "PERSON":
            fake_map[original_value] = fake.name()
        elif entity_type == "PHONE_NUMBER":
            fake_map[original_value] = fake.phone_number()
        elif entity_type == "ADDRESS":
            fake_map[original_value] = fake.address().replace("\n", ", ")
        elif entity_type == "DATE_TIME":
            fake_map[original_value] = fake.date()
        elif entity_type == "STUDENT_ID":
            fake_map[original_value] = f"Student ID {fake.random_number(digits=6, fix_len=True)}"
        elif entity_type == "LOCATION":
            fake_map[original_value] = fake.city()
        elif entity_type == "EMAIL_ADDRESS":
            fake_map[original_value] = fake.email()
        elif entity_type == "SCHOOL_NAME":
            fake_map[original_value] = fake.company()
        reverse_map[fake_map[original_value]] = original_value
    return fake_map[original_value]

# Replace PII with fake values
def replace_with_fakes(text, results):
    results = sorted(results, key=lambda r: r.start, reverse=True)
    for result in results:
        original_value = text[result.start:result.end]
        fake_value = generate_fake(result.entity_type, original_value)
        text = text[:result.start] + fake_value + text[result.end:]
    return text

# Ensure all PII entities are identified and replaced
anonymized_text_with_fakes = replace_with_fakes(document_content, analysis_results)

print("Anonymized Text with Fakes:")
print(anonymized_text_with_fakes)

Anonymized Text with Fakes:

**Ruiz, Cannon and Miller**
**Disciplinary Incident Report**
**Date:** 1985-02-13
**Incident Number:** GVHS-2024-072
**Location:** Cafeteria
---
### **Incident Report**
On 1987-04-25, at 2005-01-19, an altercation occurred in the cafeteria between Jerome Nunez and Jordan Murray.
Jerome Nunez, an 11th-grade student with Student ID 151156, was born on 1988-12-02.
He resides at Strongtonnch Suite 053, East Elizabethborough, WY 11421, Jacksonville, 2002-01-18, with his parent, Jeffrey Ray DDS, who can be reached at 3366343537.
Jordan Murray, a 10th-grade student with Student ID 408932, was born on 1978-11-01, and lives at South SamanthaApt. 415, East Angelaton, FM 75742, Jacksonville, 2002-01-18.
Her parent, Kelly Thornton, can be contacted at 001-703-453-3796.
The incident began when Jerome Nunez accidentally spilled his drink on Jordan Murray.
This led to a verbal argument between the two students, which quickly escalated to physical aggression, including pus

# Step 5 - Chat with you anonymized data

In [28]:
# Import dependencies for a RAG chain
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter
from google.colab import userdata
import os

# Load and anonymize the data
documents = [Document(page_content=document_content)]
for doc in documents:
    doc.page_content = replace_with_fakes(doc.page_content, analysis_results)

# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

# Set OpenAI API key
try:
    os.environ["OPENAI_API_KEY"] = userdata.get('OPEN_AI_LAB')
except userdata.SecretNotFoundError:
    print("Secret OPENAI_API_KEY not found. Please add it to Colab secrets.")
    # Exit or handle the error appropriately, e.g., raise an exception
    raise

# Index the chunks (using OpenAI embeddings, since the data is anonymized)
embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_documents(chunks, embeddings)
retriever = docsearch.as_retriever()

# Create the RAG chain
template = """Answer the question based only on the following context: {context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI(model="gpt-4o", temperature=0.1)

# Define the processing chain
_inputs = RunnableParallel(
    question=RunnablePassthrough(),
    context=retriever
)

rag_chain = (
    _inputs
    | {
        "context": itemgetter("context"),
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | StrOutputParser()
)

In [29]:
# Example question
query = "Summarize the incident"
response = rag_chain.invoke(query)
print(response)


The incident involved a physical altercation between two students, Jerome Nunez and Jordan Murray, in the cafeteria. It began when Jerome accidentally spilled his drink on Jordan, leading to a verbal argument that escalated into physical aggression, including pushing and shoving. The confrontation was witnessed by two other students, Richard Gray and Todd Franklin. School staff intervened promptly to separate the students and de-escalate the situation. Both students were escorted to the principal’s office, and their parents were notified immediately. As a result, Jerome Nunez received a suspension for initiating the physical contact, and Jordan Murray was suspended for retaliating. Both students are required to attend a conflict resolution workshop to address the incident and prevent future occurrences.


In [30]:
# De-anonymize the response
# Reminder: nlp = "en_core_web_lg"
def custom_deanonymize(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.text in reverse_map:
            text = text.replace(ent.text, reverse_map[ent.text])
    return text

# Define the processing chain with de-anonymization
chain_with_deanonymization = rag_chain | RunnableLambda(custom_deanonymize)

# Example question to test the chain
response = chain_with_deanonymization.invoke(query)
print(response)

The incident occurred in the cafeteria on July 14, 2024 between John Doe, an 11th grader, and Sarah Smith, a 10th grader. It began when Jerome accidentally spilled his drink on Jordan, leading to a verbal argument that escalated into physical aggression, including pushing and shoving. The confrontation was witnessed by two other students, Richard Gray and Michael Johnson. School staff intervened promptly to separate the students and de-escalate the situation. Both students were escorted to the principal’s office, and their parents were notified immediately. John Doe received a suspension for initiating the physical contact, and Sarah Smith was suspended for retaliating. Both students are required to attend a conflict resolution workshop to prevent future occurrences.
