In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Read the breach report CSV into a DataFrame and show it as the cell output
BREACH_REPORT_CSV = "breach_report (1).csv"
df = pd.read_csv(BREACH_REPORT_CSV)
df

Unnamed: 0,Name of Covered Entity,State,Covered Entity Type,Individuals Affected,Breach Submission Date,Type of Breach,Location of Breached Information,Business Associate Present,Web Description
0,William F Rinehart DMD PA,SC,Healthcare Provider,25000,03/24/2025,Hacking/IT Incident,Network Server,No,
1,Meigs County Emergency Medical Services,OH,Healthcare Provider,5802,03/20/2025,Hacking/IT Incident,Email,No,
2,Presbyterian Health Plan,NM,Health Plan,7100,03/18/2025,Hacking/IT Incident,Email,No,
3,Baylor Scott & White Texas Spine & Joint Hospital,TX,Healthcare Provider,1640,03/14/2025,Hacking/IT Incident,Email,No,
4,Lake Psychological Services,PA,Healthcare Provider,987,03/14/2025,Hacking/IT Incident,Network Server,No,
...,...,...,...,...,...,...,...,...,...
11259,"Mark D. Lurie, MD",CA,Healthcare Provider,5166,11/20/2009,Theft,Desktop Computer,No,A shared Computer that was used for backup was...
11260,Health Services for Children with Special Need...,DC,Health Plan,3800,11/17/2009,Loss,Laptop,No,A laptop was lost by an employee while in tran...
11261,Alaska Department of Health and Social Services,AK,Healthcare Provider,501,10/30/2009,Theft,"Other, Other Portable Electronic Device",No,The Alaska Department of Health and Social Ser...
11262,"Mid America Kidney Stone Association, LLC",MO,Healthcare Provider,1000,10/28/2009,Theft,Network Server,No,Five desktop computers containing unencrypted ...


In [None]:
# Clean and feature-engineer the breach report loaded into `df`:
#   1. normalise column names to snake_case
#   2. fill missing values
#   3. parse the submission date and derive year / month
#   4. bucket breaches into four severity quartiles
#   5. label-encode the categorical columns (one encoder per column)

# Clean Column Names
# NOTE(review): the CSV's first column ("Unnamed: 0") becomes "unnamed:_0" here;
# it looks like a saved index - consider index_col=0 at load time.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Handle Missing Values
# Assign the filled Series back rather than calling fillna(inplace=True) on
# df[col]: the in-place form acts on an intermediate object, is deprecated in
# pandas 2.x, and leaves df unchanged once Copy-on-Write is enabled.
placeholder_by_column = {
    'state': "Unknown",
    'covered_entity_type': "Unknown",
    'type_of_breach': "Unknown",
    # This column is label-encoded below, so it gets the same placeholder as
    # the other categorical columns instead of reaching the encoder as NaN.
    'location_of_breached_information': "Unknown",
    'web_description': "No description provided",
}
for column, placeholder in placeholder_by_column.items():
    df[column] = df[column].fillna(placeholder)

# Non-numeric counts are coerced to NaN and then replaced by the column median.
df['individuals_affected'] = pd.to_numeric(df['individuals_affected'], errors='coerce')
df['individuals_affected'] = df['individuals_affected'].fillna(
    df['individuals_affected'].median()
)

# Parse Dates
# The dates in the file are written MM/DD/YYYY; an explicit format avoids
# relying on format inference. Values that do not match become NaT.
df['breach_submission_date'] = pd.to_datetime(
    df['breach_submission_date'], format="%m/%d/%Y", errors='coerce'
)
df['breach_year'] = df['breach_submission_date'].dt.year
df['breach_month'] = df['breach_submission_date'].dt.month

# Create Severity Buckets
# Quartiles of individuals_affected, labelled from smallest to largest.
df['severity_level'] = pd.qcut(
    df['individuals_affected'],
    q=4,
    labels=['Low', 'Moderate', 'High', 'Critical'],
)

# Label Encode Categorical Features
# Keep one fitted encoder per column so every mapping stays available for
# inverse_transform; refitting a single encoder discards the earlier mappings.
encoders = {}
for column in (
    'type_of_breach',
    'location_of_breached_information',
    'covered_entity_type',
):
    encoders[column] = LabelEncoder()
    df[f'{column}_encoded'] = encoders[column].fit_transform(df[column])

# Backward compatibility: `le` still refers to the encoder fitted last
# (covered_entity_type), as it did when a single encoder was reused.
le = encoders['covered_entity_type']