# Educational Data Masking with Delphix: Privacy in Academic Environments

This notebook demonstrates the application of Delphix masking techniques to educational data, with special consideration for:
- FERPA compliance
- COPPA requirements
- State-specific education privacy laws
- Educational research needs
- Cross-institutional data sharing requirements

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import uuid

# Set up our environment.
# Three independent random sources are used in this notebook (Faker, NumPy and
# the stdlib `random` module); each one has to be seeded for the generated
# data to be reproducible from run to run.
fake = Faker()
Faker.seed(42)  # seeds the generator shared by Faker instances
np.random.seed(42)
random.seed(42)

# Constants for data generation
CURRENT_YEAR = 2024
SCHOOL_YEARS = range(9, 13)  # High school grades 9-12 (upper bound is exclusive)
SUBJECTS = ['Math', 'English', 'Science', 'History', 'Foreign Language', 'Art', 'Physical Education']
ETHNICITIES = ['White', 'Hispanic/Latino', 'Black/African American', 'Asian', 'Native American', 'Pacific Islander', 'Two or More Races']
ACCOMMODATIONS = ['Extended Time', 'Separate Setting', 'Read Aloud', 'Use of Calculator', 'None']
PROGRAMS = ['General Education', 'Special Education', 'Gifted/Talented', 'English Language Learner', 'Section 504']
LUNCH_STATUS = ['Full Price', 'Reduced Price', 'Free']
ATTENDANCE_CODES = ['Present', 'Excused Absence', 'Unexcused Absence', 'Tardy', 'Early Dismissal']
# NOTE: in the lists below 'None' is the literal string, not the Python None object.
BEHAVIOR_TYPES = ['Positive Behavior', 'Minor Infraction', 'Major Infraction', 'None']
EXTRACURRICULAR = ['Sports', 'Music', 'Drama', 'Student Government', 'Academic Club', 'None']

In [None]:
def generate_student_base(num_students=10000):
    """Generate base student demographic and enrollment data.

    Each record is synthetic: categorical fields are drawn uniformly from the
    module-level constant lists with the stdlib ``random`` module, and
    personal details (names, contact data, addresses, dates) come from the
    module-level ``fake`` Faker instance.

    Args:
        num_students: Number of student records to generate.

    Returns:
        pandas.DataFrame with one row per student. ``student_id`` is an
        8-character uppercase hex string that is unique within the returned
        frame; ``state_id`` is a 10-digit integer drawn through
        ``fake.unique``, which tracks uniqueness on the shared ``fake``
        object.
    """
    students = []
    issued_ids = set()  # student IDs already handed out in this call

    for _ in range(num_students):
        grade = random.choice(SCHOOL_YEARS)

        # Short UUID as student ID. The UUID is built from the seeded `random`
        # module (uuid.uuid4() draws from the OS and ignores random.seed), so
        # IDs are reproducible whenever `random` is seeded. Only 32 bits
        # survive the truncation, which gives roughly a 1% chance of a
        # duplicate among 10,000 rows -- hence the retry on collision.
        while True:
            student_id = str(uuid.UUID(int=random.getrandbits(128), version=4))[:8].upper()
            if student_id not in issued_ids:
                break
        issued_ids.add(student_id)

        student = {
            'student_id': student_id,
            'state_id': fake.unique.random_number(digits=10, fix_len=True),
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            # Approximate age for grade: e.g. grade 9 -> 14 to 16 years old
            'dob': fake.date_of_birth(minimum_age=grade+5, maximum_age=grade+7),
            'grade_level': grade,
            'enrollment_date': fake.date_between(start_date='-4y', end_date='today'),
            'ethnicity': random.choice(ETHNICITIES),
            'gender': random.choice(['M', 'F', 'NB']),
            'primary_language': random.choice(['English', 'Spanish', 'Chinese', 'Vietnamese', 'Arabic']),
            'program': random.choice(PROGRAMS),
            'lunch_status': random.choice(LUNCH_STATUS),

            # Guardian Information
            'guardian1_name': fake.name(),
            'guardian1_relationship': random.choice(['Mother', 'Father', 'Grandmother', 'Grandfather', 'Guardian']),
            'guardian1_phone': fake.phone_number(),
            'guardian1_email': fake.email(),
            # About 70% of students get a second guardian; the rest are None
            'guardian2_name': fake.name() if random.random() > 0.3 else None,

            # Address Information
            'street_address': fake.street_address(),
            'city': fake.city(),
            'state': fake.state_abbr(),
            'zip_code': fake.zipcode(),

            # Health and Accommodations
            'accommodations': random.choice(ACCOMMODATIONS),
            # About 10% of students are flagged with a medical alert
            'medical_alert': 'Yes' if random.random() < 0.1 else 'No',
            'immunization_status': random.choice(['Complete', 'Incomplete', 'Exempt']),

            # Additional Demographics
            'transportation': random.choice(['Bus', 'Parent Transport', 'Self Transport', 'Walk']),
            'extracurricular': random.choice(EXTRACURRICULAR),
        }
        students.append(student)

    return pd.DataFrame(students)

# Build the synthetic student roster at the generator's default size
student_df = generate_student_base()
print(f"Generated {len(student_df)} student records")
# Final expression of the cell: preview the first five rows
student_df.head(5)