# Remove PII

This notebook processes the original data downloaded from the experiment host and removes the personally identifiable information. The original data is not included in this repo.
The processing includes:
1. Remove the student name
2. Mask the student ID with a random UUID

In [1]:
import pandas as pd

# Directory holding the original (un-anonymized) CSV exports.
# Point ROOT at wherever the original data is stored.
ROOT = "../original_data"

# Load the three source tables in one pass; each path is read with the
# default pd.read_csv settings.
trial_df, describe_df, survey_df = map(
    pd.read_csv,
    (f"{ROOT}/trial.csv", f"{ROOT}/describe.csv", f"{ROOT}/survey.csv"),
)

In [2]:
# Collect every distinct "uid" value that appears in any of the three tables.
student_ids = set().union(
    *(frame["uid"].unique() for frame in (trial_df, describe_df, survey_df))
)

In [3]:
# Build a lookup from each original student id to a freshly generated random
# UUID (stored as a string). The UUIDs are regenerated on every run.

import uuid

student_id_to_uuid = dict(
    (original_id, str(uuid.uuid4())) for original_id in student_ids
)

In [4]:
# Apply the mapping to the dataframes and remove the student name.
# The frames are mutated in place on purpose: later cells read these same
# objects by name, and rebinding the loop variable would not update them.

for df in (trial_df, describe_df, survey_df):
    masked_uid = df["uid"].map(student_id_to_uuid)

    # Series.map yields NaN for any value that is missing from the mapping.
    # That is exactly what happens if this cell is run a second time (the
    # column already holds UUIDs), which would silently blank out every uid.
    # Detect that before touching the frame and fail loudly instead.
    unmapped = masked_uid.isna() & df["uid"].notna()
    if unmapped.any():
        raise ValueError(
            f"{int(unmapped.sum())} uid value(s) have no entry in "
            "student_id_to_uuid; was this cell already run? "
            "Reload the original data and run all cells in order."
        )

    df["uid"] = masked_uid
    if "name" in df.columns:
        df.drop(columns="name", inplace=True)

In [5]:
import os

# Save the anonymized dataframes to the data folder
OUT_FOLDER = "../data"

# makedirs with exist_ok=True creates any missing parent directories and is a
# no-op when the folder already exists, so there is no separate
# check-then-create step that could race or fail on a missing parent.
os.makedirs(OUT_FOLDER, exist_ok=True)

# index=False keeps the pandas row index out of the published files.
trial_df.to_csv(f"{OUT_FOLDER}/trial.csv", index=False)
describe_df.to_csv(f"{OUT_FOLDER}/describe.csv", index=False)
survey_df.to_csv(f"{OUT_FOLDER}/survey.csv", index=False)