# Anonymize the patient information.
- Anonymize the clinical information stored in Excel spreadsheets;
- Anonymize the patient information in the DICOM or NIfTI imaging data.

In [None]:
import os
from sklearn.utils import shuffle
import pandas as pd
import pinyin

In [None]:
"""
Determine whether a char is Chinese.
"""
def is_chinese(char):
    """Report whether ``char`` falls between U+4E00 and U+9FFF (inclusive).

    The check is a plain string comparison against those two bounds, so the
    intended input is a single character. A longer string is ordered
    lexicographically against the bounds; it is not inspected character by
    character.

    Args:
        char: The string to classify.

    Returns:
        bool: True when ``char`` sorts within the two bounds, else False.
    """
    lower_bound, upper_bound = '\u4e00', '\u9fff'
    return bool(lower_bound <= char <= upper_bound)
    
"""
Convert a Chinese character/pinyin to Pinyin initials.
"""
def convert_Chinese_to_PinyinInitials(x):
    """Reduce a name to its upper-case initials.

    ``x`` is converted with ``str()`` and its whitespace is normalised:
    leading/trailing whitespace is removed and inner runs collapse to a
    single space. If ``is_chinese`` accepts the normalised text, the initials
    are taken from ``pinyin.get_initial(text, delimiter="")``; otherwise the
    first character of every whitespace-separated word is used.

    Args:
        x: The name to abbreviate. Any value is accepted because it is
            stringified first.

    Returns:
        str: The upper-cased initials, or ``""`` when the text is empty or
        consists only of whitespace.
    """
    # split() with no argument drops leading/trailing whitespace and never
    # yields empty words, so ``word[0]`` below cannot raise IndexError on
    # irregular spacing (e.g. " John", "a   b" or " ").
    # NOTE(review): missing values such as None are stringified too
    # ("None" -> "N"); confirm that this is acceptable for the data.
    words = str(x).split()
    if not words:
        return ""

    text = " ".join(words)
    # NOTE(review): the whole string is handed to is_chinese, so the decision
    # is not made per character; mixed Chinese/Latin names may need review.
    if is_chinese(text):
        return pinyin.get_initial(text, delimiter="").upper()
    return "".join(word[0].upper() for word in words)

    
    

In [None]:
"""
Anonymize the patient ID and patient name.
"""
def create_anonymous_table(excel_path, anonymize_id_column, anonymize_name_column, save_base_path,
                           id_prefix="Gliomas_", random_state=0):
    """Anonymize the patient ID (and optionally the patient name) of an Excel table.

    The workbook is read with its first column as the index, the rows are
    shuffled, and every row receives a sequential anonymous ID of the form
    ``<id_prefix>00001``. Two workbooks are written to ``save_base_path``:

    * ``anonymous_table.xlsx`` - the key table mapping original to anonymized
      values (ID, and name when a name column is given);
    * ``<input file name>_anonymized.xlsx`` - the full data with the original
      ID/name columns removed.

    Args:
        excel_path: Path of the Excel workbook to anonymize.
        anonymize_id_column: Name of the patient-ID column. It may be the
            first spreadsheet column (read as the index), because the index
            is restored as a column before the key table is built.
        anonymize_name_column: Name of the patient-name column, or an empty
            value (``[]``, ``""`` or ``None``) to skip name anonymization.
            It must be a regular (non-index) column.
        save_base_path: Directory that receives both output workbooks.
        id_prefix: Text placed in front of the zero-padded running number.
        random_state: Seed forwarded to ``sklearn.utils.shuffle``; a fixed
            value makes the ID assignment reproducible between runs.

    Returns:
        None. The results are written to disk and summarised on stdout.
    """
    # Read the data; the first column becomes the index.
    Data = pd.read_excel(excel_path, index_col=0, header=0)
    num_samples, num_columns = Data.shape
    print("- Number of patients: {}. \n - Number of Columns: {}.\n".format(num_samples, num_columns))
    print("\n -----------------Original Data----------------\n {}.".format(Data.head(10)))

    # Shuffle the patients so the running number does not follow the
    # original row order.
    Anonymous_Data = shuffle(Data, random_state=random_state)
    print("\n Shuffle the patients randomly.......")

    # Anonymize the patient id column.
    anonymized_id_column_ = anonymize_id_column + "_anonymized"
    Anonymous_Data[anonymized_id_column_] = [
        id_prefix + str(i).zfill(5) for i in range(1, num_samples + 1)
    ]
    print("\n Anonymize the patient id successfully.......")

    # Columns of the key table, and the identifying columns that must be
    # removed from the anonymized data set.
    key_columns = [anonymize_id_column, anonymized_id_column_]
    identifying_columns = [anonymize_id_column]

    # Anonymize the patient name (optional). Only when a name column is given
    # do its original/anonymized columns take part in the later steps;
    # otherwise they are left out instead of referencing undefined names.
    if anonymize_name_column:
        anonymized_name_column_ = anonymize_name_column + "_anonymized"
        Anonymous_Data[anonymized_name_column_] = Anonymous_Data[anonymize_name_column].apply(
            convert_Chinese_to_PinyinInitials
        )
        key_columns += [anonymize_name_column, anonymized_name_column_]
        identifying_columns.append(anonymize_name_column)
        print("\n Anonymize the patient name successfully.......")

    # Save the key table (original value <-> anonymized value) to Excel.
    save_anonymous_table_path = os.path.join(save_base_path, 'anonymous_table.xlsx')
    # Restore the index as a regular column so an ID stored there can be selected.
    Anonymous_Data = Anonymous_Data.reset_index()
    Anonymous_table = Anonymous_Data[key_columns]
    Anonymous_table.to_excel(save_anonymous_table_path)
    print("\n ------------Anonymous table--------------\n {}.".format(Anonymous_table.head(10)))
    print('\n Anonymous table has been saved sucessfully to :', save_anonymous_table_path)

    # Drop the patient information which is not anonymized.
    Anonymous_Data = Anonymous_Data.drop(identifying_columns, axis=1)

    # Save the anonymized data; splitext strips the extension whatever its
    # length (".xlsx", ".xls", ...).
    input_stem = os.path.splitext(os.path.basename(excel_path))[0]
    save_anonymous_data_path = os.path.join(save_base_path, input_stem + '_anonymized.xlsx')
    Anonymous_Data.to_excel(save_anonymous_data_path)
    print("\n ------------Anonymized Data--------------\n {}.".format(Anonymous_Data.head(10)))
    print('\n Anonymized data has been saved sucessfully to :', save_anonymous_data_path)

### Main

In [None]:
# Folder that receives the output workbooks, and the workbook to anonymize.
save_base_path = "G://metaData"
excel_path = "G://metaData/All_patientID.xlsx"

# Columns holding the identifying values that get an anonymized counterpart.
anonymize_name_column = "PatientName"
anonymize_id_column = "PatientID"

# NOTE(review): drop_columns is not passed to the call below; it is kept in
# case other cells rely on it - confirm and remove if unused.
drop_columns = ["PatientName"]

create_anonymous_table(
    excel_path=excel_path,
    anonymize_id_column=anonymize_id_column,
    anonymize_name_column=anonymize_name_column,
    save_base_path=save_base_path,
)