In [1]:
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Directory prefix for the spreadsheets this notebook reads and writes,
# and the file name of the raw workbook loaded first.
PATH, file_name = '../data/', '2005-2022 viral uveitis_220624.xlsx'

In [2]:
# Load the raw workbook into a DataFrame, then report its (rows, columns) shape.
df = pd.read_excel(f'{PATH}{file_name}')
print(df.shape)


(10000, 350)


In [16]:
# Laboratory result columns shared by both selections below.
_lab_columns = [
    'CMV IgM[Serum]',
    'CMV IgG[Serum]',
    'HSV IgM[Serum]',
    'HSV IgG[Serum]',
    'VZV IgM[Serum]',
    'VZV IgG[Serum]',
    'WBC COUNT[Whole blood]',
    'Lymphocyte(#)[Whole blood]',
    'Lymphocyte(%)[Whole blood]',
    'Monocyte(#)[Whole blood]',
    'Monocyte(%)[Whole blood]',
    'Neutrophil(#)[Whole blood]',
    'Neutrophil(%)[Whole blood]',
    'ESR[Whole blood]',
    'CRP[Serum]',
]

# Columns kept for the initial extract. The diagnosis date ('진단일자') is
# included because it is needed for preprocessing.
init_columns = ['연구등록번호', 'Diagnosis', '진단일자', 'Gender', '진단시점나이'] + _lab_columns

# 'Diagnosis' followed by the laboratory result columns.
feature_column = ['Diagnosis'] + _lab_columns

# Number of selected columns, then number of columns in the raw frame.
print(len(init_columns), len(df.columns), sep='\n')

df_init = df[init_columns]

20
350


In [4]:
# Order rows by registration number ('연구등록번호'), then by diagnosis date
# ('진단일자') within each registration number.
# NOTE(review): df_init.info() reports '진단일자' as object dtype, so this is not
# necessarily a chronological sort — confirm the stored date format.
df_init = df_init.sort_values(
    by=['연구등록번호', '진단일자'],
)

# Write the sorted extract to an Excel workbook, omitting the index column.
df_init.to_excel(f'{PATH}init_data.xlsx', index=False)

# Information

In [5]:
# Summary of columns: dtype and non-null count for each.
df_init.info()

# Number of rows for each value of 'Diagnosis'.
diagnosis_counts = df_init['Diagnosis'].value_counts()
print(diagnosis_counts)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 9391 to 873
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   연구등록번호                      10000 non-null  int64  
 1   Diagnosis                   10000 non-null  int64  
 2   진단일자                        10000 non-null  object 
 3   Gender                      10000 non-null  int64  
 4   진단시점나이                      10000 non-null  int64  
 5   CMV IgM[Serum]              866 non-null    float64
 6   CMV IgG[Serum]              839 non-null    float64
 7   HSV IgM[Serum]              893 non-null    float64
 8   HSV IgG[Serum]              0 non-null      float64
 9   VZV IgM[Serum]              775 non-null    object 
 10  VZV IgG[Serum]              775 non-null    float64
 11  WBC COUNT[Whole blood]      9444 non-null   float64
 12  Lymphocyte(#)[Whole blood]  9075 non-null   float64
 13  Lymphocyte(%)[Whole blood]  90

In [17]:
# Inspect how well each column is populated within each diagnosis group.
# feature_column must contain 'Diagnosis' at this point, since it is the
# grouping key below.
df_cluster = df[feature_column]

# count() reports the number of non-null values per column for each group.
non_null_by_diagnosis = df_cluster.groupby('Diagnosis').count()
print(non_null_by_diagnosis)


           CMV IgM[Serum]  CMV IgG[Serum]  HSV IgM[Serum]  HSV IgG[Serum]  \
Diagnosis                                                                   
0                     796             766             819               0   
1                      27              28              31               0   
2                      43              45              43               0   

           VZV IgM[Serum]  VZV IgG[Serum]  WBC COUNT[Whole blood]  \
Diagnosis                                                           
0                     715             707                    8359   
1                      32              32                     150   
2                      28              36                     935   

           Lymphocyte(#)[Whole blood]  Lymphocyte(%)[Whole blood]  \
Diagnosis                                                           
0                                8028                        8028   
1                                 138                        

## 데이터 확인

* '연구등록번호'는 동일하나 'Diagnosis' 가 다른 케이스 확인

In [74]:
# Find registration numbers ('연구등록번호') whose rows do not all carry the
# same 'Diagnosis' value.
df_check = df_init.groupby('연구등록번호')

# Number of distinct 'Diagnosis' values per registration number. Every row of
# a group contributes, so a disagreement anywhere in the group is detected
# (a pairwise scan that stops after the first comparison would only ever look
# at the first two rows of each group).
diagnosis_variety = df_check['Diagnosis'].nunique()

# Registration numbers with more than one distinct 'Diagnosis', in group-key
# order.
flag_list = diagnosis_variety[diagnosis_variety > 1].index.tolist()



In [75]:
# Show how many registration numbers were flagged, then the numbers themselves.
print(len(flag_list), flag_list, sep='\n')


34
[1650597, 1651044, 1787320, 2255608, 2580939, 2928404, 2999386, 3020526, 3071154, 3322426, 3380351, 3593450, 3810313, 3888182, 4024186, 4074938, 4117045, 4127695, 4129131, 4297907, 4320777, 4448924, 4515988, 4532082, 4567104, 4660852, 4776068, 4876243, 4940693, 5939457, 8769082, 9470306, 9824777, 10404588]


* 휴먼 어노테이트 후 
* 검사관련 feature 가 다 비어 있는 row 제거

In [79]:
# Load the workbook saved after human annotation.
df_snd = pd.read_excel(PATH + 'snd_data.xlsx')

# info() writes its summary to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df_snd.info()

# Number of rows for each value of 'Diagnosis'.
print(df_snd['Diagnosis'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9852 entries, 0 to 9851
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   연구등록번호                      9852 non-null   int64  
 1   Diagnosis                   9852 non-null   int64  
 2   진단일자                        9852 non-null   object 
 3   Gender                      9852 non-null   int64  
 4   진단시점나이                      9852 non-null   int64  
 5   CMV IgM[Serum]              858 non-null    float64
 6   CMV IgG[Serum]              831 non-null    float64
 7   HSV IgM[Serum]              885 non-null    float64
 8   HSV IgG[Serum]              0 non-null      float64
 9   VZV IgM[Serum]              768 non-null    object 
 10  VZV IgG[Serum]              769 non-null    float64
 11  WBC COUNT[Whole blood]      9359 non-null   float64
 12  Lymphocyte(#)[Whole blood]  9006 non-null   float64
 13  Lymphocyte(%)[Whole blood]  9006 

In [80]:
# Laboratory result columns only ('Diagnosis' is not part of this list).
feature_column = [
    'CMV IgM[Serum]',
    'CMV IgG[Serum]',
    'HSV IgM[Serum]',
    'HSV IgG[Serum]',
    'VZV IgM[Serum]',
    'VZV IgG[Serum]',
    'WBC COUNT[Whole blood]',
    'Lymphocyte(#)[Whole blood]',
    'Lymphocyte(%)[Whole blood]',
    'Monocyte(#)[Whole blood]',
    'Monocyte(%)[Whole blood]',
    'Neutrophil(#)[Whole blood]',
    'Neutrophil(%)[Whole blood]',
    'ESR[Whole blood]',
    'CRP[Serum]',
]


In [82]:
# Remove rows in which every column listed in feature_column is missing.
# The result is rebound to df_snd rather than mutating the frame in place, so
# the frame object shown by earlier cells is left untouched.
df_snd = df_snd.dropna(subset=feature_column, how="all")

# info() prints its own summary and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
df_snd.info()

# Number of rows for each value of 'Diagnosis' after the filter.
print(df_snd['Diagnosis'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9566 entries, 0 to 9851
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   연구등록번호                      9566 non-null   int64  
 1   Diagnosis                   9566 non-null   int64  
 2   진단일자                        9566 non-null   object 
 3   Gender                      9566 non-null   int64  
 4   진단시점나이                      9566 non-null   int64  
 5   CMV IgM[Serum]              858 non-null    float64
 6   CMV IgG[Serum]              831 non-null    float64
 7   HSV IgM[Serum]              885 non-null    float64
 8   HSV IgG[Serum]              0 non-null      float64
 9   VZV IgM[Serum]              768 non-null    object 
 10  VZV IgG[Serum]              769 non-null    float64
 11  WBC COUNT[Whole blood]      9359 non-null   float64
 12  Lymphocyte(#)[Whole blood]  9006 non-null   float64
 13  Lymphocyte(%)[Whole blood]  9006 

In [84]:
# Same laboratory columns as before, with 'WBC COUNT[Whole blood]' left out.
feature_column2 = ['CMV IgM[Serum]','CMV IgG[Serum]',
'HSV IgM[Serum]','HSV IgG[Serum]','VZV IgM[Serum]','VZV IgG[Serum]','Lymphocyte(#)[Whole blood]',
'Lymphocyte(%)[Whole blood]','Monocyte(#)[Whole blood]','Monocyte(%)[Whole blood]','Neutrophil(#)[Whole blood]',
'Neutrophil(%)[Whole blood]','ESR[Whole blood]','CRP[Serum]']

# Remove rows in which every column listed in feature_column2 is missing.
# The result is rebound to df_snd rather than mutating the frame in place.
df_snd = df_snd.dropna(subset=feature_column2, how="all")

# info() prints its own summary and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
df_snd.info()

# Number of rows for each value of 'Diagnosis' after the filter.
print(df_snd['Diagnosis'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9204 entries, 0 to 9851
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   연구등록번호                      9204 non-null   int64  
 1   Diagnosis                   9204 non-null   int64  
 2   진단일자                        9204 non-null   object 
 3   Gender                      9204 non-null   int64  
 4   진단시점나이                      9204 non-null   int64  
 5   CMV IgM[Serum]              858 non-null    float64
 6   CMV IgG[Serum]              831 non-null    float64
 7   HSV IgM[Serum]              885 non-null    float64
 8   HSV IgG[Serum]              0 non-null      float64
 9   VZV IgM[Serum]              768 non-null    object 
 10  VZV IgG[Serum]              769 non-null    float64
 11  WBC COUNT[Whole blood]      8997 non-null   float64
 12  Lymphocyte(#)[Whole blood]  9006 non-null   float64
 13  Lymphocyte(%)[Whole blood]  9006 

In [86]:
# Write the filtered frame to an Excel workbook, omitting the index column.
df_snd.to_excel(f'{PATH}third_data.xlsx', index=False)