In [18]:
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

PATH = '../data/preprocess/'
PATH_learn = '../data/'
file_name = 'final_data_normalize.xlsx'

In [19]:
df = pd.read_excel(PATH + file_name)

print(df.columns)

Index(['연구등록번호', 'Unnamed: 1', 'Diagnosis', 'Gender', '진단시점나이',
       'CMV IgM[Serum]', 'CMV IgG[Serum]', 'HSV IgM[Serum]', 'VZV IgM[Serum]',
       'VZV IgG[Serum]', 'WBC COUNT[Whole blood]',
       'Lymphocyte(#)[Whole blood]', 'Lymphocyte(%)[Whole blood]',
       'Monocyte(#)[Whole blood]', 'Monocyte(%)[Whole blood]',
       'Neutrophil(#)[Whole blood]', 'Neutrophil(%)[Whole blood]',
       'ESR[Whole blood]', 'CRP[Serum]'],
      dtype='object')


# Split data

In [20]:
feature_cols = ['Gender','진단시점나이','CMV IgM[Serum]', 'CMV IgG[Serum]', 'HSV IgM[Serum]', 'VZV IgM[Serum]',
       'VZV IgG[Serum]', 'WBC COUNT[Whole blood]',
       'Lymphocyte(#)[Whole blood]', 'Lymphocyte(%)[Whole blood]',
       'Monocyte(#)[Whole blood]', 'Monocyte(%)[Whole blood]',
       'Neutrophil(#)[Whole blood]', 'Neutrophil(%)[Whole blood]',
       'ESR[Whole blood]', 'CRP[Serum]']

X = df[feature_cols]
y = df['Diagnosis']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

print(y_train.value_counts())
print(y_valid.value_counts())
print(y_test.value_counts())

0    6473
2     762
1     102
Name: Diagnosis, dtype: int64
0    815
2     87
1     15
Name: Diagnosis, dtype: int64
0    823
2     82
1     13
Name: Diagnosis, dtype: int64


In [22]:
df_train = pd.DataFrame(X_train, columns=feature_cols)
df_train['Diagnosis'] = y_train

df_valid = pd.DataFrame(X_valid, columns=feature_cols)
df_valid['Diagnosis'] = y_valid

df_test = pd.DataFrame(X_test, columns=feature_cols)
df_test['Diagnosis'] = y_test

In [23]:
df_train.to_csv(PATH_learn + 'train.csv', index=False)
df_valid.to_csv(PATH_learn + 'valid.csv', index=False)
df_test.to_csv(PATH_learn + 'test.csv', index=False)

# Figure Data Imbalance problem

In [24]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, CondensedNearestNeighbour, OneSidedSelection, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN

## Under Sampling

In [26]:
tl = TomekLinks()
cnn = CondensedNearestNeighbour()
oss = OneSidedSelection()
enn = EditedNearestNeighbours()

x_tl, y_tl = tl.fit_resample(X_train, y_train)
x_cnn, y_cnn = cnn.fit_resample(X_train, y_train)
x_oss, y_oss = oss.fit_resample(X_train, y_train)
x_enn, y_enn = enn.fit_resample(X_train, y_train)

x_valid_tl, y_valid_tl = tl.fit_resample(X_valid, y_valid)
x_valid_cnn, y_valid_cnn = cnn.fit_resample(X_valid, y_valid)
x_valid_oss, y_valid_oss = oss.fit_resample(X_valid, y_valid)
x_valid_enn, y_valid_enn = enn.fit_resample(X_valid, y_valid)


In [27]:
print('-----Train Data-----')
print(y_tl.value_counts())
print(y_cnn.value_counts())
print(y_oss.value_counts())
print(y_enn.value_counts())

print('-----Valid Data-----')
print(y_valid_tl.value_counts())
print(y_valid_cnn.value_counts())
print(y_valid_oss.value_counts())
print(y_valid_enn.value_counts())

-----Train Data-----
0    6471
2     760
1     102
Name: Diagnosis, dtype: int64
0    156
1    102
2     36
Name: Diagnosis, dtype: int64
0    4152
2     709
1     102
Name: Diagnosis, dtype: int64
0    6393
2     717
1     102
Name: Diagnosis, dtype: int64
-----Valid Data-----
0    815
2     87
1     15
Name: Diagnosis, dtype: int64
0    33
2    16
1    15
Name: Diagnosis, dtype: int64
0    464
2     20
1     15
Name: Diagnosis, dtype: int64
0    808
2     72
1     15
Name: Diagnosis, dtype: int64


## Over Sampling

In [28]:
sm = SMOTE(random_state=42)
blsm = BorderlineSMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

x_sm, y_sm = sm.fit_resample(X_train, y_train)
x_blsm, y_blsm = blsm.fit_resample(X_train, y_train)
x_adasyn, y_adasyn = adasyn.fit_resample(X_train, y_train)

x_valid_sm, y_valid_sm = sm.fit_resample(X_valid, y_valid)
x_valid_blsm, y_valid_blsm = blsm.fit_resample(X_valid, y_valid)
x_valid_adasyn, y_valid_adasyn = adasyn.fit_resample(X_valid, y_valid)

In [29]:
print('-----Train Data-----')
print(y_sm.value_counts())
print(y_blsm.value_counts())
print(y_adasyn.value_counts())

print('-----Valid Data-----')
print(y_valid_sm.value_counts())
print(y_valid_blsm.value_counts())
print(y_valid_adasyn.value_counts())

-----Train Data-----
0    6473
2    6473
1    6473
Name: Diagnosis, dtype: int64
0    6473
2    6473
1    6473
Name: Diagnosis, dtype: int64
1    6480
0    6473
2    6466
Name: Diagnosis, dtype: int64
-----Valid Data-----
0    815
2    815
1    815
Name: Diagnosis, dtype: int64
0    815
2    815
1    815
Name: Diagnosis, dtype: int64
0    815
2    813
1    813
Name: Diagnosis, dtype: int64


## Combine

In [30]:
st = SMOTETomek(random_state=42)
stn = SMOTEENN(random_state=42)

x_st, y_st = st.fit_resample(X_train, y_train)
x_stn, y_stn = stn.fit_resample(X_train, y_train)

x_valid_st, y_valid_st = st.fit_resample(X_valid, y_valid)
x_valid_stn, y_valid_stn = stn.fit_resample(X_valid, y_valid)


In [31]:
print('-----Train Data-----')
print(y_st.value_counts())
print(y_stn.value_counts())

print('-----Valid Data-----')
print(y_valid_st.value_counts())
print(y_valid_stn.value_counts())

-----Train Data-----
0    6473
2    6473
1    6473
Name: Diagnosis, dtype: int64
1    6473
2    6471
0    6343
Name: Diagnosis, dtype: int64
-----Valid Data-----
0    815
2    815
1    815
Name: Diagnosis, dtype: int64
1    813
2    813
0    800
Name: Diagnosis, dtype: int64


In [34]:
df_smote = pd.DataFrame(x_sm, columns=feature_cols)
df_smote['Diagnosis'] = y_sm
df_valid_smote = pd.DataFrame(x_valid_sm, columns=feature_cols)
df_valid_smote['Diagnosis'] = y_valid_sm

df_adasyn = pd.DataFrame(x_adasyn, columns=feature_cols)
df_adasyn['Diagnosis'] = y_adasyn
df_valid_adasyn = pd.DataFrame(x_valid_adasyn, columns=feature_cols)
df_valid_adasyn['Diagnosis'] = y_valid_adasyn

df_smoteenn = pd.DataFrame(x_stn, columns=feature_cols)
df_smoteenn['Diagnosis'] = y_stn
df_valid_smoteenn = pd.DataFrame(x_valid_stn, columns=feature_cols)
df_valid_smoteenn['Diagnosis'] = y_valid_stn

# Train
df_smote.to_csv(PATH_learn + 'train_data/smote.csv', index=False)
df_adasyn.to_csv(PATH_learn + 'train_data/adasyn.csv', index=False)
df_smoteenn.to_csv(PATH_learn + 'train_data/smoteenn.csv', index=False)

# Valid
df_valid_smote.to_csv(PATH_learn + 'valid_data/smote.csv', index=False)
df_valid_adasyn.to_csv(PATH_learn + 'valid_data/adasyn.csv', index=False)
df_valid_smoteenn.to_csv(PATH_learn + 'valid_data/smoteenn.csv', index=False)