In [None]:
import os
import pandas as pd
import numpy as np
import random

# One seed for everything: Python's `random`, NumPy's legacy global RNG, and the
# `random_state=seed` argument passed to every DataFrame.sample call in this notebook.
seed = 0
np.random.seed(seed)
random.seed(seed)

In [99]:
# Folder that holds the EDDFS CSV files read below.
EDDFS_root = './data/EDDFS'
# Derive both paths from EDDFS_root so that relocating the dataset is a one-line
# change; the resulting path strings are identical to the previously hard-coded ones.
EDDFS_train_df = pd.read_csv(f'{EDDFS_root}/train.csv')
EDDFS_test_df = pd.read_csv(f'{EDDFS_root}/test.csv')

In [100]:
# Stack the train and test rows into a single frame. ignore_index=True assigns a
# fresh 0..n-1 index, so every row gets a unique label; the later cells rely on
# index labels to tell the splits apart.
EDDFS_df = pd.concat(
    (EDDFS_train_df, EDDFS_test_df),
    axis=0,
    ignore_index=True,
)

In [101]:
# 'abnormal' is the inverse of 'normal': 0 where normal == 1, and 1 for every other
# value (including missing ones). Computed as a vectorised comparison instead of a
# per-row Python lambda; the explicit int64 cast keeps the column an integer flag.
EDDFS_df['abnormal'] = (EDDFS_df['normal'] != 1).astype('int64')

In [103]:
# Partition the merged frame by the 'normal' flag. A row whose flag is neither 1
# nor 0 lands in neither frame.
EDDFS_df_normal_original = EDDFS_df.loc[EDDFS_df['normal'] == 1]
EDDFS_df_abnormal_original = EDDFS_df.loc[EDDFS_df['normal'] == 0]

In [None]:
# Row counts as (normal, abnormal).
EDDFS_df_normal_original.shape[0], EDDFS_df_abnormal_original.shape[0]

In [105]:
# Keep only the abnormal rows in which the RVO, LS, hyper and others columns are all 0.
EDDFS_df_abnormal_original_no_others = EDDFS_df_abnormal_original.loc[
    (EDDFS_df_abnormal_original['RVO'] == 0)
    & (EDDFS_df_abnormal_original['LS'] == 0)
    & (EDDFS_df_abnormal_original['hyper'] == 0)
    & (EDDFS_df_abnormal_original['others'] == 0)
]

In [None]:
len(EDDFS_df_abnormal_original_no_others)

In [None]:
# Hold out a balanced validation set: 50 randomly sampled normal rows plus 50
# randomly sampled abnormal rows (the latter drawn from the subset whose
# RVO/LS/hyper/others columns are all 0). random_state=seed makes the draw repeatable.
EDDFS_df_normal_val = EDDFS_df_normal_original.sample(n=50, random_state=seed)
EDDFS_df_abnormal_val = EDDFS_df_abnormal_original_no_others.sample(n=50, random_state=seed)
EDDFS_df_val = pd.concat([EDDFS_df_normal_val, EDDFS_df_abnormal_val])
# to_csv does not create missing parent folders, so make sure the output folder
# exists; otherwise a run on a fresh checkout fails here.
os.makedirs('./data/EDDFS/BenchReAD', exist_ok=True)
EDDFS_df_val.to_csv('./data/EDDFS/BenchReAD/valid.csv', index=False)
print(len(EDDFS_df_val))

In [None]:
# Exclude the validation rows (matched by index label) from both pools, so the
# training splits built from these frames cannot overlap with the validation set.
_normal_in_val = EDDFS_df_normal_original.index.isin(EDDFS_df_normal_val.index)
_abnormal_in_val = EDDFS_df_abnormal_original_no_others.index.isin(EDDFS_df_abnormal_val.index)
EDDFS_df_normal = EDDFS_df_normal_original.loc[~_normal_in_val]
EDDFS_df_abnormal_no_others = EDDFS_df_abnormal_original_no_others.loc[~_abnormal_in_val]
print(len(EDDFS_df_normal), len(EDDFS_df_abnormal_no_others))

In [None]:
# Labeled training split: one third (integer division) of the remaining normal rows ...
EDDFS_df_normal_labeled = EDDFS_df_normal.sample(n=len(EDDFS_df_normal)//3, random_state=seed)
print('normal labeled:', len(EDDFS_df_normal_labeled))
# ... plus one third (integer division) of the remaining abnormal rows.
EDDFS_df_abnormal_labeled = EDDFS_df_abnormal_no_others.sample(n=len(EDDFS_df_abnormal_no_others)//3, random_state=seed)
print('abnormal labeled:', len(EDDFS_df_abnormal_labeled))
EDDFS_df_train_labeled = pd.concat([EDDFS_df_normal_labeled, EDDFS_df_abnormal_labeled])
# to_csv does not create missing parent folders, so make sure the output folder
# exists; this keeps the cell runnable on its own on a fresh checkout.
os.makedirs('./data/EDDFS/BenchReAD', exist_ok=True)
EDDFS_df_train_labeled.to_csv('./data/EDDFS/BenchReAD/train_labeled.csv', index=False)
print('train labeled:', len(EDDFS_df_train_labeled))


In [None]:
# Unlabeled training split: every remaining normal / abnormal row that was not
# sampled into the labeled split. Rows are matched by index label, which is unique
# because the merged frame was built with ignore_index=True.
EDDFS_df_normal_unlabeled = EDDFS_df_normal[~EDDFS_df_normal.index.isin(EDDFS_df_train_labeled.index)]
print('normal unlabeled:', len(EDDFS_df_normal_unlabeled))
EDDFS_df_abnormal_unlabeled = EDDFS_df_abnormal_no_others[~EDDFS_df_abnormal_no_others.index.isin(EDDFS_df_train_labeled.index)]
print('abnormal unlabeled:', len(EDDFS_df_abnormal_unlabeled))
EDDFS_df_train_unlabeled = pd.concat([EDDFS_df_normal_unlabeled, EDDFS_df_abnormal_unlabeled])
# to_csv does not create missing parent folders, so make sure the output folder
# exists; this keeps the cell runnable on its own on a fresh checkout.
os.makedirs('./data/EDDFS/BenchReAD', exist_ok=True)
EDDFS_df_train_unlabeled.to_csv('./data/EDDFS/BenchReAD/train_unlabeled.csv', index=False)
print('train unlabeled:', len(EDDFS_df_train_unlabeled))


In [111]:
# Sanity check on sizes: labeled + unlabeled + validation must add up to the pool
# each class was drawn from (all normal rows; abnormal rows with no other findings).
assert sum(
    len(part)
    for part in (EDDFS_df_normal_labeled, EDDFS_df_normal_unlabeled, EDDFS_df_normal_val)
) == len(EDDFS_df_normal_original)
assert sum(
    len(part)
    for part in (EDDFS_df_abnormal_labeled, EDDFS_df_abnormal_unlabeled, EDDFS_df_abnormal_val)
) == len(EDDFS_df_abnormal_original_no_others)