# Undersampling Data Manipulation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import resample

# Read the four source CSV files, one DataFrame per file.
df0, df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset0.csv', 'dataset1.csv', 'dataset2.csv', 'dataset3.csv')
)
# Drop the first column of every frame (presumably a saved index column --
# TODO confirm), then stack the rows under a fresh 0..n-1 index.
df_merge = pd.concat(
    [frame.iloc[:, 1:] for frame in (df0, df1, df2, df3)],
    ignore_index=True,
)

### 1) Undersample dataset with per-class keep ratios (0.3, 0.7, 1, 1) for levels 0–3

In [20]:
def undersample_classes(class_frames, ratios, random_state=123):
    """Down-sample each class frame to a fraction of its original size.

    Parameters
    ----------
    class_frames : dict
        Maps a class label to the DataFrame holding that class's rows.
    ratios : dict
        Maps a class label to the fraction of its rows to keep (0 to 1).
        Only the labels listed here are sampled.
    random_state : int, default 123
        Seed forwarded to ``sklearn.utils.resample`` for reproducibility.

    Returns
    -------
    list of DataFrame
        One sampled frame per entry of ``ratios``, in the same order.

    Raises
    ------
    KeyError
        If ``ratios`` names a label that is missing from ``class_frames``.
    ValueError
        If a ratio lies outside the interval [0, 1].
    """
    # Check everything up front so a bad entry fails before any sampling runs.
    unknown = [label for label in ratios if label not in class_frames]
    if unknown:
        raise KeyError(
            f"classes {unknown} are not present in the data; "
            f"available classes: {list(class_frames)}"
        )
    for label, ratio in ratios.items():
        if not 0 <= ratio <= 1:
            raise ValueError(
                f"ratio for class {label!r} must be between 0 and 1, got {ratio!r}"
            )

    sampled_frames = []
    for label, ratio in ratios.items():
        frame = class_frames[label]
        # Truncate toward zero to get the number of rows to keep.
        n_samples = int(len(frame) * ratio)
        if n_samples == 0:
            # Nothing to keep: return an empty frame with the same columns
            # instead of calling resample, which may reject n_samples=0
            # depending on the scikit-learn version.
            sampled_frames.append(frame.iloc[:0])
            continue
        sampled_frames.append(
            resample(frame,
                     replace=False,  # sample without replacement
                     n_samples=n_samples,  # target number of rows
                     random_state=random_state)  # reproducible draw
        )
    return sampled_frames


# Fraction of rows to keep for each `level` value; 1 keeps the whole class.
undersample_ratios = {0: 0.3, 1: 0.7, 2: 1, 3: 1}

# Build a separate DataFrame for each class.
class_dfs = {cls: df_merge[df_merge['level'] == cls] for cls in df_merge['level'].unique()}

# Perform the undersampling.
undersampled_dfs = undersample_classes(class_dfs, undersample_ratios)

# Combine the undersampled per-class frames.
final_undersampled_df = pd.concat(undersampled_dfs)

In [21]:
# Show the combined undersampled frame as this cell's output.
final_undersampled_df

Unnamed: 0,level,full_log
211221,0,Oct 25 03:54:03 localhost logstash: 7304 S...
205640,0,Jan 28 10:36:06 localhost logstash: 18862 ...
28990,0,Feb 8 20:04:25 localhost logstash: [2021-02-0...
227783,0,Feb 26 14:15:06 localhost logstash: 2468 ...
97503,0,Nov 25 22:46:00 localhost suricata[1454]: [1:2...
...,...,...
471870,3,File '/etc/lpp/diagnostics/data/diagrpt21.dat'...
472877,3,ossec: output: 'netstat listening ports':\ntcp...
472489,3,ossec: output: 'netstat listening ports':\ntcp...
471845,3,ossec: output: 'netstat listening ports':\ntcp...


In [23]:
# Rows per `level` value after undersampling -- a quick check of the class balance.
final_undersampled_df['level'].value_counts()

level
0    100219
1     92761
2      4141
3      2219
Name: count, dtype: int64

In [26]:
# Write the first undersampled set to sample1.csv, omitting the DataFrame index.
final_undersampled_df.to_csv('sample1.csv', index=False)

### 2) Undersample dataset with per-class keep ratios (0.1, 0.3, 1, 1) for levels 0–3

In [28]:
# Keep roughly 10% of level 0 and 30% of level 1; levels 2 and 3 are kept whole.
undersample_ratios = {0: 0.1, 1: 0.3, 2: 1, 3: 1}

# Partition the merged frame into one sub-frame per distinct `level` value.
level_values = df_merge['level']
class_dfs = {}
for cls in level_values.unique():
    class_dfs[cls] = df_merge[level_values == cls]

# Draw a fixed-seed sample without replacement from every listed class; the
# sample size is the class size times its ratio, truncated to an integer.
undersampled_dfs = [
    resample(
        class_dfs[cls],
        replace=False,
        n_samples=int(len(class_dfs[cls]) * ratio),
        random_state=123,
    )
    for cls, ratio in undersample_ratios.items()
]

# Stack the per-class samples in the order the ratios were listed.
final_undersampled_df2 = pd.concat(undersampled_dfs)

In [29]:
# Rows per `level` value in the second undersampled set.
final_undersampled_df2['level'].value_counts()

level
1    39755
0    33406
2     4141
3     2219
Name: count, dtype: int64

In [32]:
# Write the second undersampled set to sample2.csv, omitting the DataFrame index.
final_undersampled_df2.to_csv('sample2.csv', index=False)

### 3) Undersample every class down to the size of the smallest class

In [35]:
# Distinct class labels. Rows with a missing `level` are left out: NaN never
# compares equal to itself, so it would form an empty class and force the
# common sample size below down to zero.
classes = df_merge['level'].dropna().unique()
class_dfs = {cls: df_merge[df_merge['level'] == cls] for cls in classes}
if not class_dfs:
    raise ValueError("df_merge contains no labelled rows to undersample")

# Every class is cut down to the size of the smallest class.
min_count = min(len(class_df) for class_df in class_dfs.values())
undersampled_dfs = [resample(class_df,
                             replace=False,  # sample without replacement
                             n_samples=min_count,
                             random_state=123)  # reproducible draw
                    for class_df in class_dfs.values()]
final_undersampled_df3 = pd.concat(undersampled_dfs)

# Each class should now report the same count.
print(final_undersampled_df3['level'].value_counts())

level
0    2219
1    2219
2    2219
3    2219
Name: count, dtype: int64


In [37]:
# Write the class-balanced set to sample3.csv, omitting the DataFrame index.
final_undersampled_df3.to_csv('sample3.csv', index=False)