In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored on the drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import random

## 문제 1

In [None]:
# Location of the dataset on the mounted drive (relative to the working directory).
path = "./drive/MyDrive/colab/ML_lab/"
file = path + "lab6_spambase.csv"

# The CSV is read without a header row. The last column is used as the label
# and every other column as a feature.
data = pd.read_csv(file, header=None).to_numpy()
x_data, y_data = data[:, :-1], data[:, -1]

# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.3, random_state=0
)

In [None]:
# Baseline: entropy-criterion decision tree fitted on the original split.
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Report F1 and plain accuracy on the held-out test rows.
baseline_f1 = f1_score(y_test, y_pred)
baseline_acc = np.mean(y_pred == y_test)
print("F1... \t {:.3f}".format(baseline_f1))
print("ACC... \t {:.3f}".format(baseline_acc))

F1... 	 0.839
ACC... 	 0.967


## 문제 2

In [None]:
# Partition the rows by the label in the last column:
# a label of 0 goes to ham, any other label goes to spam.
spam = [row for row in data if row[-1] != 0]
ham = [row for row in data if row[-1] == 0]

# Class sizes before any resampling is applied.
number_of_spam, number_of_ham = len(spam), len(ham)

In [None]:
random.seed(42)

# Oversample the smaller class until minority : majority is 4 : 6,
# i.e. the minority class reaches 2/3 of the majority class size.
size = max(number_of_spam, number_of_ham) * (2/3)
# Clamp at zero: if the minority class already meets the ratio, add nothing.
oversampling = max(0, int(size - min(number_of_spam, number_of_ham)))

# Trim both lists back to their recorded sizes before appending. Without this,
# re-running the cell would stack a second round of duplicates on top of the
# first one and break the 4 : 6 ratio.
spam = spam[:number_of_spam]
ham = ham[:number_of_ham]

# Pick the smaller class. When both classes are the same size, `oversampling`
# is 0 and the loop below does nothing, so the choice is irrelevant.
if number_of_spam < number_of_ham:
  minority, minority_count = spam, number_of_spam
else:
  minority, minority_count = ham, number_of_ham

# Duplicate randomly chosen original rows (with replacement). Indices are drawn
# from the original rows only, never from rows appended by this loop.
for _ in range(oversampling):
  minority.append(minority[random.randint(0, minority_count - 1)])

In [None]:
# Rebuild a single dataset from the spam and ham rows after oversampling.
# NOTE(review): the oversampling step duplicates rows before this split, so
# copies of the same row can land in both the train and the test set, which
# may inflate the scores below. Confirm whether that is intended.
new_data_set = np.array([*spam, *ham])

# Separate features (all but the last column) from labels (last column),
# then split train : test = 7 : 3 with a fixed seed.
new_x_data, new_y_data = new_data_set[:, :-1], new_data_set[:, -1]
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(
    new_x_data, new_y_data, test_size=0.3, random_state=0
)

In [None]:
# Same entropy-criterion decision tree, fitted on the resampled training split.
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(new_x_train, new_y_train)
new_y_pred = clf.predict(new_x_test)

# Report F1 and plain accuracy on the resampled test split.
resampled_f1 = f1_score(new_y_test, new_y_pred)
resampled_acc = np.mean(new_y_pred == new_y_test)
print("F1... \t {:.3f}".format(resampled_f1))
print("ACC... \t {:.3f}".format(resampled_acc))

F1... 	 0.972
ACC... 	 0.978


## 문제 3

In [None]:
# Start again from the original rows: a label of 0 in the last column
# goes to ham, any other label goes to spam.
spam = [row for row in data if row[-1] != 0]
ham = [row for row in data if row[-1] == 0]

# Class sizes before any resampling is applied.
number_of_spam, number_of_ham = len(spam), len(ham)

In [None]:
random.seed(42)

# Undersample the larger class until minority : majority is 4 : 6,
# i.e. keep 3/2 times as many majority rows as there are minority rows.
size = min(number_of_spam, number_of_ham) * (3/2)
# Never ask for more rows than the larger class holds. The previous rejection
# loop would spin forever in that case, because it kept drawing indices until
# it found one it had not used yet.
undersampling = min(int(size), max(number_of_spam, number_of_ham))

if number_of_spam == number_of_ham:
  # Classes are already the same size: nothing to undersample.
  undersampled_result = []
else:
  # The larger class is the one to be reduced.
  majority = spam if number_of_spam > number_of_ham else ham
  # random.sample picks distinct rows (no repeats) in a single call. This
  # replaces the hand-rolled "draw until unseen" loop and its linear-time
  # membership check on a list. The chosen subset is still fully determined
  # by the seed, but it is not the same subset the old loop produced.
  undersampled_result = random.sample(majority, undersampling)

In [None]:
# Combine the untouched smaller class with the undersampled larger class.
# When both classes are the same size, the original data is used unchanged.
if number_of_spam == number_of_ham:
  new_data_set = data
else:
  kept_minority = ham if number_of_spam > number_of_ham else spam
  new_data_set = np.array(kept_minority + undersampled_result)

# Separate features (all but the last column) from labels (last column),
# then split train : test = 7 : 3 with a fixed seed.
new_x_data, new_y_data = new_data_set[:, :-1], new_data_set[:, -1]
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(
    new_x_data, new_y_data, test_size=0.3, random_state=0
)

In [None]:
# Same entropy-criterion decision tree, fitted on the undersampled training split.
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(new_x_train, new_y_train)
new_y_pred = clf.predict(new_x_test)

# Report F1 and plain accuracy on the undersampled test split.
undersampled_f1 = f1_score(new_y_test, new_y_pred)
undersampled_acc = np.mean(new_y_pred == new_y_test)
print("F1... \t {:.3f}".format(undersampled_f1))
print("ACC... \t {:.3f}".format(undersampled_acc))

F1... 	 0.714
ACC... 	 0.789
