## Валидация алгоритма

In [1]:
import os
import pandas as pd
import numpy as np
from task2.NaiveBayesSpamClassifier import NaiveBayesSpamClassifier

# Fraction (0..1) of each class's files used for the training set;
# the remaining files form the validation set.
TRAIN_VALUES_PART = 0.8

### Список имен спам и не-спам файлов

In [2]:
# Paths of the spam and non-spam message files.
# os.listdir returns entries in arbitrary order, so the listings are sorted:
# the train/validation split later in the notebook is positional, and without
# a stable order it would differ between runs and machines.
files_spam = [f'data/spam/{fname}' for fname in sorted(os.listdir('data/spam'))]
files_ham = [f'data/notSpam/{fname}' for fname in sorted(os.listdir('data/notSpam'))]

### Разбиваем файлы на обучающую и валидационную выборки

In [3]:
# Number of files per class that go into the training set
# (TRAIN_VALUES_PART of the class, rounded down).
num_files_spam_train = int(TRAIN_VALUES_PART * len(files_spam))
num_files_ham_train = int(TRAIN_VALUES_PART * len(files_ham))

# Training file names: the leading part of each list.
files_spam_train = files_spam[:num_files_spam_train]
files_ham_train = files_ham[:num_files_ham_train]

# Validation file names: everything that is left.
files_spam_validating = files_spam[num_files_spam_train:]
files_ham_validating = files_ham[num_files_ham_train:]

# Validation set sizes.
num_files_spam_vd = len(files_spam_validating)
num_files_ham_vd = len(files_ham_validating)
num_files_vd = num_files_spam_vd + num_files_ham_vd

# Labelled validation frames (is_spam: 1 = spam, 0 = not spam).
df_vd_spam = pd.DataFrame({'name': files_spam_validating, 'is_spam': [1] * num_files_spam_vd})
df_vd_ham = pd.DataFrame({'name': files_ham_validating, 'is_spam': [0] * num_files_ham_vd})

# Combine and shuffle the validation set.
# - pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# - ignore_index / reset_index give the frame a unique 0..n-1 index in its
#   final (shuffled) row order; the plain append left duplicated labels,
#   which makes any label-based alignment against it ambiguous.
# - random_state pins the shuffle so a re-run reproduces the same row order.
df_vd = (
    pd.concat([df_vd_spam, df_vd_ham], ignore_index=True)
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

### Создаём и обучаем классификатор (код самого классификатора — в файле `NaiveBayesSpamClassifier.py`)

In [4]:
# Instantiate the classifier and fit it on the training file-name lists
# (spam list first, then the non-spam list).
classifier = NaiveBayesSpamClassifier()
classifier.fit(files_spam_train, files_ham_train)

### Предсказываем категорию для каждого письма валидационной выборки

In [5]:
# Classify every validation file by name. The result is used below through
# its 'name' and 'is_spam' columns.
# NOTE(review): presumably one row per input name, in input order — confirm
# against NaiveBayesSpamClassifier.predict.
df_pred = classifier.predict(df_vd['name'])

### Считаем и выводим метрики

In [6]:
# Compare predictions with the ground truth positionally, on plain arrays.
# Combining the two pandas Series directly depends on their index labels
# lining up, which is fragile when the labels are duplicated or shuffled.
predicted = np.asarray(df_pred['is_spam'])
actual = np.asarray(df_vd['is_spam'])

# Positional comparison is only valid if both frames list the same files in
# the same order, so check that explicitly instead of assuming it.
assert len(predicted) == len(actual) == num_files_vd, \
    'prediction / validation size mismatch'
assert (np.asarray(df_pred['name']) == np.asarray(df_vd['name'])).all(), \
    'predictions are not in the same row order as the validation set'

# Each confusion-matrix mask is computed once and reused.
is_false_positive = (predicted == 1) & (actual == 0)
is_false_negative = (predicted == 0) & (actual == 1)
is_true_positive = (predicted == 1) & (actual == 1)
is_true_negative = (predicted == 0) & (actual == 0)

# Names of the misclassified files (boolean arrays select rows by position).
false_positives = df_pred['name'][is_false_positive]
false_negatives = df_pred['name'][is_false_negative]

num_true_positives = int(np.sum(is_true_positive))
num_true_negatives = int(np.sum(is_true_negative))

# Share of correctly classified validation files; NaN for an empty set
# instead of a division-by-zero error.
accuracy = (
    (num_true_positives + num_true_negatives) / num_files_vd
    if num_files_vd else float('nan')
)

print(f'accuracy: {accuracy} ({num_true_positives + num_true_negatives} / {num_files_vd})')
if len(false_positives) != 0:
    print(f'False positives [{len(false_positives)}]:\n\t' + '\n\t'.join(false_positives))
if len(false_negatives) != 0:
    print(f'False negatives [{len(false_negatives)}]:\n\t' + '\n\t'.join(false_negatives))

accuracy: 0.90625 (145 / 160)
False positives [3]:
	data/notSpam/000400.7b3d3743d2ec2f08d210cd8a2c
	data/notSpam/000399.6aff42ecf03e44df55f34ce6eb
	data/notSpam/000374.f00b971036b69384c6948d1b2a
False negatives [12]:
	data/spam/000342.8dca4ba3ace07255db1c125ce4
	data/spam/000378.cdf67e2a1f8e2dd2351a7899d9
	data/spam/000365.94bcde15ad0333351681a29a3e
	data/spam/000373.6ed8e54e0207bc38a0a2611905
	data/spam/000363.3c7b8959e332af5aed39897458
	data/spam/000389.1d5547366fbcd95877d38a9a39
	data/spam/000325.084173920278a8f5b5b171b8f3
	data/spam/000393.9d22124a06ed6d007eb737f7d3
	data/spam/000331.50392132b3c36ad6422dd8f4e0
	data/spam/000375.2b2732827154333f1ac3bc32ed
	data/spam/000352.0c8fd45deac89918457366848a
	data/spam/000348.bca86e7b4bed07838f9ad4d25f
