### データセットのダウンロード
https://github.com/oreilly-japan/ml-security-jp/blob/master/ch02/enron1.zip  
をダウンロードし、このノートブックと同じディレクトリに置いて解凍する。  
enron1  
├── ham  
├── spam  
└── Summary.txt  
といった配置になる。

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import optuna
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
import os
import codecs
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import optuna.integration.lightgbm as olgb
import optuna
import lightgbm as lgb
import matplotlib.pyplot as plt


In [None]:
def init_lists(folder):
    """Read every file in *folder* and return the contents as strings.

    Args:
        folder: Path of the directory to read. A trailing path separator
            is optional.

    Returns:
        A list with one string per directory entry, ordered by sorted
        filename. An empty directory yields an empty list.

    Raises:
        OSError: If *folder* cannot be listed, or an entry cannot be
            opened as a file (for example, a sub-directory).
    """
    key_list = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result identical from run to run.
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)
        # Bytes that are not valid UTF-8 are dropped instead of raising.
        # newline='' disables newline translation so the text is returned
        # exactly as stored. The with-block closes each file promptly.
        with open(path, 'r', encoding='utf-8', errors='ignore',
                  newline='') as f:
            key_list.append(f.read())

    return key_list

In [None]:
# Load both classes from disk.
spam = init_lists('./enron1/spam/')
ham = init_lists('./enron1/ham/')

# Pair every mail with its class label ('1' = spam, '0' = ham); spam first.
all_mails = (
    [(mail, '1') for mail in spam]
    + [(mail, '0') for mail in ham]
)

In [None]:
# One row per mail: the raw text and its string class label.
df = pd.DataFrame(data=all_mails, columns=['text', 'label'])

In [None]:
# TF-IDF vectorizer that discards English stop words.
# NOTE(review): the vectorizer is fitted on all mails before the train/test
# split, so the learned vocabulary/IDF weights also see the held-out mails —
# confirm this is acceptable.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and vectorize every mail in one pass.
X = tfidf.fit_transform(df['text'])

# Feature names, used below as the column labels of X.
column_names = tfidf.get_feature_names_out()

In [None]:
# Densify the TF-IDF matrix into a float DataFrame whose columns are the
# vocabulary terms.
X = pd.DataFrame(X.toarray().astype(float), columns=column_names)

# Labels are stored as the strings '0'/'1'; convert them to floats.
y = df['label'].astype(float)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
    shuffle=True,
)

In [None]:
# Training split wrapped as a LightGBM dataset for the tuner.
train = olgb.Dataset(X_train, label=y_train)

# Fixed settings; the remaining hyperparameters are left to the tuner.
params = dict(
    objective='binary',
    verbosity=-1,
    boosting_type='gbdt',
)

# Cross-validated hyperparameter search with 100 boosting rounds per trial.
tuner = olgb.LightGBMTunerCV(
    params=params,
    train_set=train,
    num_boost_round=100,
)

tuner.run()

In [None]:
# The params handed to the tuner set no 'metric', so best_score is the
# cross-validated binary log loss (lower is better). It is not an error
# rate, so `1 - best_score` would not be an accuracy; report it as-is.
print('Best score (CV binary_logloss, lower is better):', tuner.best_score)
best_params = tuner.best_params

# List every parameter of the best trial, one per line.
print('Best params:')
for key, value in best_params.items():
    print(f'    {key}: {value}')

In [None]:
# Wrap the train/test splits as LightGBM datasets.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Fixed settings plus the hyperparameters selected by the tuner.
params = {
    'objective': 'binary',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'lambda_l1': best_params['lambda_l1'],
    'lambda_l2': best_params['lambda_l2'],
    'num_leaves': best_params['num_leaves'],
    'feature_fraction': best_params['feature_fraction'],
    'bagging_fraction': best_params['bagging_fraction'],
    'bagging_freq': best_params['bagging_freq'],
    'min_child_samples': best_params['min_child_samples']
}

# `verbose_eval` is no longer passed: lgb.train() dropped that argument in
# LightGBM 4.0 (passing it raises TypeError). It was redundant here anyway:
# 'verbosity': -1 silences logging and no validation sets are evaluated.
gbm = lgb.train(params, train_data, num_boost_round=100)

In [None]:
# Model scores for the held-out rows, rounded to the nearest integer to get
# hard 0/1 labels.
preds = gbm.predict(X_test)
pred_labels = np.rint(preds)

# Accuracy as a percentage, followed by the confusion matrix.
print('正解率: {:.5f}%'.format(accuracy_score(y_test, pred_labels) * 100))
print(confusion_matrix(y_test, pred_labels))

In [None]:
# Plot feature importances of the trained booster, limited to ten features.
lgb.plot_importance(
    gbm,
    max_num_features=10,
    figsize=(12, 6),
)
plt.show()

In [None]:
# Select the spam rows (label '1').
spam_rows = df['label'] == '1'
spam_data = df.loc[spam_rows]

# Total occurrences of the substring 'subject' across all spam texts.
# NOTE(review): str.count is case-sensitive, so 'Subject' is not counted —
# confirm that is intended.
count = sum(text.count('subject') for text in spam_data['text'])

print(count)

In [None]:
# Select the legitimate (ham) rows (label '0').
legit_rows = df['label'] == '0'
legit_data = df.loc[legit_rows]

# Total occurrences of the substring 'subject' across all ham texts.
# NOTE(review): str.count is case-sensitive, so 'Subject' is not counted —
# confirm that is intended.
count = sum(text.count('subject') for text in legit_data['text'])

print(count)