### データセットのダウンロード
https://github.com/oreilly-japan/ml-security-jp/blob/master/ch03/MalwareData.csv.gz  
を取得してこのノートブックと同じディレクトリに格納し、解凍して `MalwareData.csv` を作成する。  

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import model_selection
import ydata_profiling
import matplotlib.pyplot as plt
import optuna

In [None]:
# Load the decompressed dataset; fields in the file are separated by '|'.
MalwareDataset = pd.read_csv(
    'MalwareData.csv',
    sep='|',
)

In [None]:
# Build a profiling report of the raw dataset in "minimal" mode.
# Kept as the last expression of the cell so the notebook renders it.
ydata_profiling.ProfileReport(
    df=MalwareDataset,
    minimal=True,
)

In [None]:
%matplotlib inline

# Overlay the VersionInformationSize distribution for each value of the
# 'legitimate' label (1 first, then 0) as semi-transparent histograms.
for flag, label in ((1, '1'), (0, '0')):
    values = MalwareDataset.loc[MalwareDataset['legitimate'] == flag, 'VersionInformationSize']
    plt.hist(values, range=(0, 26), alpha=0.5, label=label)

plt.legend(title='legitimate')
plt.xlim(0, 26)
plt.show()

In [None]:
# Overlay the MajorSubsystemVersion distribution for each value of the
# 'legitimate' label (1 first, then 0) as semi-transparent histograms.
for flag, label in ((1, '1'), (0, '0')):
    values = MalwareDataset.loc[MalwareDataset['legitimate'] == flag, 'MajorSubsystemVersion']
    plt.hist(values, range=(0, 10), alpha=0.5, label=label)

plt.legend(title='legitimate')
# NOTE(review): the visible x-range (2..11) differs from the histogram
# range (0..10); presumably deliberate zooming -- confirm.
plt.xlim(2, 11)
plt.show()

In [None]:
# Split the dataset into features and target, dropping the identifier
# columns ('Name', 'md5') and the label from the feature matrix.
y = MalwareDataset['legitimate']
X = MalwareDataset.drop(['Name', 'md5', 'legitimate'], axis='columns')

# Fit an extra-trees model and let SelectFromModel keep the features it
# selects based on that already-fitted model (prefit=True).
FeatSelect = ExtraTreesClassifier().fit(X, y)
Model = SelectFromModel(FeatSelect, prefit=True)

# Boolean mask of the kept columns, and their names.
feature_idx = Model.get_support()
feature_name = X.columns[feature_idx]

# Rebuild X as a DataFrame holding only the selected features, with the
# original column names restored.
X = pd.DataFrame(Model.transform(X), columns=feature_name)

In [None]:
# Re-fit an extra-trees model on the selected features only and print them
# ranked by importance (most important first).
Features = X.shape[1]
FI = ExtraTreesClassifier().fit(X, y).feature_importances_
Index = np.argsort(FI)[::-1]
for idx in Index:
    # FI is aligned with the columns of the reduced X, so the name must be
    # looked up in X.columns. Indexing MalwareDataset.columns with an offset
    # would label the importances with unrelated column names.
    print("Feature: {} Importance: {:.5f}".format(X.columns[idx].ljust(30), FI[idx]))

In [None]:
# Hold out 20% of the rows as a test set; shuffling uses a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)


In [None]:
class Objective_RF:
    """Optuna objective that scores a RandomForestClassifier configuration.

    The instance stores the data to cross-validate on; calling it with an
    Optuna trial samples hyperparameters and returns the mean test score.
    """

    def __init__(self, X, y):
        """Store the feature matrix ``X`` and the target ``y``."""
        self.X, self.y = X, y

    def __call__(self, trial):
        """Sample hyperparameters from ``trial`` and return the mean CV score.

        The score is the mean of ``test_score`` over a 5-fold
        ``cross_validate`` run using the estimator's default scorer.
        """
        # NOTE: max_features is not tuned; it stays at the estimator default.
        params = {
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        }
        forest = RandomForestClassifier(**params)

        cv_result = cross_validate(forest, self.X, self.y, cv=5, n_jobs=-1)
        return cv_result['test_score'].mean()

In [None]:
# Tune the random-forest hyperparameters on the training split for 60 seconds.
objective = Objective_RF(X_train, y_train)
# The objective returns a cross-validation score (higher is better), so the
# study must maximise it; optuna.create_study() minimises by default, which
# would pick the worst-scoring parameters.
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=60)
print('params:', study.best_params)

In [None]:
# Rebuild a random forest from the best parameters found by the study,
# train it on the training split and evaluate it on the held-out test split.
best_params = study.best_params
# NOTE: max_features is not part of the tuned parameters; the default is used.
model = RandomForestClassifier(
    criterion=best_params['criterion'],
    bootstrap=best_params['bootstrap'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
)
model.fit(X_train, y_train)

pred = model.predict(X_test)

# Report the accuracy as a percentage, followed by the confusion matrix.
accuracy_pct = 100 * accuracy_score(y_test, pred)
print('正解率: {:.5f} %'.format(accuracy_pct))
print(confusion_matrix(y_test, pred))

In [None]:
# Feature importances of the fitted model, labelled with the feature names
# and sorted ascending so the largest bar is drawn at the top of the chart.
feat_importances = pd.Series(
    data=model.feature_importances_,
    index=X.columns,
).sort_values(ascending=True)

feat_importances.plot(kind='barh')

In [None]:
class Objective_GBC:
    """Optuna objective that scores a GradientBoostingClassifier configuration.

    The instance stores the data to cross-validate on; calling it with an
    Optuna trial samples hyperparameters and returns the mean test score.
    """

    def __init__(self, X, y):
        """Store the feature matrix ``X`` and the target ``y``."""
        self.X = X
        self.y = y

    def __call__(self, trial):
        """Sample hyperparameters from ``trial`` and return the mean CV score.

        The score is the mean of ``test_score`` over a 5-fold
        ``cross_validate`` run using the estimator's default scorer.
        """
        max_depth = trial.suggest_int('max_depth', 3, 10)
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
        # suggest_loguniform is deprecated in Optuna; suggest_float(log=True)
        # samples from the same log-uniform range and already returns a float.
        learning_rate = trial.suggest_float('learning_rate', 1e-2, 1e-0, log=True)
        criterion = trial.suggest_categorical('criterion', ['friedman_mse', 'squared_error'])

        model = GradientBoostingClassifier(
            max_depth=max_depth,
            max_features=max_features,
            learning_rate=learning_rate,
            criterion=criterion,
        )

        scores = cross_validate(model, self.X, self.y, cv=5, n_jobs=-1)

        return scores['test_score'].mean()

In [None]:
# Tune the gradient-boosting hyperparameters on the TRAINING split. Tuning on
# the test split would leak the evaluation data into model selection and
# make the accuracy reported afterwards optimistic.
objective = Objective_GBC(X_train, y_train)
# The objective returns a cross-validation score (higher is better), so the
# study must maximise it; optuna.create_study() minimises by default.
study = optuna.create_study(direction='maximize')

# NOTE(review): only a single trial is run, so no real search takes place;
# presumably this limits runtime -- raise n_trials if time allows.
study.optimize(objective, n_trials=1)

print('params:', study.best_params)

In [None]:
# Rebuild a gradient-boosting classifier from the best parameters found by
# the study, train it on the training split and evaluate it on the test split.
best_params = study.best_params
model = GradientBoostingClassifier(
    max_depth=best_params['max_depth'],
    max_features=best_params['max_features'],
    learning_rate=best_params['learning_rate'],
    criterion=best_params['criterion'],
)
model.fit(X_train, y_train)

pred = model.predict(X_test)

# Report the accuracy as a percentage, followed by the confusion matrix.
accuracy_pct = 100 * accuracy_score(y_test, pred)
print('正解率: {:.5f} %'.format(accuracy_pct))
print(confusion_matrix(y_test, pred))

In [None]:
# Feature importances of the fitted model, labelled with the feature names
# and sorted ascending so the largest bar is drawn at the top of the chart.
feat_importances = pd.Series(
    data=model.feature_importances_,
    index=X.columns,
).sort_values(ascending=True)
feat_importances.plot(kind='barh')

In [None]:
class Objective_ABC:
    """Optuna objective that scores an AdaBoostClassifier configuration.

    The instance stores the data to cross-validate on; calling it with an
    Optuna trial samples hyperparameters and returns the mean test score.
    """

    def __init__(self, X, y):
        """Store the feature matrix ``X`` and the target ``y``."""
        self.X = X
        self.y = y

    def __call__(self, trial):
        """Sample hyperparameters from ``trial`` and return the mean CV score.

        The score is the mean of ``test_score`` over a 5-fold
        ``cross_validate`` run using the estimator's default scorer.
        """
        # NOTE(review): 'SAMME.R' may be deprecated or rejected by recent
        # scikit-learn releases -- confirm against the installed version.
        algorithm = trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R'])
        # suggest_loguniform is deprecated in Optuna; suggest_float(log=True)
        # samples from the same log-uniform range.
        learning_rate = trial.suggest_float('learning_rate', 1e-2, 1e-0, log=True)

        model = AdaBoostClassifier(
            algorithm=algorithm,
            learning_rate=learning_rate,
        )

        scores = cross_validate(model, self.X, self.y, cv=5, n_jobs=-1)
        return scores['test_score'].mean()

In [None]:
# Tune the AdaBoost hyperparameters on the training split for 60 seconds.
objective = Objective_ABC(X_train, y_train)
# The objective returns a cross-validation score (higher is better), so the
# study must maximise it; optuna.create_study() minimises by default, which
# would pick the worst-scoring parameters.
study = optuna.create_study(direction='maximize')

study.optimize(objective, timeout=60)

print('params:', study.best_params)

In [None]:
# Rebuild an AdaBoost classifier from the best parameters found by the
# study, train it on the training split and evaluate it on the test split.
best_params = study.best_params
model = AdaBoostClassifier(
    algorithm=best_params['algorithm'],
    learning_rate=best_params['learning_rate'],
)
model.fit(X_train, y_train)

pred = model.predict(X_test)

# Report the accuracy as a percentage, followed by the confusion matrix.
accuracy_pct = 100 * accuracy_score(y_test, pred)
print('正解率: {:.5f} %'.format(accuracy_pct))
print(confusion_matrix(y_test, pred))

In [None]:
# Feature importances of the fitted model, labelled with the feature names
# and sorted ascending so the largest bar is drawn at the top of the chart.
feat_importances = pd.Series(
    data=model.feature_importances_,
    index=X.columns,
).sort_values(ascending=True)
feat_importances.plot(kind='barh')