In [None]:
import ember
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global RNG so NumPy-based random draws in this notebook
# (e.g. the np.random.choice subsampling) are repeatable across runs.
SEED = 0
np.random.seed(SEED)

In [None]:
# Read the vectorized train/test feature matrices and label vectors
# from the local EMBER data directory.
X_train, y_train, X_test, y_test = ember.read_vectorized_features(
    "./ember_data/ember2018/"
)

In [None]:
# Rows whose label is -1 are excluded from training
# (presumably these are unlabeled samples -- confirm against the EMBER docs).
train_rows = (y_train != -1)
labeled_index = np.flatnonzero(train_rows)

# Limit the number of samples because of memory constraints.
target_num = 160000
# Positions are drawn among the kept rows only, exactly as if the filtered
# array had been built first, so the random draw is unchanged.
target_index = np.random.choice(labeled_index.shape[0], target_num, replace=False)

# Compose "keep labelled rows" and "pick target_num of them" into a single
# index array. This selects the same rows in the same order as
# X_train[train_rows][target_index], but never materialises the intermediate
# array holding *all* kept rows.
selected_rows = labeled_index[target_index]
X_train = X_train[selected_rows]
y_train = y_train[selected_rows]

# Show how many samples of each class ended up in the subsample.
print(pd.DataFrame(y_train).value_counts())

In [None]:
# Plot the feature values of training sample 6 before scaling,
# for comparison with the scaled version plotted further down.
orig_data = pd.DataFrame(X_train[6])
orig_data.plot()

In [None]:
# Standardize the features. The scaler's statistics are learned from the
# training data only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Plot the same training sample (row 6) after standardization.
scaled_data = pd.DataFrame(X_train[6])
scaled_data.plot()

In [None]:
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam
import gc
import optuna

In [None]:
def _fit_trial_model(trial):
    """Build and train one candidate network; return its validation error.

    Hyper-parameters sampled from ``trial`` (in this order):
      * ``dropout_rate``  -- uniform in [0, 0.5]
      * ``learning_rate`` -- log-uniform in [1e-5, 1e-1]
      * ``beta_1``        -- uniform in [0.0, 1.0]
      * ``beta_2``        -- uniform in [0.0, 1.0]

    Returns:
        ``1 - val_accuracy`` of the last training epoch.
    """
    model = Sequential()
    model.add(Dense(2048, activation='relu', input_dim=2381))
    model.add(Dense(1024, activation='relu'))

    dropout_rate = trial.suggest_uniform('dropout_rate', 0, 0.5)
    model.add(Dropout(dropout_rate))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    optimizer = Adam(
        learning_rate=trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        beta_1=trial.suggest_uniform('beta_1', 0.0, 1.0),
        beta_2=trial.suggest_uniform('beta_2', 0.0, 1.0)
        )

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # The shared training arrays are passed directly: fit() only reads its
    # inputs, so duplicating the whole training set for every trial just
    # costs memory.
    history = model.fit(
        X_train,
        y_train,
        batch_size=512,
        epochs=5,
        validation_split=0.2
    )

    return 1 - history.history['val_accuracy'][-1]


def Objective(trial):
    """Optuna objective: validation error of a network trained with trial params.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        ``1 - val_accuracy`` after the final epoch, so the study should be
        created with ``direction='minimize'``.

    The Keras session is cleared and a garbage collection is forced after
    every trial -- including trials that raise -- so that state from one
    trial does not pile up in the next. The model, optimizer and history
    live only inside the helper's frame.
    """
    try:
        return _fit_trial_model(trial)
    finally:
        clear_session()
        gc.collect()

In [None]:
# Hyper-parameter search: Objective returns a validation error, so the study
# minimizes it. The search stops after 5 trials or 1200 seconds, whichever
# limit is reached first.
study = optuna.create_study(direction='minimize')
study.optimize(
    Objective,
    n_trials=5,
    timeout=1200,
)
print('Best params:', study.best_params)

In [None]:
from sklearn.model_selection import KFold, cross_validate
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
def buildmodel():
    """Return a compiled network configured with the study's best parameters.

    The layer stack is 2048 -> 1024 -> dropout -> 1024 -> 1 (sigmoid). The
    dropout rate and the Adam settings (learning rate, beta_1, beta_2) are
    read from the module-level ``study``.
    """
    best = study.best_params

    estimator = Sequential([
        Dense(2048, activation='relu', input_dim=2381),
        Dense(1024, activation='relu'),
        Dropout(rate=best['dropout_rate']),
        Dense(1024, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

    estimator.compile(
        optimizer=Adam(
            learning_rate=best['learning_rate'],
            beta_1=best['beta_1'],
            beta_2=best['beta_2'],
        ),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return estimator



In [None]:
# Cross-validate the tuned network: each of the 5 folds trains a fresh model
# built by buildmodel(), and the mean held-out score is reported.
estimator = KerasClassifier(
    build_fn=buildmodel,
    epochs=5,
    batch_size=256,
    verbose=1,
)
results = cross_validate(estimator, X_train, y_train, cv=5)
fold_scores = results['test_score']
print('Test accuracy:', fold_scores.mean())

In [None]:
# Train the final model on the full (subsampled, scaled) training set and
# write it to disk in HDF5 format.
estimator = buildmodel()
estimator.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=5,
)
estimator.save('detect_malware_model.h5')
