In [20]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from os import listdir
from datetime import datetime, date, timedelta
from sklearn.metrics import accuracy_score
import pickle 
from keras.preprocessing import sequence
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.models import load_model
from keras.callbacks import ModelCheckpoint

In [21]:
# The full training CSV is too large to load comfortably with pandas' default
# 64-bit dtypes, so every column is given a compact dtype up front
# (e.g. float32 in place of float64, 'category' for string columns).
# Columns are listed grouped by their target dtype and then flattened into the
# {column: dtype} mapping shape that read_csv's `dtype=` argument accepts.
# NOTE(review): in the code visible here, read_csv is called with
# dtype='category' and does not reference this mapping -- confirm whether it is
# used elsewhere.
_columns_by_dtype = {
    'category': [
        'MachineIdentifier',
        'ProductName',
        'EngineVersion',
        'AppVersion',
        'AvSigVersion',
        'Platform',
        'Processor',
        'OsVer',
        'OsPlatformSubRelease',
        'OsBuildLab',
        'SkuEdition',
        'PuaMode',
        'SmartScreen',
        'Census_MDC2FormFactor',
        'Census_DeviceFamily',
        'Census_ProcessorClass',
        'Census_PrimaryDiskTypeName',
        'Census_ChassisTypeName',
        'Census_PowerPlatformRoleName',
        'Census_InternalBatteryType',
        'Census_OSVersion',
        'Census_OSArchitecture',
        'Census_OSBranch',
        'Census_OSEdition',
        'Census_OSSkuName',
        'Census_OSInstallTypeName',
        'Census_OSWUAutoUpdateOptionsName',
        'Census_GenuineStateName',
        'Census_ActivationChannel',
        'Census_FlightRing',
    ],
    'int8': [
        'IsBeta',
        'IsSxsPassiveMode',
        'HasTpm',
        'AutoSampleOptIn',
        'Census_HasOpticalDiskDrive',
        'Census_IsPortableOperatingSystem',
        'Census_IsSecureBootEnabled',
        'Census_IsTouchEnabled',
        'Census_IsPenCapable',
        'HasDetections',
    ],
    'int16': [
        'CountryIdentifier',
        'LocaleEnglishNameIdentifier',
        'OsBuild',
        'OsSuite',
        'Census_OSBuildNumber',
        'Census_OSUILocaleIdentifier',
    ],
    'int32': [
        'Census_OSBuildRevision',
    ],
    'float16': [
        'RtpStateBitfield',
        'AVProductsInstalled',
        'AVProductsEnabled',
        'OrganizationIdentifier',
        'GeoNameIdentifier',
        'IsProtected',
        'SMode',
        'IeVerIdentifier',
        'Firewall',
        'Census_ProcessorCoreCount',
        'Census_ProcessorManufacturerIdentifier',
        'Census_OSInstallLanguageIdentifier',
        'Census_IsFlightingInternal',
        'Census_IsFlightsDisabled',
        'Census_ThresholdOptIn',
        'Census_FirmwareManufacturerIdentifier',
        'Census_IsWIMBootEnabled',
        'Census_IsVirtualDevice',
        'Census_IsAlwaysOnAlwaysConnectedCapable',
        'Wdft_IsGamer',
        'Wdft_RegionIdentifier',
    ],
    'float32': [
        'DefaultBrowsersIdentifier',
        'AVProductStatesIdentifier',
        'CityIdentifier',
        'Census_OEMNameIdentifier',
        'Census_OEMModelIdentifier',
        'Census_ProcessorModelIdentifier',
        'Census_TotalPhysicalRAM',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_FirmwareVersionIdentifier',
    ],
    'float64': [
        'UacLuaenable',
        'Census_PrimaryDiskTotalCapacity',
        'Census_SystemVolumeTotalCapacity',
        'Census_InternalBatteryNumberOfCharges',
    ],
}

# Flatten {dtype: [columns]} into {column: dtype}.
dtypes = {
    column: dtype
    for dtype, columns in _columns_by_dtype.items()
    for column in columns
}

def _load_npy_object(path):
    """Read a .npy file with pickling enabled and index the result with ``[()]``.

    allow_pickle=True unpickles arbitrary objects, so this should only be
    pointed at trusted files.
    """
    archive = np.load(path, allow_pickle=True)
    return archive[()]


# Only the label and the three version columns are read; every column arrives
# as 'category' and the label is then narrowed to int8.
load = ['HasDetections', 'AvSigVersion', 'Census_OSVersion', 'OsBuildLab']
df_train = pd.read_csv(
    '../input/microsoft-malware-prediction/train.csv',
    usecols=load,
    dtype='category',
)
df_train['HasDetections'] = df_train['HasDetections'].astype('int8')

# AS timestamp: look up each AvSigVersion in the loaded mapping.
datedictAS = _load_npy_object('../input/malware-timestamps/AvSigVersionTimestamps.npy')
df_train['DateAS'] = df_train['AvSigVersion'].map(datedictAS)

# OS timestamp: same lookup for Census_OSVersion.
datedictOS = _load_npy_object('../input/malware-timestamps-2/OSVersionTimestamps.npy')
df_train['DateOS'] = df_train['Census_OSVersion'].map(datedictOS)

# BL timestamp
def convert(x):
    """Parse the timestamp embedded in a dot-separated build-lab string.

    The value is split on '.', and its fifth field is parsed with the
    format '%y%m%d-%H%M'.

    Parameters
    ----------
    x : str
        Build-lab string; anything else (e.g. a NaN float) is treated as
        unparseable.

    Returns
    -------
    datetime.datetime or float
        The parsed timestamp, or ``np.nan`` when ``x`` is not a string, has
        fewer than five dot-separated fields, or the fifth field does not
        match the expected format.
    """
    try:
        return datetime.strptime(x.split('.')[4], '%y%m%d-%H%M')
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only the expected "cannot parse this value" failures become NaN.
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit
        # and hide genuine programming errors.
        return np.nan
df_train['DateBL'] = df_train['OsBuildLab'].map(convert)

# Weekly Google Safe Browsing statistics; 'WeekOf' is parsed from
# 'YYYY-MM-DD' strings into datetime.date objects so it can act as the join key.
data = pd.read_csv('../input/google-safe-browsing-transparency-report-data/data.csv')
data['WeekOf'] = data['WeekOf'].map(lambda x: datetime.strptime(x,'%Y-%m-%d').date())

# Map every AvSigVersion timestamp to a week key. weekday() is 0 for Monday
# and 6 for Sunday, so subtracting (weekday - 6) days moves the timestamp
# forward to the Sunday of its Monday-to-Sunday week.
# NOTE(review): presumably the Google 'WeekOf' dates are Sundays -- confirm.
# datedictAS is the mapping already loaded earlier in this cell; it is reused
# here rather than reading the same .npy file a second time.
weekdictAS = {
    version: (stamp - timedelta(days=-7 + 1 + stamp.weekday())).date()
    for version, stamp in datedictAS.items()
}
df_train['WeekOf'] = df_train['AvSigVersion'].map(weekdictAS)

# Left join keeps every training row; rows without a matching week get NaN
# in the Google columns.
df_train = pd.merge(df_train, data, on='WeekOf', how='left')
print('GOOGLE DATA')
# A bare expression in the middle of a cell is not rendered, so the preview
# is displayed explicitly.
display(data.sample(5))

# Threat listing per AvSigVersion; counting the non-null 'index' values per
# version gives a one-column frame ('ThreatCount') indexed by AvSigVersion.
data2 = pd.read_csv('../input/malware-avsigversion-threats/AvSigversion_Threats.csv')
cv = data2.groupby('AvSigVersion')['index'].count().rename('ThreatCount').to_frame()
df_train = pd.merge(df_train, cv, on='AvSigVersion', how='left')

# Versions with no entry in the threat listing get a count of 0. The result is
# assigned back: calling fillna(inplace=True) on a column selection is chained
# assignment, which warns in recent pandas and does not update the frame
# under Copy-on-Write.
df_train['ThreatCount'] = df_train['ThreatCount'].fillna(0)
print('THREAT DATA')
# A bare expression in the middle of a cell is not rendered, so the preview
# is displayed explicitly.
display(data2.sample(10))

# Remove the derived date columns, the week join key and the raw version
# strings from the training frame.
for column in ('DateAS', 'DateOS', 'DateBL', 'WeekOf',
               'AvSigVersion', 'OsBuildLab', 'Census_OSVersion'):
    del df_train[column]
print('TRAIN DATA')
df_train.sample(5)


GOOGLE DATA
THREAT DATA
TRAIN DATA


Unnamed: 0,HasDetections,Malware sites detected,Phishing sites detected,Malware sites number,Phishing sites number,Attack sites,Compromised sites,Browser warnings,Search warnings,Webmaster response time,Reinfection rate,ThreatCount
4307476,0,6517.0,38832.0,259239.0,853083.0,392.0,6128.0,7541783.0,16170079.0,60.0,12.0,0.0
8603769,1,6777.0,37305.0,267963.0,841865.0,418.0,6363.0,7826785.0,15288182.0,60.0,13.0,3.0
6767507,0,5041.0,29230.0,171790.0,911673.0,153.0,4890.0,7229923.0,11306559.0,73.0,14.0,2.0
8457949,0,5453.0,29086.0,182114.0,904374.0,163.0,5292.0,7525883.0,12147001.0,76.0,13.0,0.0
4159995,0,6372.0,30082.0,392490.0,733295.0,252.0,6122.0,15181236.0,32800683.0,60.0,15.0,7.0


In [22]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation. The fixed random_state makes the
# shuffle (and therefore every downstream metric) reproducible on a re-run.
train, test, train_y, test_y = train_test_split(
    df_train,
    df_train['HasDetections'],
    test_size=0.3,
    random_state=42,
)

In [23]:
# The label must not remain among the model inputs, so it is removed from
# both feature frames.
for frame in (train, test):
    del frame["HasDetections"]

from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training rows only; the same fitted transform
# is then applied to the held-out rows.
scaler = StandardScaler()
train, test = scaler.fit_transform(train), scaler.transform(test)



In [24]:
# Materialise both feature sets as ndarrays, then add a trailing axis of
# length 1: (rows, columns) -> (rows, columns, 1), the 3-D layout matching the
# LSTM's input_shape of (columns, 1).
train, test = np.array(train), np.array(test)
n_features = train.shape[1]
X_train = train.reshape((train.shape[0], n_features, 1))
X_test = test.reshape((test.shape[0], n_features, 1))

In [35]:
# Creating the sequential model: one LSTM layer followed by a single-unit output layer.
model = Sequential()
model.add(LSTM(200, input_shape=(X_train.shape[1],1)))
# The model is trained with binary_crossentropy, which expects a probability.
# sigmoid keeps the output in (0, 1); relu is unbounded above and can emit
# exactly 0, which makes the loss ill-defined.
model.add(Dense(1, activation='sigmoid'))

In [36]:
# Using the Adam optimiser for training the model weights.
# `learning_rate` is the current name of the argument formerly spelled `lr`.
adam = Adam(learning_rate=0.001)
# Keep only the weights with the best validation accuracy. The metric is
# registered below as 'accuracy', so its validation counterpart is logged as
# 'val_accuracy'; monitoring 'val_acc' would never find a value and the
# checkpoint would be skipped.
# NOTE(review): despite the '.pkl' extension this file is written by Keras'
# own model saver, not by pickle.
chk = ModelCheckpoint('best_model.pkl', monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.fit(X_train, train_y, epochs=1, batch_size=128, callbacks=[chk], validation_data=(X_test,test_y))

Train on 6245038 samples, validate on 2676445 samples
Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f7e6d4d9f98>

In [29]:
# Persist the trained network with Keras' own serializer and read it back.
# pickle is not a supported way to serialise Keras models (they can hold
# unpicklable backend/session state), so model.save / load_model is used.
# `saved_model` now holds the path of the file on disk.
saved_model = 'lstm_model.h5'
model.save(saved_model)

# Load the saved model
model_saved = load_model(saved_model)
