# Import Data 

In [36]:
import pandas as pd
import numpy as np
import os
import gc

# LOAD AND FREQUENCY-ENCODE
FE = ['EngineVersion','AppVersion','AvSigVersion','Census_OSVersion']
# LOAD AND ONE-HOT-ENCODE

OHE = [ 'RtpStateBitfield','IsSxsPassiveMode','DefaultBrowsersIdentifier',
        'AVProductStatesIdentifier','AVProductsInstalled', 'AVProductsEnabled',
        'CountryIdentifier', 'CityIdentifier', 
        'GeoNameIdentifier', 'LocaleEnglishNameIdentifier',
        'Processor', 'OsBuild', 'OsSuite',
        'SmartScreen','Census_MDC2FormFactor',
        'Census_OEMNameIdentifier', 
        'Census_ProcessorCoreCount',
        'Census_ProcessorModelIdentifier', 
        'Census_PrimaryDiskTotalCapacity', 'Census_PrimaryDiskTypeName',
        'Census_HasOpticalDiskDrive',
        'Census_TotalPhysicalRAM', 'Census_ChassisTypeName',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_PowerPlatformRoleName', 'Census_InternalBatteryType',
        'Census_InternalBatteryNumberOfCharges',
        'Census_OSEdition', 'Census_OSInstallLanguageIdentifier',
        'Census_GenuineStateName','Census_ActivationChannel',
        'Census_FirmwareManufacturerIdentifier',
        'Census_IsTouchEnabled', 'Census_IsPenCapable',
        'Census_IsAlwaysOnAlwaysConnectedCapable', 'Wdft_IsGamer',
        'Wdft_RegionIdentifier']

# LOAD ALL AS CATEGORIES
dtypes = {}
for x in FE+OHE: dtypes[x] = 'category'
dtypes['MachineIdentifier'] = 'str'
dtypes['HasDetections'] = 'int8'

# LOAD CSV FILE
df_train = pd.read_csv('./data/train.csv', usecols=dtypes.keys(), dtype=dtypes, nrows=1000)

df_train

Unnamed: 0,MachineIdentifier,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,...,Census_OSInstallLanguageIdentifier,Census_GenuineStateName,Census_ActivationChannel,Census_FirmwareManufacturerIdentifier,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0000028988387b115f69f31a3bf04f09,1.1.15100.1,4.18.1807.18075,1.273.1735.0,7,0,,53447,1,1,...,26,IS_GENUINE,Retail,628,0,0,0,0,10,0
1,000007535c3f730efa9ea0b7ef1bd645,1.1.14600.4,4.13.17134.1,1.263.48.0,7,0,,53447,1,1,...,8,OFFLINE,Retail,628,0,0,0,0,8,0
2,000007905a28d863f6d0d597892cd692,1.1.15100.1,4.18.1807.18075,1.273.1341.0,7,0,,53447,1,1,...,7,IS_GENUINE,OEM:NONSLP,142,0,0,0,0,3,0
3,00000b11598a75ea8ba1beea8459149f,1.1.15100.1,4.18.1807.18075,1.273.1527.0,7,0,,53447,1,1,...,17,IS_GENUINE,OEM:NONSLP,355,0,0,0,0,3,1
4,000014a5f00daa18e76b81417eeb99fc,1.1.15100.1,4.18.1807.18075,1.273.1379.0,7,0,,53447,1,1,...,8,IS_GENUINE,Retail,355,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,00074c9f018c3e3d470115c70249c5da,1.1.15100.1,4.18.1807.18075,1.273.920.0,7,0,,47238,2,1,...,8,IS_GENUINE,OEM:DM,554,1,1,0,0,10,0
996,00074d23b53a581a23006c0e0ae0dabc,1.1.15200.1,4.18.1807.18075,1.275.497.0,7,0,,47238,2,1,...,8,IS_GENUINE,OEM:DM,554,0,0,0,0,11,0
997,00074e1776d7a11b01583b92ce9a22d1,1.1.15200.1,4.18.1807.18075,1.275.568.0,7,0,,53447,1,1,...,39,IS_GENUINE,Retail,500,0,0,0,0,7,1
998,000757f5602ee68e3419d3bfc4be808a,1.1.15000.2,4.12.17007.18022,1.271.105.0,7,0,,53447,1,1,...,33,IS_GENUINE,Retail,127,0,1,0,0,15,0


# Data Preparation

In [33]:
import math

# CHECK FOR NAN
def nan_check(x):
    if isinstance(x,float):
        if math.isnan(x):
            return True
    return False

# FREQUENCY ENCODING
def encode_FE(df,col):
    d = df[col].value_counts(dropna=False)
    n = col+"_FE"
    df[n] = df[col].map(d)/d.max()
    return [n]

# ONE-HOT-ENCODE ALL CATEGORY VALUES THAT COMPRISE MORE THAN
# "FILTER" PERCENT OF TOTAL DATA AND HAS SIGNIFICANCE GREATER THAN "ZVALUE"
def encode_OHE(df, col, filter, zvalue, tar='HasDetections', m=0.5):
    cv = df[col].value_counts(dropna=False)
    cvd = cv.to_dict()
    th = filter * len(df)
    sd = zvalue * 0.5/ math.sqrt(th)
    n = []; d = {}
    for x in cv.index:
        try:
            if cv[x]<th: break
            sd = zvalue * 0.5/ math.sqrt(cv[x])
        except:
            if cvd[x]<th: break
            sd = zvalue * 0.5/ math.sqrt(cvd[x])
        if nan_check(x): r = df[df[col].isna()][tar].mean()
        else: r = df[df[col]==x][tar].mean()
        if abs(r-m)>sd:
            nm = col+'_BE_'+str(x)
            if nan_check(x): df[nm] = (df[col].isna()).astype('int8')
            else: df[nm] = (df[col]==x).astype('int8')
            n.append(nm)
            d[x] = 1
    return [n,d]

In [37]:
cols = []
dd = []

# ENCODE NEW
for x in FE:
    cols += encode_FE(df_train,x)
for x in OHE:
    tmp = encode_OHE(df_train,x,0.005,5)
    cols += tmp[0]; dd.append(tmp[1])

for x in FE+OHE:
    del df_train[x]

    
x=gc.collect()
df_train


Unnamed: 0,MachineIdentifier,HasDetections,EngineVersion_FE,AppVersion_FE,AvSigVersion_FE,Census_OSVersion_FE,SmartScreen_BE_ExistsNotSet
0,0000028988387b115f69f31a3bf04f09,0,1.000000,1.000000,0.166667,0.691824,0
1,000007535c3f730efa9ea0b7ef1bd645,0,0.043779,0.052817,0.500000,0.075472,0
2,000007905a28d863f6d0d597892cd692,0,1.000000,1.000000,0.111111,0.691824,0
3,00000b11598a75ea8ba1beea8459149f,1,1.000000,1.000000,0.444444,1.000000,1
4,000014a5f00daa18e76b81417eeb99fc,1,1.000000,1.000000,0.555556,0.207547,0
...,...,...,...,...,...,...,...
995,00074c9f018c3e3d470115c70249c5da,0,1.000000,1.000000,0.166667,0.044025,0
996,00074d23b53a581a23006c0e0ae0dabc,0,0.953917,1.000000,0.111111,0.283019,0
997,00074e1776d7a11b01583b92ce9a22d1,1,0.953917,1.000000,0.111111,0.220126,0
998,000757f5602ee68e3419d3bfc4be808a,0,0.069124,0.015845,0.055556,0.100629,0


In [43]:
import torch 
from sklearn.model_selection import train_test_split

# Data Preparation
X = df_train.drop('HasDetections', axis=1).drop('MachineIdentifier', axis=1).values
y = df_train['HasDetections'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
y_train = torch.FloatTensor(y_train)
y_test = torch.FloatTensor(y_test)

In [44]:
from torch import nn
from torch import optim

# Model Definition

input_size = len(cols)
hidden_sizes = [100, 100, 1]
output_size = 1

model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.Dropout(p=.4),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.Dropout(p=.4),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.Sigmoid())

loss = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [48]:
# Model Training

epochs = 10

for i in range(epochs):
    y_hat = model.forward(X_train)
    output = loss(y_hat, y_train)

    if i % 2 == 0:
        print(f'Epoch: {i} Loss: {output}')

    optimizer.zero_grad()
    output.backward()
    optimizer.step()

Epoch: 0 Loss: 0.250324547290802
Epoch: 2 Loss: 0.25031915307044983
Epoch: 4 Loss: 0.25035566091537476
Epoch: 6 Loss: 0.2503216862678528
Epoch: 8 Loss: 0.2503221929073334


  return F.mse_loss(input, target, reduction=self.reduction)


In [49]:
import torch
# Model Evaluation

preds = []

with torch.no_grad():
    for val in X_test:
        y_hat = model.forward(val)
        preds.append(y_hat.argmax().item())

df = pd.DataFrame({'Y': y_test, 'YHat': preds})
df['Correct'] = [1 if corr == pred else 0 for corr, pred in zip(df['Y'], df['YHat'])]
df['Correct'].sum() / len(df)


0.49