# Import Data 

In [1]:
import pandas as pd
import numpy as np
import os
import gc

# LOAD AND FREQUENCY-ENCODE
FE = ['EngineVersion','AppVersion','AvSigVersion','Census_OSVersion']
# LOAD AND ONE-HOT-ENCODE

OHE = [ 'RtpStateBitfield','IsSxsPassiveMode','DefaultBrowsersIdentifier',
        'AVProductStatesIdentifier','AVProductsInstalled', 'AVProductsEnabled',
        'CountryIdentifier', 'CityIdentifier', 
        'GeoNameIdentifier', 'LocaleEnglishNameIdentifier',
        'Processor', 'OsBuild', 'OsSuite',
        'SmartScreen','Census_MDC2FormFactor',
        'Census_OEMNameIdentifier', 
        'Census_ProcessorCoreCount',
        'Census_ProcessorModelIdentifier', 
        'Census_PrimaryDiskTotalCapacity', 'Census_PrimaryDiskTypeName',
        'Census_HasOpticalDiskDrive',
        'Census_TotalPhysicalRAM', 'Census_ChassisTypeName',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_PowerPlatformRoleName', 'Census_InternalBatteryType',
        'Census_InternalBatteryNumberOfCharges',
        'Census_OSEdition', 'Census_OSInstallLanguageIdentifier',
        'Census_GenuineStateName','Census_ActivationChannel',
        'Census_FirmwareManufacturerIdentifier',
        'Census_IsTouchEnabled', 'Census_IsPenCapable',
        'Census_IsAlwaysOnAlwaysConnectedCapable', 'Wdft_IsGamer',
        'Wdft_RegionIdentifier']

def run_import(nrows, offset=0, use_fe=True, use_ohe=True):
    
    # LOAD ALL AS CATEGORIES
    dtypes = {}
    if use_fe:
        for x in FE: dtypes[x] = 'category'
    if use_ohe:
        for x in OHE: dtypes[x] = 'category'
    dtypes['MachineIdentifier'] = 'str'
    dtypes['HasDetections'] = 'int8'
    
    # LOAD CSV FILE
    return pd.read_csv('./data/train.csv', usecols=dtypes.keys(), dtype=dtypes, nrows=nrows, skiprows=[offset])


# Data Preparation

In [2]:
import math

# CHECK FOR NAN
def nan_check(x):
    if isinstance(x,float):
        if math.isnan(x):
            return True
    return False

# FREQUENCY ENCODING
def encode_FE(df,col):
    d = df[col].value_counts(dropna=False)
    n = col+"_FE"
    df[n] = df[col].map(d)/d.max()
    return [n]

# ONE-HOT-ENCODE ALL CATEGORY VALUES THAT COMPRISE MORE THAN
# "FILTER" PERCENT OF TOTAL DATA AND HAS SIGNIFICANCE GREATER THAN "ZVALUE"
def encode_OHE(df, col, filter, zvalue, tar='HasDetections', m=0.5):
    cv = df[col].value_counts(dropna=False)
    cvd = cv.to_dict()
    th = filter * len(df)
    sd = zvalue * 0.5/ math.sqrt(th)
    n = []; d = {}
    for x in cv.index:
        try:
            if cv[x]<th: break
            sd = zvalue * 0.5/ math.sqrt(cv[x])
        except:
            if cvd[x]<th: break
            sd = zvalue * 0.5/ math.sqrt(cvd[x])
        if nan_check(x): r = df[df[col].isna()][tar].mean()
        else: r = df[df[col]==x][tar].mean()
        if abs(r-m)>sd:
            nm = col+'_BE_'+str(x)
            if nan_check(x): df[nm] = (df[col].isna()).astype('int8')
            else: df[nm] = (df[col]==x).astype('int8')
            n.append(nm)
            d[x] = 1
    return [n,d]

In [3]:
def run_encoding(df, use_fe=True, use_ohe=True):
    cols = []
    dd = []

    # ENCODE NEW
    if use_fe:
        for x in FE:
            cols += encode_FE(df,x)
        for x in FE:
            del df[x]
    if use_ohe:
        for x in OHE:
            tmp = encode_OHE(df,x,0.005,5)
            cols += tmp[0]; dd.append(tmp[1])
        for x in OHE:
            del df[x]

    x=gc.collect()
    return cols, dd


In [4]:
import torch 
from sklearn.model_selection import train_test_split

# Data Preparation
def run_preparation(df, test_size=0.2):
    X = df.drop('HasDetections', axis=1).drop('MachineIdentifier', axis=1).values
    y = df['HasDetections'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    X_train = torch.FloatTensor(X_train)
    X_test = torch.FloatTensor(X_test)
    y_train = torch.FloatTensor(y_train)
    y_test = torch.FloatTensor(y_test)
    return X_train, X_test, y_train, y_test

In [5]:
from torch import nn
from torch import optim

# Model Definition
def run_model(input_size):
    hidden_sizes = [100, 100, 1]
    output_size = 1

    model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.Dropout(p=.4),
                      # nn.BatchNorm1d(num_features=hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.Dropout(p=.4),
                      # nn.BatchNorm1d(num_features=hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.Sigmoid())

    loss = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    
    return model, loss, optimizer

In [6]:
# Model Training


def run_training(epochs, model, X_train, y_train, optimizer, loss):
    print("runnig training...")
    epochs = epochs

    for i in range(1, epochs+1):
        y_hat = model(X_train)
        output = loss(y_hat, y_train)

        if i % 2 == 0:
            print(f'Epoch: {i} Loss: {output}')

        optimizer.zero_grad()
        output.backward()
        optimizer.step()

In [7]:
import torch
# Model Evaluation

def run_prediction(X_test):
    preds = []

    with torch.no_grad():
        for val in X_test:
            y_hat = model.forward(val)
            preds.append(y_hat.argmax().item())

    df = pd.DataFrame({'Y': y_test, 'YHat': preds})
    df['Correct'] = [1 if corr == pred else 0 for corr, pred in zip(df['Y'], df['YHat'])]
    df['Correct'].sum() / len(df)

In [8]:
import os
from torch import distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Model distribution
def run(rank, size):
    ROWS=50000
    os.environ['MASTER_ADDR']='192.168.0.4'
    os.environ['MASTER_PORT']='49162'
    dist.init_process_group('gloo', rank=rank, world_size=size)
    df_train = run_import(ROWS, offset=(rank*ROWS)+1, use_ohe=False)
    print(f"Rank: {rank}")
    df_train
    cols, _ = run_encoding(df_train, use_ohe=False)
    X_train, X_test, y_train, y_test = run_preparation(df_train)
    model, loss, optimizer = run_model(len(cols))
    model = DDP(model)
    run_training(10, model, X_train, y_train, optimizer, loss)
    #run_prediction(X_test)

In [None]:
from torch.multiprocessing import Process

size = 2
processes = []
for rank in range(size):
    p = Process(target=run, args=(rank, size))
    p.start()
    processes.append(p)

for p in processes:
    p.join()

Rank: 1
Rank: 0
runnig training...


In [10]:
dist.destroy_process_group()

RuntimeError: Invalid process group specified