# Create Data Set 

This notebook transforms data from 'processesSummary' and 'processesModule' file into a single dataframe.
In the process it creates following features:
`ProcPath` - recursive parent process names leading to the root process.
`ProcPathId` - recursive parent process id's leading to the root process.
`DLL's`- multiple columns in the dataframe contain onehot encoded indicators whether the given process loaded a specific dll.


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
date = '10-07'

In [3]:
process_df = pd.read_csv('perf+extracted/PerfViewData-{}.processesSummary.csv'.format(date), sep=';')
modules_df = pd.read_csv('perf+extracted/PerfViewData-{}.processesModule.csv'.format(date), sep=';')

### Setting Name column as strict string type

In [4]:
process_df = process_df.astype({"Name": str})

In [5]:
process_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 490 entries, 0 to 489
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          490 non-null    object
 1   ID            490 non-null    int64 
 2   Parent_ID     490 non-null    int64 
 3   Bitness       490 non-null    int64 
 4   CPUMsec       490 non-null    int64 
 5   AveProcsUsed  490 non-null    object
 6   DurationMSec  490 non-null    object
 7   StartMSec     490 non-null    object
 8   ExitCode      490 non-null    object
 9   CommandLine   486 non-null    object
dtypes: int64(4), object(6)
memory usage: 38.4+ KB


## Creating Test Paths

In [8]:
process_df['StartMSec'] = process_df['StartMSec'].replace({',':'.'},regex=True).apply(pd.to_numeric,1)
process_df['DurationMSec'] = process_df['DurationMSec'].replace({',':'.'},regex=True).apply(pd.to_numeric,1)
process_df['EndMSec'] = process_df['StartMSec'] + process_df['DurationMSec']

In [9]:
def procPath(parentId, name, start, procTree):
    ids = set()
    path = '/' + name
    finished = False
    
    try:
        while finished == False:
            probableParents = procTree[parentId]
            found = False
            ids.add(parentId)
            for parent in probableParents:
                if start > parent['Start'] and start < parent['End']:   
                    path = '/' + parent['Name'] + path
                    parentId = parent['Parent']
                    start = parent['Start'] 
                    found = True
                    if parentId in ids:
                        finished = True
                    break
            if not found:
                break
    except KeyError as ex:
        pass
    return path
tree = {}
for x, y, z, s, e in process_df[['ID', 'Parent_ID', 'Name', 'StartMSec', 'EndMSec']].itertuples(index=False): 
    if x in tree:
        tree[x].append({'Parent': y, 'Name': z, 'Start': s, 'End': e})
    else:
        tree[x] = [{'Parent': y, 'Name': z, 'Start': s, 'End': e}]
        
def procPathId(parentId, idValue, start, procTree):
    ids = set()
    path = '/' + str(idValue)
    finished = False
    
    try:
        while finished == False:
            probableParents = procTree[parentId]
            found = False
            ids.add(parentId)
            for parent in probableParents:
                if start > parent['Start'] and start < parent['End']:   
                    path = '/' + str(parentId) + path
                    parentId = parent['Parent']
                    start = parent['Start'] 
                    found = True
                    if parentId in ids:
                        finished = True
                    break
            if not found:
                break
    except KeyError as ex:
        pass
    return path
tree = {}
for x, y, z, s, e in process_df[['ID', 'Parent_ID', 'Name', 'StartMSec', 'EndMSec']].itertuples(index=False): 
    if x in tree:
        tree[x].append({'Parent': y, 'Name': z, 'Start': s, 'End': e})
    else:
        tree[x] = [{'Parent': y, 'Name': z, 'Start': s, 'End': e}]

In [10]:
process_df['Path'] = [procPath(x,y, z, tree) for x, y, z in process_df[['Parent_ID', 'Name', 'StartMSec']].itertuples(index=False)]
process_df['PathId'] = [procPathId(x,y, z, tree) for x, y, z in process_df[['Parent_ID', 'ID', 'StartMSec']].itertuples(index=False)]

## Names with extentions

In [11]:
modules_df['NameExt'] = [x.split('\\')[-1] for x in modules_df['FilePath']]

## OneHot

The first row for each process is droped as analysis sugests that it is either executable of the process it self or simmilar.


In [12]:
onehot_df = pd.DataFrame(columns = modules_df['Name'].value_counts().keys())
onehot_df['ID'] = process_df['ID']
onehot_df['Name'] = process_df['Name']
onehot_df = onehot_df.fillna(0)
skips = set(process_df[(process_df['Name'] == 'Registry') | (process_df['Name'] == 'MemCompression') | (process_df['Name'] == 'nan')].index)
# onehot_df = onehot_df[(onehot_df['Name'] != 'Registry') & (onehot_df['Name'] != 'MemCompression') & (onehot_df['Name'] != 'nan')]

i = -1
no_id = 0
cur_proc_name = ''
for row in modules_df.iterrows():
    if cur_proc_name != row[1][0] or no_id != row[1][1]:
        i+=1
        while i in skips:
            i+=1
        cur_proc_name = row[1][0]
        no_id = row[1][1]
#         print(i, cur_proc_name)
    if row[1][2].lower() != cur_proc_name.lower():
        onehot_df.loc[i, row[1][2]] = 1
#         print('\t',row[1][2])
onehot_df = onehot_df.fillna(0)
print(onehot_df.shape)
onehot_df = onehot_df.loc[:, onehot_df.sum(axis=0)!= 0]
print(onehot_df.shape)

(490, 1379)
(490, 1307)


In [13]:
# temp_proces_df = process_df[(process_df['Name'] != 'Registry') & (process_df['Name'] != 'MemCompression') & (process_df['Name'] != 'nan')]
onehot_df['StartMSec'] = process_df['StartMSec']
onehot_df['DurationMSec'] = process_df['DurationMSec']
onehot_df['EndMSec'] = process_df['EndMSec']
onehot_df['Path'] = process_df['Path']
onehot_df['PathId'] = process_df['PathId']
onehot_df['CommandLine'] = process_df['CommandLine']

## Save

In [14]:
onehot_df.to_csv('processed_data/onehot_{}.csv'.format(date), index=False)

## Validate

In [15]:
only_expl =  onehot_df[onehot_df['Name'] == 'explorer'].drop(['ID', 'Name', 'Path', 'CommandLine', 'StartMSec', 'DurationMSec', 'EndMSec'], axis=1)
only_expl.loc[:, (only_expl != 0).any(axis=0)]

Unnamed: 0,combase,rpcrt4,ucrtbase,msvcrt,ntdll,kernel32,kernelbase,msvcp_win,sechost,advapi32,...,windows.internal.ui.shell.windowtabmanager,shdocvw,duser,prnfldr,windows.data.activities,virtualmonitormanager,wscinterop,wpdshserviceobj,explorerframe,PathId
485,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,/3736


In [16]:
arr = ['puiobj','prnntfy','themeui','windows.internal.graphics.display.displaycolormanagement','cdprt','cdprt','cdprt','windows.internal.graphics.display.displaycolormanagement','windows.cloudstore.schema.desktopshell','cdprt','cdprt','cdprt','cdprt','uiribbon','cdprt','tiptsf','ieproxy','wpdshext','cdprt','wpdshext','framedynos','msiso','ieframe','pcacli','wpdshext','wpdshext','playtodevice','smartscreenps','taskflowui','tilecontrol','windowsinternal.composableshell.experiences.switcher','bthprops.cpl','imapi2','synccenter','srchadmin','pnidui','cscobj','mlang','settingmonitor','portabledeviceapi','dxp','actioncenter','inputswitch','stobject','icu','icu','icu','icu','icu','icu','icu','icu','icu','icu','icu','uiautomationcore','windows.media.mediacontrol','windows.media.mediacontrol','windows.media.mediacontrol','wpnclient','windows.services.targetedcontent','windows.internal.shell.broker','gdiplus','windows.ui.core.textinput','filesyncshell64','windows.internal.signals','dictationmanager','windowsudk.shellcommon','windows.internal.ui.shell.windowtabmanager','windows.shell.bluelightreduction','windows.fileexplorer.common','abovelockapphost','holographicextensions','applicationframe','ntshrui','twinui','portabledevicetypes','windows.immersiveshell.serviceprovider','twinui.pcshell','thumbcache','explorerframe','wpdshserviceobj','windows.storage.search','dataexchange','syncreg','virtualmonitormanager','cldapi','oleacc','settingsynccore','starttiledata','appextension','sndvolsso','winmm','cflapi','shellcommoncommonproxystub','notificationcontrollerps','windows.cloudstore.schema.shell','windows.cloudstore.schema.shell','ondemandconnroutehelper','appresolver','capauthz','wlidprov','windows.staterepositoryclient','windows.security.authentication.web.core','windows.applicationmodel','tiledatarepository','shdocvw','execmodelproxy','execmodelproxy','container','daxexec','audioses','uiamanager','twinui.appcore','windows.system.launcher','windows.system.launcher','appcontracts','atlthunk','windows.shell.servicehostbuilder','windows.shell.servicehostbuilder','edputil','capabilityaccessmanagerclient','capabilityaccessmanagerclient','capabilityaccessmanagerclient','twinapi','puiapi','printui','rdpendp','settingsync','windows.cloudstore','npsm','cryptngc','provsvc','provsvc','provsvc','bthavctpsvc','npsmdesktopprovider','dlnashext','cryptnet','windows.web','mssprxy','tquery','windows.staterepositoryps','winbrand','ondemandbrokerclient','windows.media.devices','staterepository.core','windows.staterepository','storageusage','wpnapps','wpnapps','wpnapps','mpr','ncryptsslp','windows.staterepositorybroker','hcproviders','werconcpl','wscui.cpl','wscinterop','windows.networking.connectivity','cdp','wscapi','onecorecommonproxystub','ethernetmediamanager','networkuxbroker','comppkgsup','cdprt','cdprt','nlmproxy','windows.ui.fileexplorer','duser','dui70','davclnt','ntlanman','windows.ui.appdefaults','provsvc','ehstorapi','aepic','windows.data.activities','taskflowdataengine','cscui','windows.ui.shell','prnfldr','devdispitemprovider','mswb7','photometadatahandler','structuredquery','windows.devices.enumeration','wininet','provsvc','mskeyprotect','textshaping','windows.globalization','windows.globalization','windows.globalization','windows.globalization','windows.globalization','windows.globalization','windows.globalization','windows.globalization','windows.globalization','windows.globalization','windows.globalization','comctl32','windows.ui.immersive','windows.ui.xaml','languageoverlayutil','inputhost','textinputframework','windows.ui','wincorlib','batmeter','linkinfo','idstore','windows.staterepositorycore','mrmcorer','dusmapi','workfoldersshell','searchfolder','wpprecorderum','networkexplorer','davhlpr','drprov','iconcodecservice','pcshellcommonproxystub','bcp47mrm','ehstorshell','bcp47langs','npmproxy','pdh','dwrite','rasadhlp','rtworkq','mfplat','iertutil','urlmon','cscapi','winhttp','mobilenetworking','dxcore','d3d10warp','fwpuclnt','wlanapi','version','dhcpcsvc','dhcpcsvc6','actxprxy','mmdevapi','winnsi','winspool.drv','secur32','sfc_os','fltlib','virtdisk','netapi32','srvcli','samcli','policymanager','es','wevtapi','onecoreuapcommonproxystub','coloradapterclient','mscms','uianimation','windowscodecs','twinapi.appcore','windowmanagementapi','ninput','appxdeploymentclient','usermgrcli','netprofm','wmiclnt','taskschd','taskschd','xmllite','propsys','samlib','wtsapi32','dsreg','d3d11','d2d1','wintypes','dcomp','coreuicomponents','coremessaging','apphelp','uxtheme','resourcepolicyclient','wer','rmclient','dwmapi','kernel.appcore','windows.storage','dxgi','tdh','sppc','slc','ntmarta','schannel','rsaenh','wkscli','msvcp110_win','iphlpapi','netutils','dnsapi','mswsock','cryptdll','cryptsp','cryptbase','winsta','umpdc','powrprof','wldp','ntasn1','ncrypt','msasn1','devobj','dpapi','sxs','userenv','sspicli','profapi','ucrtbase','wintrust','gdi32full','bcrypt','crypt32','cfgmgr32','kernelbase','win32u','bcryptprimitives','msvcp_win','nsi','shlwapi','sechost','shcore','kernel32','setupapi','combase','msvcrt','ws2_32','msctf','ole32','gdi32','clbcatq','rpcrt4','shell32','oleaut32','coml2','user32','imm32','advapi32','ntdll']

### Comparing the number of dlls in created dataset to the known dlls of the explorer process

In [17]:
if len(np.unique(arr, return_index=False)) == len(set(arr)):
    print("Validation passed")
else:
    print("Validation failed")

Validation passed


### Checking if any process didn't load all dlls

In [29]:
temp_oh_df = onehot_df.drop(['CommandLine', 'Path', 'PathId', 'EndMSec', 'DurationMSec', 'StartMSec', 'Name', 'ID'], axis=1)
curr_name = ''
curr_id = -1
i = -1
j = 0
curr_set = set()

for x, y, z in modules_df[['ProcessName', 'Name', 'ProcessID']].itertuples(index=False):
    
    if curr_name != x  or curr_id != z:
        if i >= 0:
            if temp_oh_df.loc[i, :].sum(axis=0) != len(curr_set): 
                print('For', i, curr_name, curr_id, 'SUM', temp_oh_df.loc[i, :].sum(axis=0), 'not equal', len(curr_set))
                print(curr_set)
        i+=1
        while onehot_df.loc[i,'Name'] == 'MemCompression' or onehot_df.loc[i,'Name'] == 'Registry':
            i+= 1
        j = 0 
        curr_name = x
        curr_id = z
        curr_set = set()
    if onehot_df.loc[i,'Name'] !=  curr_name:
        print('Name not alligned')
        print(curr_name, curr_id, i, onehot_df.loc[i,'Name'])
        break
    if y.lower() != x.lower():
        j += 1
        curr_set.add(y)
        if temp_oh_df.loc[i, y] != 1:
            print('1 not set')
            print(curr_name, curr_id, i, y)          
    