# Generating Adjacency and Feature Matrices for Juliet Dataset

Notebook to generate adjacency matrices of our scripts in the juliet dataset to be used as input for our neural network model. We take the AST trees from the python clang bindings and then create ajacency matrices for the edges and feature matrices for the code properties.

# Import and Preprocess Dataset

In [2]:
import ast
import pickle
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx
from preprocess_code import *



In [47]:
data = pd.read_csv("../data/buffer_overflow_data.csv.gz")

In [7]:
with open("../data/adj.pickle",'rb') as f:
    adj = pickle.load(f)

In [48]:
labels = data.copy()
del labels['Unnamed: 0']
del labels['Unnamed: 0.1']
del labels['filename']
del labels['code']
del labels['flaw']
del labels['flaw_loc']
labels = labels.drop_duplicates().sort_values('testcase_ID').reset_index(drop=True)

In [49]:
adj = adj.rename(columns={0: 'testcase_ID', 1: 'matrix'})

In [50]:
adj_df = pd.merge(labels, adj, on='testcase_ID')

In [51]:
adj_df = adj_df[['testcase_ID', 'matrix', 'bug']]

In [52]:
adj_df['matrix_size'] = adj_df.matrix.apply(lambda x: x.shape[0])

In [53]:
matrix_size = 614
adj_df = adj_df[adj_df['matrix_size'] <= matrix_size]

In [54]:
dat = data[data.set_index(['testcase_ID']).index.isin(adj_df.set_index(['testcase_ID']).index)]

In [31]:
np.random.seed(1248)

## Adjacency Matrices

In [62]:
def generate_edge_list1(testcase, **kwargs):
    """
    Takes in a list of files/datapoints from juliet.csv.zip 
    or (as loaded with pandas) matching one particular testcase, 
    and returns an edge list of its graph representation.
    """
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    primary = find_primary_source_file(testcase)

    # Parse the source code with clang, and get out an ast:
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=primary.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    # Memoise/concretise the ast so that we can consistently
    # modify it, then number each node in the tree uniquely.
    concretise_ast(ast_root)
    number_ast_nodes(ast_root)

    # Next, construct an edge list for the graph2vec input:
    edgelist = generate_edgelist(ast_root)
    
    edgelist_representation = {
        "edges": edgelist,
    }

    # Explicitly delete clang objects
    del translation_unit
    del ast_root
    del index

    return json.dumps(edgelist_representation)

In [None]:
dask_data = dd.from_pandas(data, npartitions=20)

generate the graphs for all the testcases in the dataset 

graphs = data.groupby(['testcase_ID']).apply(
        generate_edge_list1,
        axis='columns',
        meta=('generate_edge_list', 'unicode'),
    )

In [None]:
def gen_adj_matrix1(testcase):
    
    """
    Takes in a list of files/datapoints from buffer_overflow_data.csv.gz 
    matching one particular testcase, and generates an adjacency matrix 
    from the edgelist created.
    """
    
    # extracting the list of edges 

    x = testcase.split('edges": ')
    x = x[1].split('}')
    x = ast.literal_eval(x[0])
    
#     return x

    # generating the matrix
    
    G = nx.Graph()

    G.add_edges_from(x)

    A = nx.adjacency_matrix(G)

    B = A.todense()

    return B

In [16]:
# create a dataframe containing the testcase ID and its adjacency matrix 
adjacency_df = pd.DataFrame()

In [17]:
adjacency_df['testcase_ID'] = data.testcase_ID.drop_duplicates()

In [18]:
# kernel dies when there are more than 200 datapoints

adj_matrices = graphs.apply(gen_adj_matrix1, meta = ('generate_adj_matrices', 'O'))
adj_matrices = graphs.apply(gen_adj_matrix1)

NameError: name 'graphs' is not defined

In [19]:
adj_matrices = pd.DataFrame(adj_matrices)
adj_matrices = adj_matrices.to_frame()

NameError: name 'adj_matrices' is not defined

In [20]:
adj_matrices = adj_matrices.compute()
adj_matrices = adj_matrices.reset_index(level='testcase_ID')

NameError: name 'adj_matrices' is not defined

In [12]:
adjacency_df['adj_matrix'] = adj_matrices[0]

In [13]:
adj_df = adjacency_df.dropna()

In [14]:
adj_df.to_csv("../data/adj_df.csv.gz")

## Feature Matrices

In [55]:
dat = dd.from_pandas(dat, npartitions=20)
ast_roots = dat.groupby(['testcase_ID']).apply(generate_ast_roots,axis='columns',
        meta=('generate_ast_roots', 'unicode'),)

Manually extracting the node properties relating to 'alloc', 'sizeOf' and 'writeToPointer' from clang generated ASTs.

In [56]:
alloc_list = ['__builtin_alloca', 
              '__alloc', 
              'malloc', 
              'valloc', 
              '__alloc_on_copy', 
              '__alloc_on_move', 
              'calloc', 
              'realloc', 
              'alloca',
              'ALLOCA'
             ]

sizeOf_list = ['std::aligned_storage<sizeof(_Tp), __alignof(_Tp)>'
              ]

writeToPointer_list = ['__builtin_memmove', 
                       '__builtin_memcpy', 
                       'wmempcpy', 
                       'wmemmove'
                      ]

In [57]:
def generate_features_matrix_test(ast_root):
    """
    Given a concretised & numbered clang ast, returns a matrix of one hot encoded features of node names kind and 
    whether it's alloc/writeToPointer/sizeOf/other, i.e. our feature matrix
    """
    index = []
    kind = {}
    spelling = {}
    

    def walk_tree_and_set_properties(node):
        out_degree = len(node.children)
        in_degree = 1
        degree = out_degree + in_degree
        
        index.append(node.identifier)
        
        kind[node.identifier] = node.kind
        spelling[node.identifier] = node.spelling
        
        if str(node.spelling) in writeToPointer_list:
            spelling[node.identifier] = 'WriteToPointer'
        
        elif str(node.spelling) in sizeOf_list:
            spelling[node.identifier] = 'SizeOf'
            
        elif str(node.spelling) in alloc_list:
            spelling[node.identifier] = 'Alloca'
        
        else:
            spelling[node.identifier] = ''
        

        for child in node.children:
            walk_tree_and_set_properties(child)

    walk_tree_and_set_properties(ast_root)
    
#     return index
    
    d = {'Identifier': index, 'kind': list(str(kind.values()), 'spelling': list(spelling.values())}
        
    ast_df = pd.DataFrame(data = d)
    return ast_df

Generating a dictionary of node features to be later used as column names in the feature matrices.

In [58]:
ast_list = []
testcase_list = []
ast_bag = {}

for index, row in ast_roots.iteritems():
    print(index)
    # colnames
    eg = generate_features_matrix_test(row)
    ast_bag[index] = eg

-234271
-234259
-234243
-234233
-234213
-234132
-234110
-234101
-234084
-234063
-234029
-234025
-234024
-234023
-234018
-233998
-233903
-233878
-233873
-233860
-233856
-233800
-233773
-233755
-233712
-233667
-233652
-233632
-233555
-233493
-233425
-233410
-233408
-233386
-233364
-233361
-233348
-233325
-233303
-233281
-233219
-233171
-233162
-233157
-233119
-233081
-233007
-232993
-232964
-232959
-232951
-232930
-232922
-232899
-232867
-232860
-232842
-232824
-232763
-232741
-232734
-232709
-232675
-232666
-232660
-232602
-232579
-232566
-232560
-232496
-232435
-232421
-232310
-232295
-232270
-232217
-232096
-232094
-232082
-232009
-231989
-231984
-231972
-231971
-231967
-231920
-231911
-231858
-231833
-231806
-231784
-231764
-231758
-231752
-231685
-231632
-231626
-231625
-231612
-231590
-231583
-231529
-231498
-231494
-231487
-231416
-73265
-73215
-72883
-72838
-72832
-72741
-72739
-72675
-72596
-72595
-72579
-72550
-72350
-71927
-71680
-71679
-71486
-71439
-71437
-71299
-71206
-7100

63401
63421
63442
63512
63523
63537
63563
63567
63579
63596
63624
63626
63643
63684
63691
63698
63718
63719
63728
63796
63819
63836
63847
63938
63974
64011
64024
64040
64049
64050
64053
64064
64154
64161
64177
64220
64334
64337
64362
64404
64440
64470
64475
64495
64500
64517
64567
64573
64577
64600
64614
64616
64626
64640
64645
64646
64650
64656
64664
64696
64830
65047
65059
65079
65084
65085
65091
65116
65175
65198
65222
65284
65294
65301
65320
65339
65359
65368
65400
65435
65515
65566
65575
65581
65589
65669
65713
65722
65753
65761
65762
65766
65783
65808
65837
65864
65906
65914
65917
65925
65927
65935
65999
66002
66004
66040
66048
66101
66187
66212
66241
66261
66262
66297
66361
66396
66412
66431
66474
66508
66524
66560
66578
66599
66665
66708
66737
66752
66758
66787
66811
66829
66855
66894
66898
66919
66960
66961
66968
66975
66976
67033
67043
67064
67069
67086
67091
67119
67137
67164
67166
67167
67187
67193
67218
67223
67228
67257
67307
67320
67323
67334
67367
67376
67389
67398
6742

-232643
-232617
-232603
-232576
-232572
-232552
-232546
-232534
-232528
-232525
-232508
-232500
-232467
-232463
-232461
-232459
-232454
-232443
-232417
-232402
-232294
-232249
-232233
-232214
-232073
-232031
-232022
-232020
-232019
-232013
-231900
-231875
-231859
-231808
-231776
-231762
-231727
-231715
-231713
-231690
-231677
-231676
-231673
-231621
-231608
-231599
-231584
-231526
-231503
-231500
-73354
-73213
-73176
-73173
-73023
-73022
-72931
-72695
-72686
-72544
-72493
-72402
-72353
-72303
-72118
-72110
-72072
-72023
-72015
-71975
-71871
-71869
-71582
-71536
-71496
-71341
-71215
-71214
-71109
-71022
-71016
-71015
-70920
-70823
-70776
-70775
-70732
-70543
-70333
-70219
-70210
-70196
-70193
-70150
-70093
-70028
-70004
-70002
-69997
-69953
-69903
-69902
-69816
-69814
-69714
-69551
-69487
-69480
-69475
-69471
-69333
-69189
-69091
-68944
-68942
-68916
-68864
-68830
-68804
-68802
-68795
-68781
-68739
-68729
-68724
-68685
-68663
-68645
-68635
-68616
-68613
-68603
-68564
-68562
-68431
-6843

63821
63834
63888
63921
63928
63958
63962
64008
64009
64018
64022
64026
64059
64072
64080
64083
64093
64101
64103
64114
64134
64136
64164
64182
64260
64343
64344
64350
64352
64371
64374
64375
64412
64439
64441
64448
64490
64493
64523
64546
64551
64555
64615
64618
64625
64632
64636
64676
64712
64734
64735
64760
64776
64780
65030
65037
65041
65042
65053
65087
65135
65184
65197
65200
65210
65213
65214
65328
65341
65344
65441
65532
65539
65540
65543
65594
65596
65607
65615
65647
65692
65693
65734
65743
65768
65778
65810
65846
65847
65851
65852
65901
65952
65961
65992
65997
66013
66027
66057
66098
66136
66164
66176
66186
66201
66239
66252
66296
66382
66386
66387
66414
66425
66484
66547
66635
66664
66670
66674
66683
66704
66722
66734
66809
66853
66913
66936
66966
66970
66972
66984
66989
67003
67092
67111
67134
67141
67173
67182
67232
67308
67309
67339
67439
67573
67578
67603
67648
67649
67660
67662
67668
67804
67834
67937
67950
68088
68431
68511
68587
68595
68598
68675
68676
68734
68741
6874

-232648
-232636
-232620
-232591
-232558
-232544
-232507
-232486
-232447
-232422
-232396
-232329
-232325
-232323
-232285
-232281
-232275
-232260
-232241
-232239
-232237
-232228
-232139
-232107
-232104
-232099
-232086
-232032
-232014
-231985
-231935
-231905
-231879
-231857
-231853
-231840
-231797
-231790
-231789
-231773
-231750
-231746
-231739
-231732
-231711
-231698
-231694
-231688
-231618
-231575
-231548
-231524
-231519
-231507
-231491
-231484
-231438
-73257
-73117
-73030
-72926
-72735
-72694
-72693
-72638
-72400
-72354
-72263
-72120
-72079
-72022
-72021
-72013
-71875
-71779
-71733
-71728
-71695
-71678
-71631
-71535
-71502
-71397
-71392
-71350
-71342
-71295
-71293
-71118
-70926
-70871
-70864
-70768
-70387
-70383
-70381
-70052
-70033
-70006
-69998
-69978
-69768
-69576
-69542
-69539
-69522
-69498
-69440
-69423
-69393
-69390
-69332
-69326
-69247
-69182
-69103
-68963
-68940
-68921
-68908
-68899
-68870
-68834
-68800
-68789
-68765
-68730
-68703
-68698
-68693
-68678
-68673
-68653
-68651
-6862

65931
65979
66024
66045
66052
66064
66073
66096
66138
66144
66150
66189
66251
66318
66376
66422
66457
66477
66497
66528
66530
66574
66595
66677
66689
66706
66728
66745
66753
66756
66760
66846
66856
66881
66882
66886
66948
66978
67007
67035
67053
67096
67123
67125
67138
67177
67183
67188
67198
67220
67263
67303
67305
67313
67314
67357
67361
67369
67406
67605
67637
67643
67677
67680
67697
67710
67778
67840
67848
67992
68035
68174
68177
68191
68462
68469
68510
68517
68616
68632
68633
68661
68670
68719
68745
68765
68779
68852
68855
68873
68899
68925
68949
68964
68992
69037
69229
69235
69246
69380
69486
69518
69524
69567
69582
69630
69811
70062
70111
70239
70765
70917
71071
71156
71198
71207
71251
71343
71406
71445
71492
71589
71679
71686
71694
72063
72070
72167
72254
72259
72260
72406
72450
72500
72542
72599
72686
72829
72835
72935
72943
73026
73030
73077
73125
73215
73216
73266
73362
231439
231492
231502
231503
231545
231572
231577
231586
231596
231638
231724
231725
231757
231767
231812
2

-69354
-69187
-69150
-68980
-68977
-68946
-68930
-68928
-68855
-68832
-68812
-68808
-68797
-68788
-68773
-68745
-68719
-68706
-68684
-68641
-68630
-68623
-68615
-68565
-68512
-68444
-68414
-68328
-68278
-68221
-68175
-68152
-68144
-68128
-68094
-68082
-68027
-67984
-67954
-67929
-67780
-67767
-67677
-67668
-67627
-67625
-67620
-67600
-67595
-67577
-67446
-67434
-67430
-67401
-67323
-67315
-67313
-67216
-67186
-67162
-67133
-67088
-67066
-67062
-67046
-66999
-66975
-66882
-66872
-66857
-66805
-66771
-66753
-66722
-66713
-66693
-66678
-66677
-66666
-66615
-66614
-66585
-66579
-66536
-66526
-66477
-66454
-66451
-66361
-66347
-66327
-66309
-66271
-66264
-66247
-66208
-66134
-66120
-66078
-66016
-66002
-65933
-65924
-65808
-65715
-65711
-65695
-65658
-65646
-65587
-65585
-65561
-65548
-65528
-65523
-65518
-65462
-65457
-65435
-65414
-65410
-65400
-65376
-65362
-65360
-65291
-65143
-65136
-65106
-65097
-65080
-65079
-65057
-65038
-65036
-64991
-64951
-64871
-64870
-64772
-64752
-64733
-64651

71016
71062
71070
71157
71246
71263
71443
71581
71630
71646
71823
71920
72117
72243
72257
72301
72308
72641
72675
72693
72847
73032
73078
73120
231413
231484
231520
231616
231624
231637
231756
231915
231936
232031
232119
232210
232213
232232
232242
232251
232308
232321
232387
232418
232423
232440
232443
232511
232554
232576
232634
232667
232676
232681
232700
232709
232717
232743
232784
232824
232834
232858
232866
232867
232875
232889
232904
232928
232941
232964
232979
232995
233008
233016
233021
233088
233094
233126
233149
233196
233208
233210
233212
233252
233258
233260
233280
233347
233375
233425
233458
233469
233477
233491
233516
233517
233518
233531
233552
233612
233617
233640
233679
233683
233708
233715
233716
233728
233733
233739
233756
233757
233783
233792
233835
233851
233897
233910
233922
233949
233957
233959
233967
233983
233991
234002
234003
234007
234031
234115
234118
234128
234165
234217
234218
234219
234227
-234220
-234215
-234151
-234139
-234087
-234080
-234061
-234046
-

-66583
-66578
-66559
-66531
-66528
-66523
-66462
-66415
-66401
-66399
-66385
-66384
-66381
-66318
-66253
-66245
-66215
-66190
-66153
-66148
-66139
-66124
-66121
-66091
-66076
-66043
-66039
-66013
-65997
-65982
-65978
-65968
-65928
-65906
-65903
-65878
-65869
-65853
-65851
-65837
-65831
-65811
-65768
-65754
-65712
-65677
-65637
-65636
-65632
-65611
-65609
-65569
-65550
-65543
-65451
-65384
-65305
-65299
-65289
-65287
-65225
-65203
-65181
-65168
-65166
-65154
-65152
-65121
-65090
-65058
-64776
-64774
-64756
-64743
-64731
-64719
-64706
-64689
-64683
-64680
-64605
-64592
-64585
-64582
-64572
-64540
-64538
-64475
-64465
-64456
-64440
-64410
-64407
-64352
-64351
-64337
-64332
-64325
-64315
-64186
-64157
-64153
-64096
-64053
-64026
-64020
-64013
-63961
-63960
-63924
-63919
-63916
-63885
-63860
-63845
-63784
-63772
-63761
-63719
-63635
-63626
-63622
-63604
-63602
-63585
-63572
-63569
-63558
-63466
-63458
-63436
-63366
-63354
-63339
-63298
-63290
-63289
-63288
-63266
-63199
-63198
-63184
-63156

232512
232525
232527
232528
232552
232574
232601
232626
232652
232660
232695
232697
232705
232767
232797
232803
232841
232851
232853
232862
232863
232922
232931
232971
232981
233014
233019
233025
233035
233045
233082
233118
233144
233164
233185
233197
233199
233224
233228
233384
233401
233413
233430
233479
233493
233561
233564
233576
233592
233604
233631
233647
233648
233655
233758
233768
233813
233853
233864
233865
233875
233886
233907
233913
233993
234033
234045
234047
234076
234079
234089
234104
234119
234123
234127
234139
234170
234222
234231
-234269
-234251
-234216
-234191
-234172
-234169
-234144
-234092
-234076
-234045
-234033
-234030
-234027
-234009
-233960
-233933
-233931
-233889
-233877
-233812
-233782
-233776
-233775
-233723
-233707
-233704
-233700
-233691
-233683
-233663
-233654
-233649
-233645
-233579
-233576
-233556
-233554
-233549
-233539
-233536
-233531
-233529
-233524
-233486
-233480
-233460
-233453
-233428
-233427
-233385
-233376
-233365
-233359
-233343
-233337
-233288

-64001
-63978
-63958
-63956
-63934
-63903
-63884
-63857
-63853
-63781
-63779
-63770
-63744
-63688
-63666
-63650
-63645
-63623
-63620
-63617
-63592
-63575
-63546
-63513
-63496
-63472
-63440
-63413
-63384
-63370
-63328
-63306
-63284
-63278
-63264
-63259
-63241
-63238
-63235
-63194
-63128
-63123
-63085
-63084
-63023
-63000
-62944
-62862
-62833
-62814
-62764
-62698
-62679
-62673
-62652
-62588
-62584
-62581
-62564
62583
62609
62681
62700
62782
62783
62805
62807
62812
62813
62815
62828
62835
62881
62910
62981
63193
63221
63275
63297
63328
63386
63408
63419
63426
63479
63488
63689
63714
63729
63739
63755
63774
63795
63839
63845
63873
63883
63927
63930
63941
63944
63952
64034
64037
64041
64056
64066
64123
64171
64184
64193
64233
64256
64257
64263
64264
64326
64338
64378
64400
64478
64483
64504
64516
64528
64565
64569
64579
64607
64673
64687
64711
64714
64725
64754
64757
64762
64785
65045
65049
65072
65098
65154
65155
65169
65182
65202
65217
65278
65300
65358
65366
65373
65381
65386
65401
65413

-232840
-232785
-232771
-232768
-232766
-232762
-232760
-232717
-232703
-232701
-232688
-232674
-232640
-232633
-232631
-232612
-232599
-232583
-232582
-232578
-232555
-232484
-232473
-232464
-232431
-232405
-232403
-232313
-232304
-232299
-232254
-232243
-232225
-232148
-232091
-232026
-232016
-231983
-231928
-231922
-231874
-231852
-231788
-231771
-231767
-231761
-231748
-231734
-231722
-231712
-231585
-231577
-231549
-231541
-73313
-73175
-73126
-73125
-73119
-73077
-72928
-72925
-72887
-72884
-72880
-72878
-72846
-72723
-72696
-72543
-72483
-72450
-72446
-72358
-72305
-72301
-72261
-72255
-72063
-71880
-71734
-71688
-71591
-71590
-71588
-71584
-71406
-71400
-71255
-71158
-71151
-71103
-71012
-70912
-70815
-70339
-70290
-70287
-70285
-70237
-70190
-70149
-70096
-70003
-69966
-69678
-69668
-69565
-69524
-69497
-69472
-69452
-69328
-69295
-69135
-69093
-68927
-68886
-68885
-68880
-68805
-68792
-68791
-68772
-68760
-68720
-68709
-68681
-68649
-68627
-68600
-68599
-68590
-68582
-68579
-

63144
63156
63168
63226
63237
63303
63313
63333
63395
63409
63473
63476
63477
63501
63528
63575
63577
63594
63672
63735
63779
63780
63800
63814
63842
63858
63893
63905
63912
63931
63943
63964
63966
63967
63979
63980
63992
64005
64016
64019
64023
64075
64130
64137
64142
64152
64168
64223
64235
64285
64293
64296
64314
64322
64332
64351
64369
64379
64385
64391
64403
64415
64417
64419
64420
64434
64503
64530
64535
64554
64556
64583
64586
64602
64610
64612
64613
64635
64688
64727
64736
64745
64752
64831
64870
65034
65105
65159
65207
65274
65324
65337
65363
65379
65399
65424
65458
65523
65529
65618
65620
65658
65659
65708
65739
65758
65759
65763
65792
65806
65833
65849
65920
65990
65998
66015
66019
66020
66076
66091
66103
66105
66126
66173
66218
66233
66250
66292
66299
66301
66383
66406
66472
66491
66507
66509
66525
66531
66566
66597
66601
66678
66698
66718
66727
66749
66777
66793
66814
66816
66818
66830
66836
66877
66880
66887
66897
66915
66916
66926
66985
66988
66991
67024
67051
67075
6708

In [113]:
import pickle
with open("../data/ast_bag_list.pickle",'wb') as f:
    pickle.dump(ast_bag_list,f)

Load pickle:

In [3]:
import pickle

with open("../data/ast_bag_dic.pickle",'rb') as f:
    ast_bag = pickle.load(f)
with open("../data/ast_bag_list.pickle",'rb') as f:
    ast_bag_list = pickle.load(f)

In [4]:
ast_bag_list = list(ast_bag.values())

In [5]:
ast_list = []
for ast in ast_bag_list:
    ast_list.extend(list(ast.kind))

In [8]:
# convert order into string
order = [str(i) for i in set(ast_list)]
order.extend(['spelling_Alloca','spelling_SizeOf','spelling_WriteToPointer'])

In [9]:
order

['CursorKind.DECL_STMT',
 'CursorKind.CHARACTER_LITERAL',
 'CursorKind.PAREN_EXPR',
 'CursorKind.UNARY_OPERATOR',
 'CursorKind.ARRAY_SUBSCRIPT_EXPR',
 'CursorKind.LABEL_STMT',
 'CursorKind.BINARY_OPERATOR',
 'CursorKind.UNEXPOSED_ATTR',
 'CursorKind.COMPOUND_STMT',
 'CursorKind.CASE_STMT',
 'CursorKind.TRANSLATION_UNIT',
 'CursorKind.DEFAULT_STMT',
 'CursorKind.CSTYLE_CAST_EXPR',
 'CursorKind.IF_STMT',
 'CursorKind.UNEXPOSED_DECL',
 'CursorKind.STRUCT_DECL',
 'CursorKind.SWITCH_STMT',
 'CursorKind.INIT_LIST_EXPR',
 'CursorKind.UNION_DECL',
 'CursorKind.CXX_ACCESS_SPEC_DECL',
 'CursorKind.WHILE_STMT',
 'CursorKind.CLASS_DECL',
 'CursorKind.ASM_LABEL_ATTR',
 'CursorKind.FIELD_DECL',
 'CursorKind.PURE_ATTR',
 'CursorKind.FOR_STMT',
 'CursorKind.TYPE_REF',
 'CursorKind.GNU_NULL_EXPR',
 'CursorKind.GOTO_STMT',
 'CursorKind.CXX_BASE_SPECIFIER',
 'CursorKind.FUNCTION_DECL',
 'CursorKind.VAR_DECL',
 'CursorKind.PARM_DECL',
 'CursorKind.BREAK_STMT',
 'CursorKind.LABEL_REF',
 'CursorKind.RETURN_

One-hot encoding the columns:

In [10]:
def ast_dummy(ast_df):
    '''
    Given a concretised & numbered clang ast, returns a dataframe with one-hot encoded columns of node features
    for each node in the tree.
    '''
    
    ast_df = ast_df.set_index('Identifier')
#     import pdb; pdb.set_trace()
    dum_df = pd.get_dummies(ast_df, prefix=['kind', 'spelling'])

    dum_df = dum_df.drop('spelling_', axis=1)

    for col in dum_df.filter(regex='kind_*').columns:
        dum_df = dum_df.rename(columns = {col: col.replace('kind_', '')})


    #     df_merge_col = pd.merge(dum_df, matrix_df, on='Identifier')
#     set(ast_list)
    for i in order:
        if i not in dum_df.columns:
            dum_df[i]=0
    
    
    return dum_df[order]

Test for ten datapoints:

In [18]:
df = []

for key in list(ast_bag.keys())[0:10]:
    print(key)
#     ast_bag[key] = ast_dummy(ast_bag[key])
    t = ast_dummy(ast_bag[key])
    df.append([key, sparse.csr_matrix(ast_dummy(ast_bag[key]))])

-234271
-234259
-234243
-234233
-234213
-234132
-234110
-234101
-234084
-234063


In [14]:
def sparse_transfer(key):
    return [key, sparse.csr_matrix(ast_dummy(ast_bag[key]))]

Generate a list of matrices for each testcase ID:

In [21]:
feature_matrix = list(map(sparse_transfer,list(ast_bag.keys())))

Store matrices in a dataframe.

In [23]:
matrix = pd.DataFrame(feature_matrix)

In [25]:
with open("../data/feature_matrix.pickle",'wb') as f:
    pickle.dump(matrix, f)

In [144]:
# if we want to check later maybe this code is useful to look what it looks like in the dataframe with columns name
# ast_sparse = {}
# for key in list(ast_bag.keys()):
#     print(key)
#     ast_bag[key] = ast_dummy(ast_bag[key])
#     ast_sparse.update({key: sparse.csr_matrix(ast_bag[key])})

-234271
-234259
-234243
-234233
-234213
-234132
-234110
-234101
-234084
-234063
-234029
-234025
-234024
-234023
-234018
-233998
-233903
-233878
-233873
-233860
-233856
-233800
-233773
-233755
-233712
-233667
-233652
-233632
-233555
-233493
-233425
-233410
-233408
-233386
-233364
-233361
-233348
-233325
-233303
-233281
-233219
-233171
-233162
-233157
-233119
-233081
-233007
-232993
-232964
-232959
-232951
-232930
-232922
-232899
-232867
-232860
-232842
-232824
-232763
-232741
-232734
-232709
-232675
-232666
-232660
-232602
-232579
-232566
-232560
-232496
-232435
-232421
-232310
-232295
-232270
-232217
-232096
-232094
-232082
-232009
-231989
-231984
-231972
-231971
-231967
-231920
-231911
-231858
-231833
-231806
-231784
-231764
-231758
-231752
-231685
-231632
-231626
-231625
-231612
-231590
-231583
-231529
-231498
-231494
-231487
-231416
-73265
-73215
-72883
-72838
-72832
-72741
-72739
-72675
-72596
-72595
-72579
-72550
-72350
-71927
-71680
-71679
-71486
-71439
-71437
-71299
-71206
-7100

63128
63133
63162
63163
63192
63195
63203
63204
63262
63278
63284
63358
63363
63371
63396
63401
63421
63442
63512
63523
63537
63563
63567
63579
63596
63624
63626
63643
63684
63691
63698
63718
63719
63728
63796
63819
63836
63847
63938
63974
64011
64024
64040
64049
64050
64053
64064
64154
64161
64177
64220
64334
64337
64362
64404
64440
64470
64475
64495
64500
64517
64567
64573
64577
64600
64614
64616
64626
64640
64645
64646
64650
64656
64664
64696
64830
65047
65059
65079
65084
65085
65091
65116
65175
65198
65222
65284
65294
65301
65320
65339
65359
65368
65400
65435
65515
65566
65575
65581
65589
65669
65713
65722
65753
65761
65762
65766
65783
65808
65837
65864
65906
65914
65917
65925
65927
65935
65999
66002
66004
66040
66048
66101
66187
66212
66241
66261
66262
66297
66361
66396
66412
66431
66474
66508
66524
66560
66578
66599
66665
66708
66737
66752
66758
66787
66811
66829
66855
66894
66898
66919
66960
66961
66968
66975
66976
67033
67043
67064
67069
67086
67091
67119
67137
67164
67166
6716

-234085
-234039
-233967
-233958
-233912
-233880
-233864
-233789
-233774
-233742
-233639
-233601
-233573
-233525
-233471
-233467
-233416
-233402
-233369
-233362
-233354
-233352
-233336
-233324
-233310
-233251
-233232
-233228
-233206
-233176
-233175
-233159
-233156
-233123
-233104
-233083
-233039
-233024
-233009
-232994
-232991
-232961
-232954
-232925
-232909
-232900
-232893
-232884
-232864
-232856
-232820
-232819
-232815
-232792
-232759
-232754
-232731
-232708
-232693
-232655
-232654
-232651
-232650
-232643
-232617
-232603
-232576
-232572
-232552
-232546
-232534
-232528
-232525
-232508
-232500
-232467
-232463
-232461
-232459
-232454
-232443
-232417
-232402
-232294
-232249
-232233
-232214
-232073
-232031
-232022
-232020
-232019
-232013
-231900
-231875
-231859
-231808
-231776
-231762
-231727
-231715
-231713
-231690
-231677
-231676
-231673
-231621
-231608
-231599
-231584
-231526
-231503
-231500
-73354
-73213
-73176
-73173
-73023
-73022
-72931
-72695
-72686
-72544
-72493
-72402
-72353
-7230

-63886
-63862
-63835
-63811
-63738
-63699
-63660
-63653
-63651
-63607
-63603
-63600
-63599
-63570
-63557
-63555
-63554
-63435
-63357
-63355
-63334
-63312
-63297
-63251
-63240
-63237
-63213
-63201
-63193
-63181
-63171
-63142
-63134
-63131
-63115
-63078
-63027
-63012
-63009
-62998
-62908
-62882
-62839
-62821
-62812
-62809
-62804
-62796
-62775
-62707
-62674
62573
62581
62585
62601
62652
62666
62684
62786
62803
62819
62862
62863
62876
62902
62913
62940
63002
63072
63089
63136
63153
63157
63169
63249
63312
63352
63423
63425
63432
63448
63472
63482
63484
63485
63486
63525
63529
63583
63593
63602
63618
63633
63648
63695
63721
63726
63761
63771
63783
63821
63834
63888
63921
63928
63958
63962
64008
64009
64018
64022
64026
64059
64072
64080
64083
64093
64101
64103
64114
64134
64136
64164
64182
64260
64343
64344
64350
64352
64371
64374
64375
64412
64439
64441
64448
64490
64493
64523
64546
64551
64555
64615
64618
64625
64632
64636
64676
64712
64734
64735
64760
64776
64780
65030
65037
65041
65042
6

233369
233370
233378
233391
233392
233398
233432
233440
233461
233488
233492
233499
233500
233501
233553
233566
233584
233586
233589
233658
233660
233680
233719
233732
233774
233779
233799
233816
233832
233877
233893
233933
233935
233945
233973
234000
234006
234018
234081
234126
234135
234220
234229
234232
234255
-234185
-234154
-234140
-234133
-234096
-234094
-234093
-234068
-234066
-233982
-233906
-233896
-233893
-233842
-233833
-233810
-233806
-233792
-233768
-233760
-233758
-233746
-233720
-233714
-233711
-233710
-233648
-233646
-233643
-233623
-233617
-233563
-233559
-233503
-233463
-233421
-233389
-233375
-233332
-233249
-233248
-233220
-233207
-233198
-233182
-233165
-233160
-233136
-233105
-233102
-233099
-233057
-233027
-233012
-233000
-232941
-232931
-232903
-232894
-232857
-232841
-232805
-232803
-232801
-232783
-232756
-232724
-232721
-232697
-232687
-232681
-232676
-232652
-232648
-232636
-232620
-232591
-232558
-232544
-232507
-232486
-232447
-232422
-232396
-232329
-2323

-64328
-64316
-64297
-64285
-64280
-64254
-64253
-64249
-64232
-64221
-64156
-64154
-64140
-64138
-64123
-64097
-64095
-64085
-64055
-64040
-64035
-63933
-63893
-63877
-63854
-63851
-63814
-63804
-63786
-63775
-63763
-63760
-63696
-63683
-63681
-63674
-63673
-63640
-63633
-63625
-63619
-63566
-63559
-63552
-63531
-63474
-63465
-63448
-63386
-63364
-63340
-63327
-63265
-63247
-63167
-63159
-63153
-63124
-63106
-63096
-62928
-62924
-62917
-62810
-62672
-62665
-62636
-62615
62574
62575
62600
62634
62649
62657
62679
62698
62800
62804
62852
62890
62997
63020
63175
63199
63205
63238
63245
63250
63340
63346
63356
63362
63373
63383
63402
63451
63454
63481
63505
63522
63552
63553
63580
63645
63654
63662
63666
63760
63765
63778
63784
63798
63806
63815
63824
63837
63844
63849
63855
63862
63872
63922
63998
64020
64076
64078
64079
64091
64140
64143
64169
64196
64222
64273
64282
64335
64336
64365
64380
64407
64413
64459
64472
64518
64544
64568
64599
64617
64659
64670
64695
64740
64758
64766
64772
64

233804
233827
233858
233895
233932
233989
234009
234016
234061
234078
234109
234180
234181
234191
234198
234206
234208
234210
234225
234236
234254
234271
-234232
-234225
-234202
-234183
-234170
-234157
-234149
-234143
-234124
-234102
-234089
-234073
-234021
-234008
-233934
-233899
-233888
-233863
-233848
-233823
-233721
-233677
-233669
-233624
-233611
-233572
-233552
-233551
-233521
-233516
-233498
-233418
-233344
-233333
-233314
-233309
-233255
-233216
-233078
-233069
-233015
-232999
-232974
-232962
-232942
-232861
-232859
-232850
-232834
-232782
-232753
-232729
-232698
-232684
-232642
-232609
-232601
-232597
-232595
-232571
-232559
-232550
-232543
-232526
-232517
-232506
-232470
-232465
-232451
-232423
-232384
-232311
-232300
-232259
-232256
-232218
-232211
-232118
-232042
-232000
-231848
-231835
-231800
-231787
-231756
-231742
-231724
-231721
-231699
-231691
-231675
-231674
-231649
-231645
-231643
-231635
-231619
-231589
-231588
-231556
-231545
-231530
-231496
-231483
-231460
-23143

-63209
-63197
-63176
-63160
-63140
-63010
-62996
-62910
-62819
-62817
-62813
-62786
-62784
-62637
-62626
-62619
-62618
-62604
-62571
62572
62582
62586
62596
62614
62671
62675
62693
62775
62776
62778
62832
62858
62861
62879
63078
63084
63100
63116
63210
63248
63320
63324
63361
63368
63376
63411
63435
63450
63480
63504
63535
63545
63557
63558
63559
63560
63576
63581
63636
63655
63674
63675
63676
63690
63775
63792
63811
63818
63859
63898
63940
63963
63970
63977
63981
64014
64021
64097
64119
64122
64192
64200
64280
64281
64320
64329
64366
64386
64424
64430
64432
64453
64489
64498
64525
64533
64536
64552
64561
64564
64580
64593
64622
64654
64662
64697
64699
64721
64723
64737
64741
64755
64756
64781
64783
65077
65096
65124
65152
65193
65357
65518
65522
65528
65563
65583
65591
65599
65611
65613
65641
65648
65676
65679
65683
65719
65740
65821
65859
65865
65882
65923
65942
65976
66001
66008
66025
66044
66065
66089
66099
66118
66165
66194
66217
66305
66347
66388
66392
66453
66462
66479
66488
665

-233974
-233917
-233913
-233892
-233874
-233857
-233821
-233786
-233717
-233688
-233680
-233668
-233661
-233658
-233634
-233621
-233615
-233562
-233558
-233527
-233517
-233492
-233455
-233454
-233444
-233401
-233380
-233315
-233291
-233283
-233270
-233238
-233226
-233194
-233163
-233158
-233140
-233131
-233114
-233093
-233067
-233045
-233043
-233013
-232977
-232975
-232971
-232960
-232940
-232936
-232929
-232924
-232890
-232876
-232772
-232761
-232742
-232737
-232723
-232704
-232699
-232667
-232662
-232661
-232638
-232611
-232607
-232606
-232605
-232589
-232588
-232573
-232521
-232513
-232501
-232491
-232490
-232439
-232436
-232429
-232407
-232404
-232389
-232385
-232379
-232302
-232291
-232286
-232257
-232245
-232242
-232028
-231996
-231994
-231854
-231851
-231832
-231774
-231740
-231736
-231731
-231686
-231646
-231601
-231595
-231578
-231572
-231571
-231566
-231551
-231547
-231534
-231528
-231522
-231513
-231510
-231458
-73361
-73309
-73306
-73303
-73218
-73211
-73167
-73071
-73070
-

-63442
-63438
-63410
-63403
-63382
-63332
-63314
-63293
-63272
-63205
-63204
-63121
-63116
-63093
-63077
-63015
-62981
-62939
-62890
-62887
-62854
-62827
-62811
-62800
-62791
-62789
-62772
-62676
-62649
-62623
-62574
62613
62616
62631
62641
62646
62662
62663
62697
62760
62795
62821
62873
62883
62996
62999
63009
63077
63086
63125
63138
63161
63165
63183
63197
63219
63247
63264
63283
63290
63319
63331
63338
63357
63397
63445
63495
63555
63564
63569
63574
63586
63598
63608
63625
63641
63644
63647
63650
63682
63694
63799
63877
63880
63887
63895
63902
63904
63953
63971
64004
64044
64045
64073
64155
64180
64198
64214
64219
64245
64250
64272
64305
64318
64367
64370
64409
64414
64481
64487
64512
64542
64595
64598
64623
64641
64746
64778
64791
65033
65064
65114
65157
65161
65192
65272
65302
65322
65354
65398
65448
65514
65570
65598
65645
65709
65751
65772
65773
65777
65813
65816
65827
65828
65836
65856
65869
65895
65924
65947
65964
65965
65969
66030
66047
66055
66067
66097
66135
66143
66153
661

-234145
-234137
-234097
-234035
-234034
-234014
-233995
-233988
-233978
-233861
-233799
-233788
-233761
-233745
-233719
-233716
-233709
-233689
-233678
-233630
-233619
-233599
-233595
-233582
-233564
-233547
-233520
-233485
-233465
-233447
-233394
-233318
-233311
-233265
-233256
-233231
-233214
-233184
-233145
-233134
-233124
-233054
-233029
-233025
-233017
-233010
-233003
-232966
-232933
-232926
-232875
-232854
-232825
-232810
-232793
-232780
-232755
-232732
-232714
-232706
-232690
-232689
-232678
-232672
-232649
-232569
-232565
-232532
-232483
-232477
-232430
-232394
-232386
-232383
-232381
-232326
-232320
-232317
-232298
-232268
-232255
-232250
-232221
-232212
-232156
-232140
-232130
-232018
-232006
-232005
-232003
-231987
-231986
-231932
-231889
-231888
-231881
-231871
-231861
-231856
-231834
-231827
-231810
-231804
-231777
-231729
-231695
-231693
-231680
-231628
-231613
-231604
-231560
-231542
-231537
-231502
-231444
-73310
-73128
-73028
-73024
-72932
-72888
-72886
-72785
-72771
-

-63291
-63285
-63283
-63281
-63280
-63252
-63248
-63221
-63218
-63217
-63195
-63185
-63182
-63133
-63129
-63073
-63025
-63021
-63006
-63004
-62997
-62994
-62935
-62911
-62891
-62840
-62806
-62782
-62777
-62767
-62758
-62742
-62697
-62687
-62684
-62670
-62657
-62656
-62620
-62611
-62602
-62591
-62590
-62572
-62569
62563
62602
62643
62656
62817
62855
62856
62888
62909
62944
63096
63140
63159
63236
63251
63258
63354
63375
63392
63466
63489
63500
63517
63520
63527
63538
63595
63601
63619
63640
63646
63706
63737
63757
63762
63822
63863
63965
63983
64007
64036
64038
64043
64060
64077
64105
64120
64159
64208
64221
64275
64279
64300
64394
64408
64462
64477
64479
64521
64522
64575
64591
64653
64661
64682
64750
64763
64991
65043
65078
65082
65090
65101
65115
65141
65142
65156
65158
65179
65215
65286
65289
65323
65362
65383
65403
65449
65527
65560
65624
65633
65644
65654
65666
65755
65767
65775
65812
65835
65839
65898
65899
65936
65951
65960
65963
66012
66068
66072
66075
66078
66087
66107
66113
6

233840
233869
233928
233934
233956
233996
234004
234015
234019
234024
234032
234036
234066
234084
234096
234114
234145
234169
234203
234214
-234256
-234235
-234227
-234217
-234208
-234192
-234171
-234163
-234152
-234095
-234083
-234078
-234064
-234051
-234007
-234002
-233976
-233962
-233961
-233935
-233919
-233907
-233900
-233858
-233854
-233825
-233819
-233815
-233808
-233797
-233790
-233779
-233778
-233772
-233764
-233749
-233743
-233732
-233699
-233650
-233597
-233585
-233566
-233535
-233500
-233497
-233461
-233458
-233432
-233431
-233426
-233420
-233384
-233378
-233312
-233299
-233211
-233177
-233167
-233144
-233141
-233138
-233100
-233062
-233050
-233049
-233016
-233006
-232997
-232980
-232957
-232952
-232935
-232906
-232888
-232852
-232823
-232813
-232798
-232752
-232749
-232700
-232683
-232659
-232627
-232610
-232520
-232509
-232495
-232474
-232442
-232425
-232397
-232230
-232219
-232127
-232119
-232116
-232097
-232093
-232008
-231973
-231901
-231855
-231845
-231820
-231816
-231

-63774
-63768
-63746
-63745
-63732
-63723
-63663
-63639
-63608
-63601
-63597
-63581
-63571
-63541
-63535
-63528
-63502
-63500
-63439
-63418
-63414
-63407
-63406
-63372
-63345
-63321
-63304
-63273
-63216
-63196
-63169
-63161
-63125
-63104
-63087
-63014
-63011
-62931
-62780
-62762
-62693
-62683
-62682
-62677
-62651
-62648
-62622
-62616
-62601
-62597
-62595
62565
62570
62599
62635
62651
62665
62691
62836
62842
62854
62914
62939
63006
63014
63019
63025
63115
63152
63177
63198
63214
63218
63246
63273
63276
63285
63288
63299
63360
63369
63378
63398
63404
63406
63407
63410
63422
63437
63449
63465
63503
63556
63571
63621
63657
63663
63697
63715
63756
63768
63801
63808
63809
63813
63817
63820
63850
63878
63918
63920
63957
64048
64057
64131
64146
64160
64167
64210
64217
64274
64277
64283
64291
64342
64355
64356
64361
64382
64393
64436
64456
64458
64466
64485
64486
64505
64506
64553
64620
64631
64666
64671
64674
64680
64693
64722
64731
64733
64774
64871
64990
65073
65083
65123
65137
65165
65186
6

In [154]:
# with open("../data/ast_bag_with_dummy_dataframe.pickle",'wb') as f:
#     pickle.dump(ast_bag,f)

In [182]:
# temp = set(list(ast_bag[233998].columns))

In [246]:
# for i in (ast_bag[233998].columns):
#     ast_list.remove(i)

In [184]:
# temp.add('spelling_SizeOf')

In [186]:
# ast_list = temp

In [188]:
# import numpy as np
# from scipy import sparse

In [189]:
# sA = sparse.csr_matrix(ast_bag[233998]) 

In [204]:
# test = ast_bag[233998]

In [205]:
# def ast_to_sparse(data):
#     for i in set(ast_list):
#         if i not in data.columns:
#             data[i]=0
#     return data

In [206]:
# da =  ast_to_sparse(test)

In [171]:
# ast_bag[233998].filter(regex = "spelling_*")

Unnamed: 0_level_0,spelling_Alloca,spelling_WriteToPointer
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0
10,0,0


In [167]:
# ast_bag[-233878].head()

Unnamed: 0_level_0,CursorKind.TRANSLATION_UNIT,CursorKind.TYPEDEF_DECL,CursorKind.STRUCT_DECL,CursorKind.FUNCTION_DECL,CursorKind.UNEXPOSED_ATTR,CursorKind.PURE_ATTR,CursorKind.PARM_DECL,CursorKind.ASM_LABEL_ATTR,CursorKind.VAR_DECL,CursorKind.COMPOUND_STMT,...,CursorKind.STRING_LITERAL,CursorKind.CHARACTER_LITERAL,CursorKind.PAREN_EXPR,CursorKind.UNARY_OPERATOR,CursorKind.TRANSLATION_UNIT,CursorKind.ARRAY_SUBSCRIPT_EXPR,CursorKind.UNEXPOSED_ATTR,CursorKind.BINARY_OPERATOR,CursorKind.LABEL_STMT,CursorKind.COMPOUND_STMT
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [259]:
# example_node = ast_roots.iloc[0].children[19]
# dir(example_node)

In [17]:
# def generate_colnames(ast_root):
#     """
#     Given a concretised & numbered clang ast, returns a set of node kinds to be used as columns in feature matrix
#     """
#     features =  set()


#     def walk_tree_and_set_features(node):
#         out_degree = len(node.children)
#         in_degree = 1
#         degree = out_degree + in_degree
        
#         features.add(str(node.kind))


#         for child in node.children:
#             walk_tree_and_set_features(child)

#     walk_tree_and_set_features(ast_root)

#     return features


# def generate_spelling(ast_root):
#     """
#     Given a concretised & numbered clang ast, returns a set of node spellings to be used later
#     in constructing the columns in feature matrix
#     """
#     spelling =  set()


#     def walk_tree_and_set_features(node):
#         out_degree = len(node.children)
#         in_degree = 1
#         degree = out_degree + in_degree
        
#         spelling.add(node.spelling)

#         for child in node.children:
#             walk_tree_and_set_features(child)

#     walk_tree_and_set_features(ast_root)

#     return spelling

In [13]:
# colnames = ast_roots.apply(generate_colnames)
# spelling = ast_roots.apply(generate_spelling)

NameError: name 'ast_roots' is not defined

In [None]:
# final_colnames = set()
# final_colnames.update(['Identifier', 'WriteToPointer', 'SizeOf', 'Alloc'])
# for i in range(len(colnames)):
#     final_colnames.update(colnames.iloc[i])

In [None]:
# final_spelling = set()
# for i in range(len(spelling)):
#     final_spelling.update(spelling.iloc[i])

In [None]:
# final_colnames = pd.Series(list(final_colnames))

In [427]:
# for i in range(len(final_colnames)):
#     final_colnames.iloc[i] = 'kind_' + final_colnames.iloc[i]

In [19]:
# def generate_features_matrix(ast_root):
#     """
#     Given a concretised & numbered clang ast, returns a matrix of one hot encoded features of node names kind and 
#     whether it's alloc/writeToPointer/sizeOf/other, i.e. our feature matrix
#     """
#     index = []
#     kind = {}
#     spelling = {}
    
#     matrix_df = final_df.copy()

#     def walk_tree_and_set_properties(node):
#         out_degree = len(node.children)
#         in_degree = 1
#         degree = out_degree + in_degree
        
#         index.append(node.identifier)
        
#         kind[node.identifier] = node.kind
#         spelling[node.identifier] = node.spelling
        
#         if str(node.spelling) in writeToPointer_list:
#             spelling[node.identifier] = 'WriteToPointer'
        
#         elif str(node.spelling) in sizeOf_list:
#             spelling[node.identifier] = 'SizeOf'
            
#         elif str(node.spelling) in alloc_list:
#             spelling[node.identifier] = 'Alloca'
        
#         else:
#             spelling[node.identifier] = ''
        

#         for child in node.children:
#             walk_tree_and_set_properties(child)

#     walk_tree_and_set_properties(ast_root)
    
# #     return index
    
#     d = {'Identifier': index, 'kind': list(kind.values()), 'spelling': list(spelling.values())}
        
#     ast_df = pd.DataFrame(data = d)
#     ast_df = ast_df.set_index('Identifier')
    
#     dum_df = pd.get_dummies(ast_df, prefix=['kind', 'spelling'])
    
#     dum_df = dum_df.drop('spelling_', axis=1)
    
#     for col in dum_df.filter(regex='kind_*').columns:
#         dum_df = dum_df.rename(columns = {col: col.replace('kind_', '')})
    
#     matrix_df['Identifier'] = range(1,len(dum_df)+1)
#     matrix_df = matrix_df.set_index('Identifier')
#     matrix_df = matrix_df.fillna(0)
    
# #     df_merge_col = pd.merge(dum_df, matrix_df, on='Identifier')
#     for i in matrix_df.columns:
#         if i not in dum_df.columns:
#             dum_df[i]=0
# # #     for col in ['SizeOf', 'Alloc', 'WriteToPointer']:
# # #         if df_merge_col[col].isna().any():
# # #             df_merge_col[col].fillna(0)
            
# # #     df_merge_col = df_merge_col.dropna(axis='columns')
    
# #     df_merge_col = df_merge_col.set_index('Identifier')
    
# #     return df_merge_col

#     return dum_df.values

In [None]:
# eg = generate_features_matrix(ast_roots.iloc[0])

In [59]:
# mat = ast_roots.apply(generate_features_matrix)

In [61]:
# mat.iloc[1]

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])