In [48]:
# Test script to see if scrape/stack.py works
import os
from dotenv import load_dotenv
from typing import Generator, List, Dict, Union
import tensorflow as tf
from tensorflow import keras
# CategoryEncoding - https://keras.io/api/layers/preprocessing_layers/categorical/category_encoding/#categoryencoding-class
from tensorflow.keras import layers

from util.db import Database
from transform.snippet import SnippetLexer

load_dotenv()
print(tf.test.gpu_device_name())

/device:GPU:0


2021-12-13 22:26:40.697609: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-13 22:26:40.697631: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [39]:
db = Database()

In [40]:
lexer = SnippetLexer()

def get_corpus() -> List[str]:
    corpus: set[str] = set()
    
    for doc in db.get_answers(page_size = 100):
        snippet: str = doc['snippets'].strip()
        toks, status = lexer.lex(snippet)

        if status != 'success':
            print(f"Error lexing snippet: Got status '{status}' for snippet '{snippet}'")
            continue

        if not len(toks):
            print(f"No tokens found for snippet '{snippet}'")
            continue

        for tok in toks:
            corpus.add(tok.type)

    return list(corpus)

corpus = get_corpus()
print(f'corpus has {len(corpus)} types')
corpus_dict: Dict[str, int] = dict()

for i, type in enumerate(corpus):
    corpus_dict[type] = i

corpus

Error lexing snippet: Got status 'error' for snippet '-I/usr/local/share/mylib
-lmylib
-L/usr/local/lib/

       --check[=<string>]                 - Parse one file in isolation instead of acting as a language server. Useful to investigate/reproduce crashes or configu‐
              ration problems. With --check=<filename>, attempts to parse a particular file.

E[13:12:40.787] [pp_file_not_found] Line 10: 'mylibheader.h' file not found

E[13:12:40.820] [unknown_typename] Line 187: unknown type name 'mylib_type1_t'
E[13:12:40.820] [unknown_typename] Line 202: unknown type name 'mylib_type1_t'
E[13:12:40.820] [undeclared_var_use] Line 820: use of undeclared identifier 'mylib_type2_t'
...
<snip>'
Error lexing snippet: Got status 'error' for snippet 'lineBuffer[count] = '\0';
    realloc(lineBuffer, count + 1);
    return lineBuffer;
}

char *line = readLine(file);
printf("LOG: read a line: %s\n", line);
if (strchr(line, 'a')) { puts("The line contains an a"); }
/* etc. */
free(line);
/* A

['EXTERN',
 'MODEQUAL',
 'INT_CONST_HEX',
 'COLON',
 'ARROW',
 'RESTRICT',
 'EQ',
 'LSHIFT',
 'PERIOD',
 'RBRACE',
 'STRING_LITERAL',
 'PPPRAGMA',
 'RPAREN',
 'CONTINUE',
 'CASE',
 'SIZEOF',
 'LONG',
 'INT_CONST_OCT',
 'LT',
 'BREAK',
 'RBRACKET',
 'MINUSEQUAL',
 'GT',
 'INT_CONST_CHAR',
 'FLOAT',
 'OR',
 'VOID',
 'PPPRAGMASTR',
 'XOR',
 'INT_CONST_DEC',
 'SWITCH',
 'MOD',
 'NE',
 'CHAR_CONST',
 'INT',
 'LAND',
 'PLUS',
 'UNSIGNED',
 'ELSE',
 'VOLATILE',
 'MINUS',
 'LE',
 'CONST',
 'PPHASH',
 'DIVIDE',
 'ELLIPSIS',
 'PLUSEQUAL',
 'DOUBLE',
 'MINUSMINUS',
 'LBRACE',
 'FOR',
 'ENUM',
 'DO',
 'FLOAT_CONST',
 'SEMI',
 'TYPEDEF',
 'STATIC',
 'PLUSPLUS',
 'AND',
 'RSHIFT',
 'EQUALS',
 'COMMA',
 'STRUCT',
 'RSHIFTEQUAL',
 'LBRACKET',
 'LSHIFTEQUAL',
 'CHAR',
 'LOR',
 'WHILE',
 'CONDOP',
 'GE',
 'LPAREN',
 'IF',
 'LNOT',
 'UNION',
 'RETURN',
 'TIMES',
 'TYPEID']

In [41]:
def transform(answer: Dict) -> Union[List[int], None]:
    snippet: str = answer['snippets']
    snippet = snippet.strip()
    toks, status = lexer.lex(snippet)

    if status != 'success':
        print(f"Error lexing snippet: Got status '{status}' for snippet '{snippet}'")
        return None

    if not len(toks):
        print(f"No tokens found for snippet '{snippet}'")
        return None
    
    tok_types = [tok.type for tok in toks]
    encoded_toks = [corpus_dict[tok_type] for tok_type in tok_types]
    return encoded_toks

In [47]:
# https://keras.io/api/layers/preprocessing_layers/categorical/category_encoding/#categoryencoding-class
category_layer = layers.CategoryEncoding(num_tokens=len(corpus), output_mode='one_hot')

for answer in db.get_answers(maxpages=1, page_size=1):
    print(f'Before: {answer["snippets"]}')
    encoded = transform(answer)
    print(f'After: {encoded}')
    assert encoded is not None
    x = category_layer(encoded)
    print(x)

Before:         *list = new_node;

After: [76, 77, 60, 77, 54]
tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [None]:
model = keras.Sequential([
    category_layer
])