In [2]:
!python --version
!uname -msrv

Python 3.9.7
Darwin Dons-MacBook-Pro.local 21.1.0 Darwin Kernel Version 21.1.0: Wed Oct 13 17:33:01 PDT 2021; root:xnu-8019.41.5~1/RELEASE_ARM64_T6000 arm64


In [2]:
# Test script to see if scrape/stack.py works
import os
# from dotenv import load_dotenv
from typing import Generator, List, Dict, Union, Tuple
import numpy as np
import tensorflow as tf
from tensorflow import keras
import keras as k
# CategoryEncoding - https://keras.io/api/layers/preprocessing_layers/categorical/category_encoding/#categoryencoding-class
from tensorflow.keras import layers
import torch
import pandas as pd
from dotenv import load_dotenv
# from sklearn.model_selection import train_test_split
from scrape.types import StackOverflowAnswer, is_stackoverflow_answer
from util.db import Database
from transform.snippet import SnippetLexer, LexerStatus

In [3]:
load_dotenv()
print(tf.test.gpu_device_name())
db = Database()

/device:GPU:0
Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2021-12-16 00:37:54.588911: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-16 00:37:54.589018: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [4]:
# The number of tokens that must be included in each feature vector. Snippets
# with more tokens will be truncated, and snippets with fewer tokens will be
# padded with zeros. (right-padded)
toks_per_seq: int = 2500

# The dimensionality of the LSTM's hidden state. This is the size of the
# output(final) vector
hidden_layer_size: int = 500

In [5]:
# Load in question and answer data. Limiting the amount of data to be loaded
# from the database makes the training process faster, which is useful for 
# development and debugging.

dataset_size = 1000
answers = pd.DataFrame.from_records(list(db.answers.find().limit(dataset_size)), index='_id')
questions = pd.DataFrame.from_records(list(db.questions.find().limit(dataset_size)), index='_id')

In [6]:
# Clean up the answers a lil bit
answers['snippets'] = answers['snippets'].str.strip()
answers.head(2)

Unnamed: 0_level_0,answer_id,author_id,author_username,is_accepted,is_highest_scored,page_pos,question_has_highest_accepted_answer,question_id,score,snippets,source
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
70142619,70142619,4593267.0,chqrlie,False,True,1,False,70142439,1,*list = new_node;,https://stackoverflow.com/a/70142619
70142654,70142654,17067764.0,misfit,False,False,2,False,70142439,0,(*list) = new_node;,https://stackoverflow.com/a/70142654


In [7]:
# Clean the questions up a lil bit, and remove columns we won't need
questions.drop(columns=['content_license', 'creation_date', 'last_activity_date', 'last_edit_date', 'owner', 'protected_date', 'community_owned_date', 'migrated_from', 'locked_date'], inplace=True, errors='ignore')
questions.head(2)

Unnamed: 0_level_0,answer_count,is_answered,link,question_id,score,tags,title,view_count,accepted_answer_id,closed_date,closed_reason
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
70142439,2,True,https://stackoverflow.com/questions/70142439/l...,70142439,1,"[arrays, c, pointers, struct, linked-list]",linked list not inserting new values &quot;C l...,41,,,
70141741,1,True,https://stackoverflow.com/questions/70141741/t...,70141741,2,"[c, dynamic-memory-allocation, trim, c-strings...",trim function halve the memory size to remove ...,40,70141881.0,,


In [8]:
# Join the questions and answers together on the question id
df = answers.merge(questions[['score', 'title', 'view_count', 'closed_date', 'closed_reason']], how='inner', left_on='question_id', right_on='_id', suffixes=('_answer', '_question'))
df.head()

Unnamed: 0,answer_id,author_id,author_username,is_accepted,is_highest_scored,page_pos,question_has_highest_accepted_answer,question_id,score_answer,snippets,source,score_question,title,view_count,closed_date,closed_reason
0,70142619,4593267.0,chqrlie,False,True,1,False,70142439,1,*list = new_node;,https://stackoverflow.com/a/70142619,1,linked list not inserting new values &quot;C l...,41,,
1,70142654,17067764.0,misfit,False,False,2,False,70142439,0,(*list) = new_node;,https://stackoverflow.com/a/70142654,1,linked list not inserting new values &quot;C l...,41,,
2,3501681,2411320.0,gsamaras,False,True,1,False,3501338,352,#define _GNU_SOURCE\n#include <stdio.h>\n#incl...,https://stackoverflow.com/a/3501681,231,C read file line by line,1024701,,
3,3502293,169346.0,jeremyp,False,False,8,False,3501338,4,lineBuffer[count] = '\0';\n\nchar buffer [BUFF...,https://stackoverflow.com/a/3502293,231,C read file line by line,1024701,,
4,3501425,387076.0,gilles-so-stop-being-evil,False,False,3,False,3501338,23,lineBuffer[count] = '\0';\n realloc(lineBuf...,https://stackoverflow.com/a/3501425,231,C read file line by line,1024701,,


In [9]:
# Temporarily truncate the dataset to only include the first 1000 answers to
# make development faster
# df = df.head(1000)

In [10]:
# This is OK because questions with zero score were filtered out during scraping
df['score_ratio'] = df['score_answer'] / df['score_question']
score_ratio_min: float = df['score_ratio'].min()
score_ratio_max: float = df['score_ratio'].max()
score_ratio_diff = score_ratio_max - score_ratio_min

# Normalize score ratio between -1 and 1
df['score_ratio_norm'] = (2 * (df['score_ratio'] - score_ratio_min) / score_ratio_diff) - 1

df.head()

Unnamed: 0,answer_id,author_id,author_username,is_accepted,is_highest_scored,page_pos,question_has_highest_accepted_answer,question_id,score_answer,snippets,source,score_question,title,view_count,closed_date,closed_reason,score_ratio,score_ratio_norm
0,70142619,4593267.0,chqrlie,False,True,1,False,70142439,1,*list = new_node;,https://stackoverflow.com/a/70142619,1,linked list not inserting new values &quot;C l...,41,,,1.0,-0.5
1,70142654,17067764.0,misfit,False,False,2,False,70142439,0,(*list) = new_node;,https://stackoverflow.com/a/70142654,1,linked list not inserting new values &quot;C l...,41,,,0.0,-0.75
2,3501681,2411320.0,gsamaras,False,True,1,False,3501338,352,#define _GNU_SOURCE\n#include <stdio.h>\n#incl...,https://stackoverflow.com/a/3501681,231,C read file line by line,1024701,,,1.52381,-0.369048
3,3502293,169346.0,jeremyp,False,False,8,False,3501338,4,lineBuffer[count] = '\0';\n\nchar buffer [BUFF...,https://stackoverflow.com/a/3502293,231,C read file line by line,1024701,,,0.017316,-0.745671
4,3501425,387076.0,gilles-so-stop-being-evil,False,False,3,False,3501338,23,lineBuffer[count] = '\0';\n realloc(lineBuf...,https://stackoverflow.com/a/3501425,231,C read file line by line,1024701,,,0.099567,-0.725108


In [11]:
# Turn the code snippets into tokens
lexer = SnippetLexer()

def tokenize(snippet: str) -> Tuple[List, LexerStatus]:
  toks, status = lexer.lex(snippet)
  toks = np.array([tok.type for tok in toks], dtype=str)

  # Right pad the tokens util they are toks_per_seq long
  pad_amount = toks_per_seq - len(toks)
  if pad_amount > 0:
    toks = np.pad(toks, (0, pad_amount), 'constant', constant_values='<PAD>')

  return toks.T, status

df[['tokens', 'lexer_status']] = pd.DataFrame(df['snippets'].apply(tokenize).tolist(), index=df.index)
df.head()


Unnamed: 0,answer_id,author_id,author_username,is_accepted,is_highest_scored,page_pos,question_has_highest_accepted_answer,question_id,score_answer,snippets,source,score_question,title,view_count,closed_date,closed_reason,score_ratio,score_ratio_norm,tokens,lexer_status
0,70142619,4593267.0,chqrlie,False,True,1,False,70142439,1,*list = new_node;,https://stackoverflow.com/a/70142619,1,linked list not inserting new values &quot;C l...,41,,,1.0,-0.5,"[TIMES, TYPEID, EQUALS, TYPEID, SEMI, <PAD>, <...",success
1,70142654,17067764.0,misfit,False,False,2,False,70142439,0,(*list) = new_node;,https://stackoverflow.com/a/70142654,1,linked list not inserting new values &quot;C l...,41,,,0.0,-0.75,"[LPAREN, TIMES, TYPEID, RPAREN, EQUALS, TYPEID...",success
2,3501681,2411320.0,gsamaras,False,True,1,False,3501338,352,#define _GNU_SOURCE\n#include <stdio.h>\n#incl...,https://stackoverflow.com/a/3501681,231,C read file line by line,1024701,,,1.52381,-0.369048,"[PPHASH, TYPEID, TYPEID, PPHASH, TYPEID, LT, T...",success
3,3502293,169346.0,jeremyp,False,False,8,False,3501338,4,lineBuffer[count] = '\0';\n\nchar buffer [BUFF...,https://stackoverflow.com/a/3502293,231,C read file line by line,1024701,,,0.017316,-0.745671,"[TYPEID, LBRACKET, TYPEID, RBRACKET, EQUALS, C...",success
4,3501425,387076.0,gilles-so-stop-being-evil,False,False,3,False,3501338,23,lineBuffer[count] = '\0';\n realloc(lineBuf...,https://stackoverflow.com/a/3501425,231,C read file line by line,1024701,,,0.099567,-0.725108,"[TYPEID, LBRACKET, TYPEID, RBRACKET, EQUALS, C...",error


In [12]:
# Remove snippets that failed to tokenize
df_start_len = len(df)
df = df[df['lexer_status'] == 'success']
df_filtered_len = len(df)
removed_percent = (df_start_len - df_filtered_len) / df_start_len * 100
print(f'Filtered out {removed_percent:.2f}% of the dataset because they failed tokenization')

Filtered out 18.50% of the dataset because they failed tokenization


In [13]:
def get_corpus():
    corpus: set[str] = set()
    for tok_list in df['tokens']:
        # assert type(tok_list) == list, f'Expected list, got {type(tok_list)}'
        assert len(tok_list) == toks_per_seq
        for tok in tok_list:
            corpus.add(tok)

    return np.array(list(corpus))

corpus = get_corpus()
corpus_len = len(corpus)
print(f'corpus has {corpus_len} token types')
corpus_dict: Dict[str, int] = dict()

for i, type in enumerate(corpus):
    corpus_dict[type] = i

print(corpus)

corpus has 98 token types
['CONDOP' 'GE' 'INLINE' 'IF' 'LONG' 'AUTO' '<PAD>' 'LBRACE' 'REGISTER'
 'RBRACKET' 'PPPRAGMA' 'STATIC' 'GOTO' 'INT' 'COMMA' '__INT128' 'ANDEQUAL'
 'OREQUAL' 'AND' 'INT_CONST_BIN' 'DO' 'MINUSMINUS' 'LSHIFT'
 'U32CHAR_CONST' 'STRUCT' 'WHILE' 'ELSE' 'CONTINUE' 'LAND' 'PPPRAGMASTR'
 'LOR' 'OR' 'ARROW' 'STRING_LITERAL' 'SHORT' 'EQUALS' 'PLUS' '_BOOL' 'NOT'
 'ENUM' 'PERIOD' 'RBRACE' 'LBRACKET' 'MOD' 'OFFSETOF' 'EQ' 'RSHIFT' 'CHAR'
 'XOR' 'INT_CONST_OCT' 'SIZEOF' 'INT_CONST_CHAR' 'ELLIPSIS' 'UNION'
 'INT_CONST_DEC' 'RPAREN' 'VOLATILE' 'SIGNED' 'CASE' 'FLOAT_CONST'
 'RETURN' 'RESTRICT' 'SEMI' 'VOID' 'INT_CONST_HEX' 'MINUSEQUAL'
 'TIMESEQUAL' 'DIVIDE' 'COLON' 'DOUBLE' 'CHAR_CONST' 'DEFAULT' 'EXTERN'
 'LT' 'TYPEDEF' 'LPAREN' 'GT' 'LE' 'MINUS' 'UNSIGNED' 'NE' 'TIMES' 'LNOT'
 'PPHASH' 'FOR' 'RSHIFTEQUAL' 'MODEQUAL' 'PLUSPLUS' 'DIVEQUAL'
 'LSHIFTEQUAL' 'PLUSEQUAL' 'SWITCH' 'HEX_FLOAT_CONST' 'FLOAT' 'BREAK'
 'XOREQUAL' 'CONST' 'TYPEID']


In [14]:
# https://keras.io/api/layers/preprocessing_layers/categorical/category_encoding/#categoryencoding-class
# category_layer = layers.CategoryEncoding(num_tokens=corpus_len, output_mode='one_hot', name='token-encoder')

def tok_to_int(tok: str) -> int:
    return corpus_dict[tok]
toks_to_nums = np.vectorize(tok_to_int)

df['encoded_tokens'] = df['tokens'].apply(toks_to_nums)
df.head()

Unnamed: 0,answer_id,author_id,author_username,is_accepted,is_highest_scored,page_pos,question_has_highest_accepted_answer,question_id,score_answer,snippets,...,score_question,title,view_count,closed_date,closed_reason,score_ratio,score_ratio_norm,tokens,lexer_status,encoded_tokens
0,70142619,4593267.0,chqrlie,False,True,1,False,70142439,1,*list = new_node;,...,1,linked list not inserting new values &quot;C l...,41,,,1.0,-0.5,"[TIMES, TYPEID, EQUALS, TYPEID, SEMI, <PAD>, <...",success,"[81, 97, 35, 97, 62, 6, 6, 6, 6, 6, 6, 6, 6, 6..."
1,70142654,17067764.0,misfit,False,False,2,False,70142439,0,(*list) = new_node;,...,1,linked list not inserting new values &quot;C l...,41,,,0.0,-0.75,"[LPAREN, TIMES, TYPEID, RPAREN, EQUALS, TYPEID...",success,"[75, 81, 97, 55, 35, 97, 62, 6, 6, 6, 6, 6, 6,..."
2,3501681,2411320.0,gsamaras,False,True,1,False,3501338,352,#define _GNU_SOURCE\n#include <stdio.h>\n#incl...,...,231,C read file line by line,1024701,,,1.52381,-0.369048,"[PPHASH, TYPEID, TYPEID, PPHASH, TYPEID, LT, T...",success,"[83, 97, 97, 83, 97, 73, 97, 40, 97, 76, 83, 9..."
3,3502293,169346.0,jeremyp,False,False,8,False,3501338,4,lineBuffer[count] = '\0';\n\nchar buffer [BUFF...,...,231,C read file line by line,1024701,,,0.017316,-0.745671,"[TYPEID, LBRACKET, TYPEID, RBRACKET, EQUALS, C...",success,"[97, 42, 97, 9, 35, 70, 62, 47, 97, 42, 97, 9,..."
5,15626701,1017417.0,lefteris-e,False,False,16,False,3501338,0,const char* func x(){\n char line[100];\n ...,...,231,C read file line by line,1024701,,,0.0,-0.75,"[CONST, CHAR, TIMES, TYPEID, TYPEID, LPAREN, R...",success,"[96, 47, 81, 97, 97, 75, 55, 7, 47, 97, 42, 54..."


In [15]:
# tok_preprocessor = layers.CategoryEncoding(num_tokens=corpus_len, output_mode='one_hot')(inputs)

model: k.Sequential = keras.Sequential([
    # layers.Input(shape=(toks_per_seq, corpus_len)), # TODO
    layers.LSTM(hidden_layer_size, return_sequences=False, name='tok-lstm', input_shape=(toks_per_seq, corpus_len)),
    # TODO: If 1 layer doesn't do it, uncomment dis boi
    # layers.Dense(corpus_len, activation='relu', name='post-lstm-1'),
    layers.Dense(1, name='output')
])
# model(encoded)
# model.build((toks_per_seq, corpus_len))
model.compile(optimizer='sgd', loss='mse', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tok-lstm (LSTM)             (None, 500)               1198000   
                                                                 
 output (Dense)              (None, 1)                 501       
                                                                 
Total params: 1,198,501
Trainable params: 1,198,501
Non-trainable params: 0
_________________________________________________________________


2021-12-16 00:37:57.459565: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-16 00:37:57.459582: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [16]:
num_epochs = 2
validation_split = 0.2
batch_size=32
toks = np.array(df['encoded_tokens'].tolist())

# print(toks.shape)
# print(toks)

X = tf.one_hot(toks, corpus_len, dtype=np.float32)
y = tf.convert_to_tensor(df['score_ratio_norm'])

history = model.fit(X, y, epochs=num_epochs, validation_split=validation_split, batch_size=batch_size, verbose=2, shuffle=True)
history

2021-12-16 00:37:57.948974: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/2


2021-12-16 00:37:58.305740: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-16 00:37:58.455100: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-16 00:38:16.731987: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
