In [1]:
import sys
sys.path.insert(0, '..')

from src.data_processing import load_data, parse_data_frame
data = load_data('a', 'train').sample(100)
data['code_tree'] = parse_data_frame(data)
data.drop(['code', 'language'], axis=1)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,label,code_tree
122910,0,"[(x, [tensor([108., 0., 8., 1., 469.]), ..."
173898,1,"[(x, [tensor([108., 0., 3., 1., 418.]), ..."
176171,1,"[(x, [tensor([108., 0., 1., 1., 298.]), ..."
302760,0,"[(x, [tensor([108., 0., 2., 1., 867.]), ..."
109626,0,"[(x, [tensor([108., 0., 7., 1., 357.]), ..."
...,...,...
317298,1,"[(x, [tensor([108., 0., 4., 1., 339.]), ..."
325954,1,"[(x, [tensor([1.0800e+02, 0.0000e+00, 3.0000e+..."
313785,1,"[(x, [tensor([1.0800e+02, 0.0000e+00, 1.1000e+..."
59535,1,"[(x, [tensor([1.0800e+02, 0.0000e+00, 7.0000e+..."


In [2]:
from torch import nn
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class GraphClassifier(nn.Module):
    def __init__(self, in_channels: int, num_classes: int):
        super().__init__()
        self.c1 = GCNConv(in_channels, 2048)
        self.c2 = GCNConv(2048, 1024)
        self.h = nn.Linear(1024, 512)
        self.o = nn.Linear(512, num_classes)
        self.dropout = 0.1

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        if hasattr(data, "batch"):
            batch = data.batch
        else:
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)

        x = self.c1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.c2(x, edge_index)
        x = F.relu(x)

        x = global_mean_pool(x, batch)
        x = F.relu(self.h(x))
        x = self.o(x)
        return x

In [3]:
from torch_geometric.loader import DataLoader
from tqdm.auto import trange

graphs = []
for _, row in data.iterrows():
    g = row.code_tree
    g.y = torch.tensor([row.label], dtype=torch.long)
    graphs.append(g)

loader = DataLoader(graphs, batch_size=16, shuffle=True)

model = GraphClassifier(5, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
iterator = trange(25, desc="epoch 0 loss: <infinite>")
for epoch in iterator:
    total_loss = 0
    model.train()

    for batch in loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    iterator.set_description(f"epoch {epoch+1} loss: {total_loss:.4f}")

pred = model(data.iloc[42].code_tree)
print(f'Model prediction: {pred} vs label: {data.iloc[42].label}')

epoch 25 loss: 3.2243: 100%|██████████| 25/25 [00:38<00:00,  1.53s/it]

Model prediction: tensor([[-2.3947, -0.1470]], grad_fn=<AddmmBackward0>) vs label: 1





In [4]:
validation = load_data('a', 'val').sample(100)
validation['code_tree'] = parse_data_frame(validation)
hits = 0
guesses = []
for _, row in validation.iterrows():
    pred = torch.argmax(model(row.code_tree))
    guesses.append(pred)
    if pred == row.label:
        hits += 1

print(f'Model accuracy on test set is: {hits / len(validation) * 100}%')
print(f'Total number of code that look like human: {sum(1 for i in guesses if not i)}')

Model accuracy on test set is: 80.0%
Total number of code that look like human: 65
