In [1]:

import json
import itertools
from chromadb.utils import embedding_functions
from tqdm import tqdm
from sklearn.cluster import HDBSCAN
import sys; sys.path.insert(0, '../')
from utils.helper_functions import clean_code
from Clusterers.title_generator import TitleGenerator
import numpy as np

In [2]:
# Load data from JSON file
file_path = '/home/ryounis/Documents/Zurich/PEACHLab/backend/data/disaster_tweet_test_2.json'
with open(file_path, 'r') as file: data = json.load(file)
cells = [cell for cell in data["cells"] if cell["cell_type"] == "code" and len(cell["source"])]
print(f"Number of cells: {len(cells)}")
cells[0]

FileNotFoundError: [Errno 2] No such file or directory: '/home/ryounis/Documents/Zurich/PEACHLab/backend/data/disaster_tweet_test_2.json'

In [4]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
_model = AutoModel.from_pretrained("microsoft/codebert-base")
_model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [5]:
def embed(code: str, summary: str = None) -> torch.Tensor:
    """
    Embeddes the given code and summary (if provided).

    Args:
        code (str): The code to be embedded.
        summary (str, optional): An NL-summary of the code for better embedding. Defaults to None.

    Returns:
        torch.Tensor: The tokenized code and summary (if provided) embeddings.
    """
    
    code_tokens = _tokenizer.tokenize(code)
    # code_tokens = [] # TODO
    tokens = []
    if summary:
        nl_tokens = _tokenizer.tokenize(summary)
        tokens = [_tokenizer.cls_token] + nl_tokens + [_tokenizer.sep_token]
    tokens += code_tokens + [_tokenizer.eos_token]
    tokens_ids = _tokenizer.convert_tokens_to_ids(tokens)
    context_embeddings = _model(torch.tensor(tokens_ids)[None,:].to(device))[0]
    return context_embeddings.detach().numpy().tolist()


In [14]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
_model = AutoModel.from_pretrained("microsoft/codebert-base")
_model.to(device)

def process_code(code_str, max_length=512, stride=256):
    # Tokenization with chunking for long code strings
    tokens = _tokenizer(code_str, return_tensors='pt', max_length=max_length, stride=stride, truncation=True).to(device)
    
    outputs = []
    for i in range(0, tokens.input_ids.size(1), stride):
        chunk = tokens.input_ids[:, i:i + max_length]
        attention_mask_chunk = tokens.attention_mask[:, i:i + max_length]
        output_chunk = _model(chunk, attention_mask=attention_mask_chunk)[0]
        outputs.append(torch.mean(output_chunk, dim=1))  # Average over the sequence length
    
    # Average across all chunks
    final_output = torch.mean(torch.stack(outputs), dim=0)
    return final_output

def process_summary(summary_str):
    # Tokenization without chunking for short summary strings
    tokens = _tokenizer(summary_str, return_tensors='pt', truncation=True, max_length=512).to(device)
    
    # Directly pass through the model
    outputs = _model(**tokens)
    summary_output = torch.mean(outputs.last_hidden_state, dim=1)  # Average over the sequence length
    return summary_output

def embed_cell(code_str: str, desc_str: str):
    code_embedding = process_code(code_str)
    desc_embedding = process_summary(desc_str)
    return torch.mean(torch.stack([code_embedding, desc_embedding]), dim=0).detach().numpy().reshape(-1).tolist()

# Example inputs
code_str = "Your long code snippet here..."
summary_str = "A brief summary of the code."

# Process the code and summary separately
code_embedding = process_code(code_str)  # This should return [1, 768]
summary_embedding = process_summary(summary_str)  # This should return [1, 768]

# Combine the embeddings if needed (e.g., by concatenating or averaging)
final_embedding = torch.mean(torch.stack([code_embedding, summary_embedding]), dim=0)

# final_embedding now contains the combined representation
print(final_embedding.shape)  # Should be [1, 768]


torch.Size([1, 768])


In [15]:
from utils.helper_functions import clean_code
import numpy as np

idx = 5
code = cells[idx]["source"]
desc = cells[idx]["metadata"]["desc"]
embedding = embed_cell(code, desc)
np.array(embedding).shape

(768,)

In [16]:
for cell in tqdm(cells):
    cell["metadata"]["embeddings"] = embed_cell(clean_code(cell["source"]), cell["metadata"]["desc"])

new_file_path = "/home/ryounis/Documents/Zurich/PEACHLab/backend/data/embedded_disaster_tweets3.json"
with open(new_file_path, "w") as file: json.dump(data, file)

100%|██████████| 803/803 [01:51<00:00,  7.21it/s]


In [None]:
new_file_path = "/home/ryounis/Documents/Zurich/PEACHLab/backend/data/embedded_disaster_tweets2.json"
with open(new_file_path, "r") as file: data = json.load(file)

In [17]:
embeddings = [cell["metadata"]["embeddings"] for cell in data["cells"] if cell["cell_type"] == "code" and len(cell["source"])]

In [34]:
import numpy as np
clusterer = HDBSCAN(
    min_cluster_size=4,
    min_samples=4,
    cluster_selection_epsilon=.1,
    max_cluster_size=None,
    alpha=1.0
)



clusters = {}
# grouped_cells = itertools.groupby(cells, lambda x: x["metadata"]["class"])
grouped_cells = {k: list(v) for k, v in itertools.groupby(cells, lambda x: x["metadata"]["class"])}
for key, group in grouped_cells.items(): 
    print(f"{key}: {len(group)}")
    X = [cell["metadata"]["embeddings"] for cell in group]
    clusterer.fit(X)
    clusters[key] = clusterer.labels_
    print(f"{key}: {len(set(clusterer.labels_))}\n")

    for i, cluster in enumerate(clusterer.labels_):
        group[i]["metadata"]["cluster"] = cluster


Data Export: 44
Data Export: 3

Data Extraction: 66
Data Extraction: 5

Data Transform: 271
Data Transform: 16

Debug: 10
Debug: 1

Environment: 74
Environment: 5

Exploratory Data Analysis: 94
Exploratory Data Analysis: 5

Hyperparam Tuning: 16
Hyperparam Tuning: 3

Model Evaluation: 39
Model Evaluation: 3

Model Interpretation: 20
Model Interpretation: 4

Model Train: 71
Model Train: 6

Other: 31
Other: 2

Visualization: 67
Visualization: 4



In [32]:
clusters

{'Data Export': array([ 0, -1, -1, -1,  0,  0,  0, -1, -1,  0, -1,  0,  1, -1,  1, -1, -1,
         1,  1,  0, -1,  0,  0,  1, -1,  1, -1,  1,  0, -1,  1,  0, -1,  0,
        -1, -1,  1,  1, -1,  1,  0, -1, -1,  1]),
 'Data Extraction': array([ 2, -1,  2,  2,  1,  2,  3,  1,  2,  1,  2, -1,  2,  0, -1,  1, -1,
         2,  2,  2,  3,  2,  0,  1, -1,  3,  2, -1, -1, -1, -1, -1, -1, -1,
         2, -1, -1, -1,  2,  1,  3,  3,  1, -1, -1,  2,  0,  1,  3,  2,  0,
         1,  3, -1, -1, -1,  3,  3,  3,  3, -1,  2, -1,  2, -1,  2]),
 'Data Transform': array([-1, -1, -1,  8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  7, -1,  7,
        -1,  7,  7, -1, -1, -1, -1, -1, -1, 12, -1, -1, -1,  8,  0, -1, -1,
        -1,  9, 10,  2, -1, -1, -1, -1,  4, -1, -1, -1, -1, -1, -1, 12, -1,
        -1, -1, -1, -1, -1, -1, -1, -1,  8, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, 12, 12, -1, 11,  6, 11,  6,  8,  6, 11,  6, -1, -1, -1, -1,
        -1, -1, 12,  1,  9, 10, -1,  2, -1, -1,  8, -1, -1, -1, -1, 

In [36]:
data["cells"][1]

{'cell_type': 'code',
 'execution_count': None,
 'metadata': {'start_cell': True,
  'cell_id': 14,
  'class': 'Data Export',
  'subclass': 'save_to_csv',
  'subclass_id': 25,
  'predicted_subclass_probability': 0.999161,
  'notebook_id': 2,
  'desc': 'This code exports the preprocessed training and testing datasets to CSV files.',
  'embeddings': [-0.29024219512939453,
   0.40751779079437256,
   0.2908839285373688,
   0.09821147471666336,
   -0.1225736066699028,
   -0.220683291554451,
   -0.10773362219333649,
   0.21706432104110718,
   0.26601797342300415,
   0.41933661699295044,
   -0.27640753984451294,
   0.7368866205215454,
   -0.11890184879302979,
   -0.31496381759643555,
   0.6846251487731934,
   0.07755774259567261,
   0.34362536668777466,
   0.42988091707229614,
   0.00030302442610263824,
   0.1580064594745636,
   -0.20486080646514893,
   -0.02505023404955864,
   0.5175704956054688,
   -0.6334050297737122,
   0.15057049691677094,
   0.4327700436115265,
   -0.2546074390411377,
  