In [None]:
import copy
import os

import pandas as pd

# Root folder of the Code4ML CSV files. Set the CODE4ML_PATH environment
# variable to use a different location; when it is unset, the original local
# path is used, so existing behaviour is unchanged.
code4ML_path = os.environ.get(
    "CODE4ML_PATH",
    "/home/ryounis/Documents/Zurich/PEACHLab/data/Code4ML",
)

# Load each CSV table referenced by the cells below into its own DataFrame.
kernels_meta = pd.read_csv(f"{code4ML_path}/kernels_meta.csv")
markup_data = pd.read_csv(f"{code4ML_path}/markup_data.csv")
vertices = pd.read_csv(f"{code4ML_path}/vertices.csv")
code_blocks = pd.read_csv(f"{code4ML_path}/code_blocks.csv")
data_preds = pd.read_csv(f"{code4ML_path}/data_with_preds.csv")

# Display the code-block table as the cell output.
code_blocks

In [None]:
# 1-based rank (by number of kernels) of the competition to select.
N_HIGHEST_COMP = 4

# Number of kernels per competition, computed two ways:
# - kernels_count: indexed by 'comp_name' (groupby order)
# - n_kernels_per_comp: sorted by descending count (value_counts order)
kernels_count = kernels_meta.groupby('comp_name').size()
n_kernels_per_comp = kernels_meta['comp_name'].value_counts()

# Fail with a clear message instead of a bare IndexError when the requested
# rank does not exist.
if len(kernels_count) < N_HIGHEST_COMP:
    raise ValueError(
        f"Cannot select rank {N_HIGHEST_COMP}: only {len(kernels_count)} competitions available"
    )

# Compute the top-N once and take its last entry, i.e. the competition at
# rank N_HIGHEST_COMP (not the overall maximum unless N_HIGHEST_COMP == 1).
ranked_comps = kernels_count.nlargest(N_HIGHEST_COMP)
top_comp = ranked_comps.index[N_HIGHEST_COMP - 1]
top_comp_n_kernels = ranked_comps.iloc[N_HIGHEST_COMP - 1]
print(f"Competition ranked #{N_HIGHEST_COMP} by number of kernels: '{top_comp}'  ({top_comp_n_kernels} kernels)\n")

# List every competition with its kernel count, most kernels first.
for name, n_kernels in n_kernels_per_comp.items():
    print(f"{name}: {n_kernels} kernels")

In [None]:
# Metadata rows of the kernels that belong to the selected competition.
kernels = kernels_meta.loc[kernels_meta['comp_name'] == top_comp]

# Keep only the code blocks written in those kernels, then attach the kernel
# metadata columns through a join on 'kernel_id'.
belongs_to_top_comp = code_blocks['kernel_id'].isin(kernels['kernel_id'])
merged_df = code_blocks[belongs_to_top_comp].merge(kernels_meta, on='kernel_id')

# Remove the metadata columns that the following cells do not use.
unused_meta_columns = ['kaggle_score', 'kaggle_comments', 'kaggle_upvotes', 'comp_name']
top_comp_code_blocks = merged_df.drop(columns=unused_meta_columns)
top_comp_code_blocks


In [None]:
# Step 1: attach the prediction columns to each code block.
blocks_with_preds = top_comp_code_blocks.merge(data_preds, on='code_blocks_index')

# Step 2: look up each predicted vertex id in the vertices table.
merged_df = pd.merge(
    blocks_with_preds,
    vertices,
    left_on='predicted_graph_vertex_id',
    right_on='graph_vertex_id',
)

# The two join keys are dropped once the tables are combined.
merged_df = merged_df.drop(columns=['code_blocks_index', 'predicted_graph_vertex_id'])
merged_df

In [None]:
# Folder that receives the per-competition CSV export.
test_dataset_dir = "../../data/test_datasets"

# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(test_dataset_dir, exist_ok=True)

# One CSV named after the selected competition; the index is not written.
merged_df.to_csv(f'{test_dataset_dir}/{top_comp}.csv', index=False)

In [None]:
import sys
sys.path.insert(0, '../')
from utils.constants import BLANK_IPYNB_JSON
import json
import os

# Output folder for the generated .ipynb files of the selected competition.
# exist_ok=True replaces the separate existence check and avoids the race
# between checking and creating.
test_ipynb_dir = f"../../data/test_datasets/{top_comp}"
os.makedirs(test_ipynb_dir, exist_ok=True)

distinct_kernel_links = merged_df['kernel_link'].unique()
notebooks = []

# groupby(sort=False) yields one sub-frame per kernel link in order of first
# appearance, instead of re-filtering the whole frame for every link.
for i, (kernel_link, kernel_df) in enumerate(merged_df.groupby('kernel_link', sort=False)):
    print(f"Notebook {i+1}/{len(distinct_kernel_links)}: {len(kernel_df)} cells")

    # A fresh notebook skeleton per kernel so that no nested dict is shared
    # between the notebooks collected in `notebooks`.
    ipynb = {
        "cells": [],
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3"
            },
            "language_info": {
                "codemirror_mode": {
                    "name": "ipython",
                    "version": 3
                },
                "file_extension": ".py",
                "mimetype": "text/x-python",
                "name": "python",
                "nbconvert_exporter": "python",
                "pygments_lexer": "ipython3",
                "version": "3.10.12"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 4
    }

    # One code cell per row; the vertex class information and the prediction
    # probability are stored in the cell metadata.
    for _, row in kernel_df.iterrows():
        ipynb['cells'].append({
            "cell_type": "code",
            "execution_count": None,
            "metadata": {
                "class": row["graph_vertex_class"],
                "subclass": row["graph_vertex_subclass"],
                "subclass_id": row["graph_vertex_id"],
                "predicted_subclass_probability": row["predicted_graph_vertex__probability"],
                "notebook_id": row['kernel_id'],
            },
            "source": row['code_block']
        })
    notebooks.append(ipynb)

    # The file is named after the last path segment of the kernel link.
    # NOTE(review): two links sharing the same last segment would overwrite
    # each other - confirm that this cannot happen in the dataset.
    notebook_path = f"{test_ipynb_dir}/{kernel_link.split('/')[-1]}.ipynb"
    with open(notebook_path, "w") as f:
        json.dump(ipynb, f)

In [None]:
import sys
sys.path.insert(0, '../')
from utils.constants import FIRST_LAYER_LABELS, SECOND_LAYER_LABELS, BLANK_IPYNB_JSON
from Classifiers.GPTClassifier import GPTClassifier
# Read the API key from disk. strip() removes surrounding whitespace such as
# the trailing newline of the file, which would otherwise become part of the key.
with open('../../secrets/api_key.txt', 'r') as f:
    api_key = f.read().strip()

# Label set used both in the prompt and by the classifier; switch to the
# commented line to use the second-layer labels instead.
LABELS = FIRST_LAYER_LABELS
# LABELS = SECOND_LAYER_LABELS


print("Initializing classifier...")
# The prompt lists all labels as "a, b, ... or z" and fixes the answer format
# ("Class: ..." / "Description: ...").
prompt = f"""You will be given each code cell of the same jupyter notebook of a machine learning task.
First, classify the code into one {', '.join(LABELS[:-1])} or {LABELS[-1]}.
Consider the previously classified code snippets for context.
Then, describe what the code snippet does in strictly one sentence.
Explain your reasoning for the classification and then output the desired format at the end.
Desired format:
Class: <class_label>
Description: <desctiption_sentence>
"""
classifier = GPTClassifier(api_key=api_key, prompt=prompt, labels=LABELS)

In [None]:
# Give the shared notebook template an empty cell list, then display it.
BLANK_IPYNB_JSON.update({"cells": []})
BLANK_IPYNB_JSON

In [None]:
from utils.helper_functions import notebook_extract_code, notebook_add_class_labels

# Only the first MAX_NOTEBOOKS distinct kernels are turned into notebooks
# (presumably to limit the number of classifier calls - confirm).
MAX_NOTEBOOKS = 20

notebooks = []
for kernel_id in merged_df['kernel_id'].unique()[:MAX_NOTEBOOKS]:
    # deepcopy instead of dict.copy(): a shallow copy would share every nested
    # object of the template between all notebooks, so a change to one
    # notebook's nested data would show up in all the others.
    notebook_json = copy.deepcopy(BLANK_IPYNB_JSON)
    notebook_json['cells'] = []

    # One code cell per row of this kernel; the vertex id and its prediction
    # probability are kept in the cell metadata.
    kernel_rows = merged_df[merged_df['kernel_id'] == kernel_id]
    for _, row in kernel_rows.iterrows():
        notebook_json['cells'].append({
            "cell_type": "code",
            "execution_count": None,
            "metadata": {
                "graph_vertex_id": row["graph_vertex_id"],
                "predicted_graph_vertex__probability": row["predicted_graph_vertex__probability"],
                "notebook_id": row['kernel_id'],
            },
            "source": row['code_block']
        })
    notebooks.append(notebook_json)


In [None]:
# Run the classifier over every notebook and keep two parallel lists:
# the notebooks with labels attached, and the raw per-cell classifier results.
labeled_notebooks = []
notebook_cell_labels = []
n_notebooks = len(notebooks)
for position, notebook_json in enumerate(notebooks, start=1):
    # '\r' keeps the progress counter on a single console line.
    print(f"Notebook {position}/{n_notebooks}", end='\r')

    notebook_code = notebook_extract_code(notebook_json)
    cell_labels = classifier.classify_notebook(notebook_code)
    labeled_notebook = notebook_add_class_labels(notebook_json, cell_labels)

    labeled_notebooks.append(labeled_notebook)
    notebook_cell_labels.append(cell_labels)

In [None]:
# Flatten the per-notebook classifier results into one list, taking the second
# entry of every per-cell result, in notebook order then cell order.
# NOTE(review): presumably elem[1] is the cell's embedding vector - confirm
# against classify_notebook's return value.
embeddings = [
    cell_result[1]
    for notebook_results in notebook_cell_labels
    for cell_result in notebook_results
]
embeddings

In [None]:
from sklearn.cluster import HDBSCAN

# HDBSCAN settings:
# - min_cluster_size: minimum number of samples needed to form a cluster
# - min_samples: minimum neighbourhood size for a sample to be a core point
# - cluster_selection_epsilon: clusters closer than this distance get merged
clusterer: HDBSCAN = HDBSCAN(
    min_cluster_size=4,
    min_samples=2,
    cluster_selection_epsilon=0,
)

clusterer.fit(embeddings)

# Report how many cells were assigned to each distinct cluster label.
for label in set(clusterer.labels_):
    n_cells = sum(1 for assigned in clusterer.labels_ if assigned == label)
    print(f"Cluster {label}: {n_cells} cells")
clusterer.labels_

In [None]:
# Walk over all cells in notebook order and give the k-th cell the k-th
# cluster label; this relies on the labels being in the same order as the cells.
all_cells = (cell for notebook in labeled_notebooks for cell in notebook['cells'])
counter = 0
for cell in all_cells:
    cell['metadata']['cluster_label'] = clusterer.labels_[counter]
    counter += 1

In [None]:
# Collect, in notebook order then cell order, the reference vertex id and the
# assigned cluster label of every cell. The two lists are index-aligned.
all_cell_metadata = [
    cell["metadata"]
    for notebook in labeled_notebooks
    for cell in notebook["cells"]
]
true_labels = [metadata["graph_vertex_id"] for metadata in all_cell_metadata]
predicted_labels = [metadata["cluster_label"] for metadata in all_cell_metadata]

In [None]:
from scipy.optimize import linear_sum_assignment
import numpy as np

def count_misclustered_elements(true_labels, predicted_labels):
    """Count the elements left outside the best one-to-one class/cluster matching.

    Each true label is matched to at most one predicted label (and vice versa)
    so that the number of elements whose (true, predicted) pair is part of the
    matching is maximal. Every element outside that matching counts as
    misclustered. This includes all elements of a true label that could not be
    matched at all because there are fewer predicted labels than true labels.

    Every distinct predicted value is treated as its own cluster; no value is
    special-cased.

    Args:
        true_labels: Sequence of reference labels, one per element.
        predicted_labels: Sequence of cluster labels, aligned with
            ``true_labels``.

    Returns:
        int: Number of misclustered elements (0 for empty input).

    Raises:
        ValueError: If the two label sequences do not have the same shape.
    """
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)

    if true_labels.shape != predicted_labels.shape:
        raise ValueError(
            f"true_labels and predicted_labels must have the same shape, "
            f"got {true_labels.shape} and {predicted_labels.shape}"
        )
    if true_labels.size == 0:
        return 0

    # Map every label to its position among the sorted unique labels.
    unique_true_labels, true_idx = np.unique(true_labels, return_inverse=True)
    unique_predicted_labels, predicted_idx = np.unique(predicted_labels, return_inverse=True)

    # overlap[i, j] = number of elements with true label i and predicted label j.
    overlap = np.zeros((len(unique_true_labels), len(unique_predicted_labels)), dtype=int)
    np.add.at(overlap, (true_idx.ravel(), predicted_idx.ravel()), 1)

    # Maximise the matched overlap. Minimising per-row mismatch counts instead
    # would ignore the rows that stay unassigned in a rectangular problem and
    # therefore undercount the misclustered elements.
    row_ind, col_ind = linear_sum_assignment(overlap, maximize=True)
    matched_count = overlap[row_ind, col_ind].sum()

    return int(true_labels.size - matched_count)

# Evaluate the clustering against the reference labels and report the result.
misclustered_count = count_misclustered_elements(true_labels, predicted_labels)
n_evaluated = len(true_labels)
print(f"Number of misclustered elements: {misclustered_count}/{n_evaluated}")

# The score is the fraction of correctly clustered elements; the ".2%" format
# multiplies by 100 so the printed number really is a percentage. With no
# elements the fraction is undefined, so avoid the division altogether.
if n_evaluated:
    print(f"Score: {1 - misclustered_count / n_evaluated:.2%}")
else:
    print("Score: n/a (no elements to evaluate)")


In [None]:
# Start from the first cluster label. This loop indexes clusterer.labels_ by
# cell position, so it must not continue from a counter value left behind by a
# previously executed cell (that would run past the end of labels_).
counter = 0
for notebook in labeled_notebooks:
    for cell in notebook['cells']:
        cell['metadata']['cluster_label'] = clusterer.labels_[counter]
        counter += 1