In [31]:
import os
import pickle

# Task id whose label balance is reported by this cell.
task_name = 'CHEMBL1243966'

base_path = '/FS-MOL/datasets/fs-mol-merged-cleaned/valid'

# Keep every file in the directory whose name contains the task id.
task_files = os.listdir(base_path)
matched_task_files = [f for f in task_files if task_name in f]

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point this cell at dataset files you trust.
tasks = []
for path in matched_task_files:
    # Open through a context manager so each handle is closed right after
    # loading instead of being left for the garbage collector.
    with open(f'{base_path}/{path}', 'rb') as task_file:
        tasks.append(pickle.load(task_file))

# Each label exposes .item(); presumably a tensor/array scalar -- TODO confirm.
labels = [sample.label for task in tasks for sample in task.samples]
pos_count = sum(t.item() for t in labels)
total_count = len(labels)

# With no matching files there are no labels; report NaN instead of raising
# ZeroDivisionError so the zero counts are still printed.
pos_ratio = pos_count / total_count if total_count else float('nan')
print(f'Positive Count: {pos_count}, Total Count: {total_count}, Positive Ratio: {pos_ratio}')

Positive Count: 100, Total Count: 201, Positive Ratio: 0.4975124378109453


In [33]:
import gzip
import json
from pathlib import Path

def parse_jsongz(p):
    """Parse a gzip-compressed JSON Lines file into a list of decoded values.

    Args:
        p: Location of the gzipped JSONL file, passed straight to ``gzip.open``.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with gzip.open(p, 'rt', encoding='utf-8') as file:
        # Blank lines (for example a trailing empty line) hold no record and
        # would make json.loads raise, so they are skipped.
        return [json.loads(text) for text in map(str.strip, file) if text]

# FS-Mol dataset directory; task files are read from its 'valid' subdirectory.
fsmol_root_dir = Path('/FS-MOL/datasets/fs-mol')
fsmol_tasks_dir = fsmol_root_dir / 'valid'

# `task_name` is not defined in this cell; it must already be set by an
# earlier cell. Entries are sorted so the order of `task_files` (and hence of
# `samples`) does not depend on the arbitrary order of a directory listing.
task_files = sorted(
    task_path
    for task_path in fsmol_tasks_dir.iterdir()
    if task_name in task_path.name
)

# One list of decoded records per matching file, then flattened into `samples`.
tasks = [parse_jsongz(task_file) for task_file in task_files]
samples = [sample for task in tasks for sample in task]



In [None]:

# Display every loaded sample whose 'SMILES' entry equals this exact string.
[
    record
    for record in samples
    if record['SMILES'] == 'CN1C(O)=C(C(=O)Nc2ccc(Cl)cc2)c2cc(Cl)ccc2S1(=O)=O'
]

In [5]:
import pandas as pd
import wandb
import json

# W&B artifact that holds the metrics table, and the table file inside it.
artifact_name = "dest/molecular_representation_comparison/run-kx8gm23s-metrics_table:v34"
table_name = "metrics_table.table.json"

# Download the artifact; download() returns the local directory it wrote to.
api = wandb.Api()
artifact = api.artifact(artifact_name)
artifact_path = artifact.download()

# Read the table as UTF-8 explicitly so decoding does not depend on the
# platform's default encoding.
table_path = f"{artifact_path}/{table_name}"
with open(table_path, encoding="utf-8") as file:
    json_dict = json.load(file)

# The table file stores rows under "data" and column names under "columns".
data = pd.DataFrame(json_dict["data"], columns=json_dict["columns"])

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [6]:
import altair as alt

# Task names to emphasise; leave the list empty to colour every task normally.
highlight_tasks = []

if not highlight_tasks:
    # Nothing to highlight: one colour per task, legend on the right,
    # every box fully opaque.
    color_encoding = alt.Color(
        'task_name:N',
        title='Task Name',
        scale=alt.Scale(scheme='category20'),
        legend=alt.Legend(orient='right'),
    )
    opacity_encoding = alt.value(1)
else:
    # Predicate that is true for rows whose task_name is in highlight_tasks.
    highlight_condition = alt.FieldOneOfPredicate(field='task_name', oneOf=highlight_tasks)

    # Highlighted tasks keep their own colour at full opacity; all other
    # tasks are drawn light gray at half opacity.
    color_encoding = alt.condition(
        highlight_condition,
        alt.Color('task_name:N', title='Task Name', scale=alt.Scale(scheme='category20')),
        alt.value('lightgray'),
    )
    opacity_encoding = alt.condition(
        highlight_condition,
        alt.value(1),
        alt.value(0.5),
    )

# Horizontal min-max boxplot of optimistic_delta_auc_pr, one row per task.
chart = (
    alt.Chart(data)
    .mark_boxplot(extent='min-max', orient='horizontal')
    .encode(
        y=alt.Y(
            'task_name:O',
            title='Category (sorted by median)',
            sort=alt.SortField('mid_box_optimistic_delta_auc_pr', order='ascending'),
        ),
        x=alt.X('optimistic_delta_auc_pr:Q', title='Value'),
        color=color_encoding,
        opacity=opacity_encoding,
    )
    .properties(title='Boxplot with Highlighted Tasks')
)

# Last expression of the cell, so the notebook renders the chart.
chart


In [18]:
import altair as alt

# Horizontal min-max boxplot of optimistic_delta_auc_pr with one colour per
# task and the legend placed on the right.
chart = (
    alt.Chart(data)
    .mark_boxplot(extent='min-max', orient='horizontal')
    .encode(
        y=alt.Y(
            'task_name:O',
            title='Category (sorted by median)',
            sort=alt.SortField('mid_box_optimistic_delta_auc_pr', order='ascending'),
        ),
        x=alt.X('optimistic_delta_auc_pr:Q', title='Value'),
        color=alt.Color(
            'task_name:N',
            title='Task Name',
            scale=alt.Scale(scheme='category20'),
            legend=alt.Legend(orient='right'),
        ),
    )
    .properties(title='Boxplot with Unique Colors for Each Task')
)

# Last expression of the cell, so the notebook renders the chart.
chart


In [3]:
import altair as alt

# Task whose box is drawn in red. NOTE: this rebinds the notebook-wide
# `task_name` that the data-loading cells also read.
task_name = 'CHEMBL1614259'

# Horizontal min-max boxplot of optimistic_delta_auc_pr, one row per task,
# with the selected task coloured differently from the rest.
chart = alt.Chart(data).mark_boxplot(extent='min-max', orient='horizontal').encode(
    y=alt.Y('task_name:O', title='Category (sorted by median)',
            sort=alt.SortField('mid_box_optimistic_delta_auc_pr', order='ascending')),
    x=alt.X('optimistic_delta_auc_pr:Q', title='Value'),
    color=alt.condition(
        alt.datum.task_name == task_name,
        alt.value('red'),       # Colour for the task named in `task_name`
        alt.value('steelblue')  # Default colour for every other task
    )
).properties(
    # The figure is a boxplot, so the title says so and names the red task.
    title=f'Boxplot with {task_name} Highlighted'
)

# Display the chart
chart

# Author's notes on task ids of interest:
# Worse Fingerprint Performing = CHEMBL3888867
# 3D Worse Performing = CHEMBL1614259