In [1]:
from polaris.dataset import ColumnAnnotation, DatasetV1
from polaris.hub.client import PolarisHubClient
from polaris.benchmark import SingleTaskBenchmarkSpecification
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the Polaris Hub client and log in. `client` is reused by the
# upload cells further down (upload_benchmark / upload_dataset).
client = PolarisHubClient()
client.login()

[32m2024-10-18 04:35:13.043[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m263[0m - [32m[1mYou are successfully logged in to the Polaris Hub.[0m


In [3]:
# Read the source table into a DataFrame; `df` is later passed as `table=`
# when the Dataset object is constructed.
# Alternative input file, currently disabled:
# df = pd.read_csv("dna/OneOmics_CHO_Global_Spectral_Library_with_gene_ids_filled3.csv")
df = pd.read_csv("chodata.csv")

In [4]:
from polaris.dataset import ColumnAnnotation

# Column-level metadata, keyed by the column name in the DataFrame.
# For a real dataset every column should be annotated.
#
# The keyword must be spelled `description`. It was previously misspelt
# as `desription`, which is not the annotation's description field, so the
# text below never ended up as the column description.
annotations = {
    "log_prec_y": ColumnAnnotation(
        description="Expression value on log scale clamped from 0 to 1",
    ),
    "Gene ID": ColumnAnnotation(description="Gene ID"),
    "sequence": ColumnAnnotation(description="raw DNA sequence"),
    "description": ColumnAnnotation(description="gene identification data"),
}


In [5]:
from polaris.dataset import Dataset
from polaris.utils.types import HubOwner

# Wrap the DataFrame and its metadata in a Polaris Dataset object.
dataset = Dataset(
    # --- Identity on the Hub ---
    name="cho-dna-expression-prediction-dataset",
    owner="vishrut64",
    description="Collection of 11066 DNA samples from Chinese Hamster Ovary cells with corresponding processed expression values (log scale clamped 0 to 1)",
    # --- Provenance, licensing and free-form metadata ---
    source="https://github.com/RJain12/choformer",
    license="MIT",
    tags=["DNA", "Genomics"],
    user_attributes={"year": "2024"},
    # --- The data itself plus the per-column annotations ---
    table=df,
    annotations=annotations,
)


In [3]:
import polaris as po

# Fetch the published dataset from the Polaris Hub; the identifier combines
# the owner ("vishrut64") and the dataset name used when it was constructed.
# NOTE: this rebinds `dataset`, replacing the locally constructed Dataset
# object from the earlier cell.
dataset = po.load_dataset("vishrut64/cho-dna-expression-prediction-dataset")

⠹ Fetching artifact... 

[32m2024-10-18 04:35:21.945[0m | [1mINFO    [0m | [36mpolaris.mixins._checksum[0m:[36mverify_checksum[0m:[36m65[0m - [1mTo verify the checksum, we need to recompute it. This can be slow for large datasets.[0m


✅ SUCCESS: [1mFetched artifact.[0m
 


  self._color = self._set_color(value) if value else value


In [4]:
import pickle

# Load the pre-computed (train, test) index collections for the benchmark split.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load "splits_dna.pkl" if it comes from a trusted source.
with open("splits_dna.pkl", "rb") as f:
    train, test = pickle.load(f)

# De-duplicate each partition and make the two disjoint: an index that occurs
# in both partitions is kept in train and dropped from test.
# Set arithmetic catches every shared index regardless of its value. The
# previous scan over range(len(train) + len(test)) silently missed any shared
# index at or above that bound. It also avoids repeated linear scans of lists.
train_indices = set(train)
test_indices = set(test) - train_indices

# Sort so the resulting split has a well-defined, reproducible order.
train = sorted(train_indices)
test = sorted(test_indices)

# Guard the invariant the benchmark relies on: no sample is in both partitions.
assert not set(train) & set(test), "train and test splits overlap"

In [5]:
    
# Benchmark definition: predict "log_prec_y" from the gene ID and sequence
# columns, using the train/test index split prepared above and scoring with
# mean absolute error.
bench = SingleTaskBenchmarkSpecification(
    # --- Identity on the Hub ---
    name="cho-dna-expression-prediction-dataset-task",
    owner="vishrut64",
    description="DNA CHO gene expression benchmark task with decile-based splits for balanced training and test datasets",
    # --- Task definition ---
    dataset=dataset,
    input_cols=["Gene ID", "sequence"],
    target_cols="log_prec_y",
    split=(train, test),
    metrics="mean_absolute_error",
)

In [6]:
# Publish the benchmark to the Polaris Hub with public access.
# NOTE(review): the recorded output for this cell shows the Hub rejecting the
# upload because a benchmark with the same slug already exists, so re-running
# as-is raises PolarisHubError — confirm whether a new name is needed.
client.upload_benchmark(benchmark=bench, access="public")

⠋ Uploading artifact...

[32m2024-10-18 04:35:38.252[0m | [1mINFO    [0m | [36mpolaris.mixins._checksum[0m:[36mmd5sum[0m:[36m27[0m - [1mComputing the checksum. This can be slow for large datasets.[0m


💥 ERROR: Failed to upload benchmark. 


  self._color = self._set_color(value) if value else value


PolarisHubError: The request to the Polaris Hub failed. The Hub responded with:
{
  "message": "Benchmark 'cho-dna-expression-prediction-task', with slug 'cho-dna-expression-prediction-task', already exists"
}


In [6]:
# Publish the dataset to the Polaris Hub with public access.
# NOTE(review): this cell sits after the benchmark upload in the notebook, and
# `dataset` was rebound by po.load_dataset(...) in an earlier cell — confirm
# which object is meant to be uploaded and in which order the cells should run.
client.upload_dataset(dataset, access="public")

⠋ Uploading artifact...

[32m2024-10-18 04:33:08.959[0m | [1mINFO    [0m | [36mpolaris.mixins._checksum[0m:[36mmd5sum[0m:[36m27[0m - [1mComputing the checksum. This can be slow for large datasets.[0m


⠙                       

  self._color = self._set_color(value) if value else value


⠦                       

ValueError: Terminal size 80 is too small to display spinner with the given settings.

⠼                       

In [15]:
import polaris as po

# Pull a different dataset from the Hub (this rebinds `dataset`).
dataset = po.load_dataset("mlls/bend-variant-effects-disease")

# Query the dataset size.
dataset.size()

# Identify the first row and first column once, then reuse them below.
first_row = dataset.rows[0]
first_col = dataset.columns[0]

# Read a single datapoint via the explicit accessor...
dataset.get_data(row=first_row, col=first_col)

# ...and via the equivalent indexing shorthand.
dataset[first_row, first_col]

# Fetch a whole row; being the last expression, this is what the cell displays.
dataset[0]


⠼ Fetching artifact... 

[32m2024-10-18 03:32:03.558[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (dev) is different from the currently installed version of Polaris (0.8.6).[0m
[32m2024-10-18 03:32:03.563[0m | [1mINFO    [0m | [36mpolaris.mixins._checksum[0m:[36mverify_checksum[0m:[36m65[0m - [1mTo verify the checksum, we need to recompute it. This can be slow for large datasets.[0m


✅ SUCCESS: [1mFetched artifact.[0m
 


  self._color = self._set_color(value) if value else value


{'sequence_wt': 'GGACGGGAAGCGGGCTGGGAAGTCGGGCCGAGGTGGGTGTGGGGTTCGGGGTGTATTTCGTCCACGAGCCGGGGAGGGGGTACTGGCCCTGCCGCTGACTGCGCGCAGAAGCGTGCCGCTCCCTCACAGGGTCTGCCTCGGCTCTGCTCGCAGGGAAAAGTCTGAAGACGCTTATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCCCGGGCTGCCGAATATCCTCCCCGGTGGTGAGATGCGGGGCTCGGTTGGGGCTGGGAGTTACTCTCCCCTGCGGAGCTTGTCCCTGCGGTTTTCAGGGTTTTCAGGATCGAGAGTCCTAACCTCACCCCTGCGGGTGTGCTGGAGGGAGCCTCCGAAGGGCAGGGGGAAGCGGCTTTACCTCGTGCTCTCCCAGCCCTTCTACCTGGACGGGGGAGGAGTCCTCGGGCACCCGAGCGCCCTCCCCGGTGGAGACAGGGGGGCCGCGCTTGTCTTAGAGCCTCCCCTTGGGTGCC',
 'sequence_var': 'GGACGGGAAGCGGGCTGGGAAGTCGGGCCGAGGTGGGTGTGGGGTTCGGGGTGTATTTCGTCCACGAGCCGGGGAGGGGGTACTGGCCCTGCCGCTGACTGCGCGCAGAAGCGTGCCGCTCCCTCACAGGGTCTGCCTCGGCTCTGCTCGCAGGGAAAAGTCTGAAGACGCTTATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCCCGGGCTGCCGAATATCCTCCCCGGTGGTGAGATGCGGAGCTCGGTTGGGGCTGGGAGTTACTCTCCCCTGCGGAGCTTGTCCCTGCGGTTTTCAGGGTTTTCAGGATCGAGAGTCCTAACCTCACCCCTGCGGGTGTGCTGGAGGGAGCCTCCGAAGGGCAGGGGGAAGCGGCTTTACCTCGTGCTCTCCCAGCCCTTCTACCTGGACGGGGGAGGAGTCCTCGGGCACCC