# Goals

* Add Chris' datasets, which lack SRA experiment accession numbers, to the database and the `prodC` GCP bucket
* Datasets lacking an SRA experiment accession:
  * `/processed_datasets/scRecount/cellxgene/`
    * `marmoset_brain_cell_atlas`, `AIDB`, `brain_cell_atlas`, `neocortex_atlas`


In [2]:
import os
from pathlib import Path
import pandas as pd
import plotnine as pn
from pypika import Query, Table, functions as fn

from SRAgent.db.connect import db_connect

In [3]:
# Notebook-wide settings: pandas display limits, plot theme, and config environment
pd.options.display.max_columns = None
pd.options.display.max_rows = 4
pn.theme_set(pn.theme_bw())
# NOTE(review): presumably selects the "prod" settings used by SRAgent — confirm
os.environ["DYNACONF"] = "prod"

In [4]:
# get base of github repo
base_dir = !git rev-parse --show-toplevel
base_dir = Path(base_dir[0])
base_dir

PosixPath('/home/nickyoungblut/dev/python/scBaseCount_analysis')

In [5]:
# Root directory of the datasets listed under "Goals"
data_base_dir = Path("/processed_datasets/scRecount/cellxgene")
data_base_dir.exists()  # displayed as a quick sanity check

True

# AIDB

In [6]:
# Directory holding the AIDB dataset
aidb_data_dir = data_base_dir.joinpath("AIDB")
aidb_data_dir.exists()  # displayed as a quick sanity check

True

In [10]:
# Find all matrix files under the AIDB directory.
# Path components are read relative to `aidb_data_dir` (instead of absolute
# indices into the full path), so the parsing keeps working if the data
# directory is moved to a different depth. Expected layout:
#   <sample_name>/Solo.out/<feature>/<processing>/matrix.mtx.gz
matrix_records = []
for mtx_path in aidb_data_dir.rglob("matrix.mtx.gz"):
    rel_parts = mtx_path.relative_to(aidb_data_dir).parts
    matrix_records.append(
        {
            "sample_name": rel_parts[0],
            "feature": rel_parts[2],
            "processing": mtx_path.parent.name,  # e.g. "filtered" or "raw"
            "path": mtx_path,
        }
    )

# Explicit columns fix the column order and keep the schema when nothing is found
matrix_info = pd.DataFrame(matrix_records, columns=["sample_name", "feature", "processing", "path"])
matrix_info

Unnamed: 0,sample_name,feature,processing,path
0,KR_SGI_B012_H159,GeneFull,filtered,/processed_datasets/scRecount/cellxgene/AIDB/K...
1,KR_SGI_B012_H159,GeneFull,raw,/processed_datasets/scRecount/cellxgene/AIDB/K...
...,...,...,...,...
3006,012_L002_5GE8099,GeneFull,filtered,/processed_datasets/scRecount/cellxgene/AIDB/0...
3007,012_L002_5GE8099,GeneFull,raw,/processed_datasets/scRecount/cellxgene/AIDB/0...


In [27]:
# Find all Summary.csv files under the AIDB directory.
# Layout seen in the paths: <sample_name>/Solo.out/<feature>/Summary.csv
summary_records = []
for summary_path in aidb_data_dir.rglob("Summary.csv"):
    summary_records.append(
        {
            "sample_name": summary_path.parts[-4],
            # `.name`, not `.stem`: this is a directory name, so nothing after a
            # "." in it should be stripped as if it were a file extension
            "feature": summary_path.parent.name,
            "path": summary_path,
        }
    )

# Explicit columns fix the column order and keep the schema when nothing is found
sum_info = pd.DataFrame(summary_records, columns=["sample_name", "feature", "path"])
sum_info

Unnamed: 0,sample_name,feature,path
0,KR_SGI_B012_H159,Velocyto,/processed_datasets/scRecount/cellxgene/AIDB/KR_SGI_B012_H159/Solo.out/Velocyto/Summary.csv
1,KR_SGI_B012_H159,GeneFull,/processed_datasets/scRecount/cellxgene/AIDB/KR_SGI_B012_H159/Solo.out/GeneFull/Summary.csv
...,...,...,...
1877,012_L002_5GE8099,GeneFull_ExonOverIntron,/processed_datasets/scRecount/cellxgene/AIDB/012_L002_5GE8099/Solo.out/GeneFull_ExonOverIntron/Summary.csv
1878,012_L002_5GE8099,GeneFull,/processed_datasets/scRecount/cellxgene/AIDB/012_L002_5GE8099/Solo.out/GeneFull/Summary.csv


In [28]:
# Filter sum_info to just the samples that have a *filtered* matrix.
# matrix_info lists both "raw" and "filtered" matrices, so it is restricted to
# the "filtered" rows before matching on sample_name; otherwise a sample with
# only a raw matrix would also pass the filter.
samples_with_filtered = matrix_info.loc[matrix_info["processing"] == "filtered", "sample_name"]
sum_info_f = sum_info[sum_info["sample_name"].isin(samples_with_filtered)]
sum_info_f

Unnamed: 0,sample_name,feature,path
0,KR_SGI_B012_H159,Velocyto,/processed_datasets/scRecount/cellxgene/AIDB/KR_SGI_B012_H159/Solo.out/Velocyto/Summary.csv
1,KR_SGI_B012_H159,GeneFull,/processed_datasets/scRecount/cellxgene/AIDB/KR_SGI_B012_H159/Solo.out/GeneFull/Summary.csv
...,...,...,...
1877,012_L002_5GE8099,GeneFull_ExonOverIntron,/processed_datasets/scRecount/cellxgene/AIDB/012_L002_5GE8099/Solo.out/GeneFull_ExonOverIntron/Summary.csv
1878,012_L002_5GE8099,GeneFull,/processed_datasets/scRecount/cellxgene/AIDB/012_L002_5GE8099/Solo.out/GeneFull/Summary.csv


In [29]:
# Count the distinct samples that remain after filtering
n_samples = sum_info_f["sample_name"].nunique()
n_samples

376

In [30]:
# Look for samples that appear more than once among the "GeneFull_Ex50pAS" summaries
is_ex50 = sum_info_f["feature"] == "GeneFull_Ex50pAS"
ex50_summaries = sum_info_f.loc[is_ex50]
is_repeat = ex50_summaries["sample_name"].duplicated()
df_dup = ex50_summaries.loc[is_repeat].sort_values("sample_name")

# Widen the display limits so up to 100 rows and the full path are shown
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", None)
df_dup

Unnamed: 0,sample_name,feature,path


In [31]:
# Restore the compact row limit from the configuration cell
# (only display.max_rows is reset here; display.max_colwidth is not touched)
pd.set_option("display.max_rows", 4)

# Create ARC accessions

In [32]:
# For each sample_name, create an "ARC" accession (e.g., "ARC0000001").
# Accessions are numbered from 1 in order of first appearance in sum_info_f.
# NOTE(review): that order comes from the rglob traversal that built sum_info,
# so the sample -> accession mapping is only as stable as that traversal —
# confirm before regenerating a table that has already been uploaded.
sample_accessions = pd.DataFrame({"sample_name": sum_info_f["sample_name"].unique()})
sample_accessions["accession"] = [
    f"ARC{i:07d}" for i in range(1, len(sample_accessions) + 1)
]

# Drop any accession column left by a previous run of this cell so that
# re-running does not produce "accession_x" / "accession_y" columns.
sum_info_f = (
    sum_info_f
    .drop(columns="accession", errors="ignore")
    .merge(sample_accessions, on="sample_name", how="left")
)
sum_info_f

Unnamed: 0,sample_name,feature,path,accession
0,KR_SGI_B012_H159,Velocyto,/processed_datasets/scRecount/cellxgene/AIDB/KR_SGI_B012_H159/Solo.out/Velocyto/Summary.csv,ARC0000001
1,KR_SGI_B012_H159,GeneFull,/processed_datasets/scRecount/cellxgene/AIDB/KR_SGI_B012_H159/Solo.out/GeneFull/Summary.csv,ARC0000001
...,...,...,...,...
1877,012_L002_5GE8099,GeneFull_ExonOverIntron,/processed_datasets/scRecount/cellxgene/AIDB/012_L002_5GE8099/Solo.out/GeneFull_ExonOverIntron/Summary.csv,ARC0000376
1878,012_L002_5GE8099,GeneFull,/processed_datasets/scRecount/cellxgene/AIDB/012_L002_5GE8099/Solo.out/GeneFull/Summary.csv,ARC0000376


In [33]:
# Save the per-summary-file table (with accessions) as CSV inside the repository
outfile = base_dir.joinpath('data', 'chris_data', '2025-07-03_aidb_add-meta-star.csv')
sum_info_f.to_csv(outfile, index=False)
print(f"File written to {outfile}")

File written to /home/nickyoungblut/dev/python/scBaseCount_analysis/data/chris_data/2025-07-03_aidb_add-meta-star.csv


### Upload to database and `prodC` GCP bucket location

In [34]:
# Inputs for the upload job: the samples table and the upload script
csv_file = base_dir.joinpath('data', 'chris_data', '2025-07-03_aidb_add-meta-star.csv')
exe = base_dir.joinpath('scripts', 'ChrisDataNoSRX2SQL-DB.py')

# Show whether each input exists before building the SLURM job
print(csv_file.exists())
print(exe.exists())

True
True


#### Test Run

In [37]:
slurm_job = f"""#!/bin/bash
#SBATCH --job-name=ChrisDataNoSRX2SQLDB  
#SBATCH --output=ChrisDataNoSRX2SQLDB.out
#SBATCH --error=ChrisDataNoSRX2SQLDB.err
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=1
#SBATCH --mem=8G
#SBATCH --time=8:00:00

{exe} \\
  --tenant test \\
  --max-datasets 2 \\
  --samples-table {csv_file} \\
  --ignore-missing \\
  --gcp-bucket gs://arc-ctc-screcounter/test/prodC/ \\
  --database sra \\
  --is-illumina yes \\
  --is-single-cell yes \\
  --is-paired-end yes \\
  --lib-prep 10x_Genomics \\
  --tech-10x 5_prime_gex \\
  --cell-prep single_cell \\
  --organism "Home sapiens" \\
  --tissue "blood" \\
  --tissue-ontology-term-id "UBERON:0000178" \\
  --czi-collection-name "Asian immune diversity atlas" \\
  --czi-collection-id "ced320a1-29f3-47c1-a735-513c7084d508" \\
  --notes "Processed manually by Chris"
"""

job_file = "ChrisDataNoSRX2SQL-DB.sh"
with open(job_file, "w") as f:
    f.write(slurm_job)

!chmod +x ChrisDataNoSRX2SQL-DB.sh
!cat ChrisDataNoSRX2SQL-DB.sh

#!/bin/bash
#SBATCH --job-name=ChrisDataNoSRX2SQLDB  
#SBATCH --output=ChrisDataNoSRX2SQLDB.out
#SBATCH --error=ChrisDataNoSRX2SQLDB.err
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=1
#SBATCH --mem=8G
#SBATCH --time=8:00:00

/home/nickyoungblut/dev/python/scBaseCount_analysis/scripts/ChrisDataNoSRX2SQL-DB.py \
  --tenant test \
  --max-datasets 2 \
  --samples-table /home/nickyoungblut/dev/python/scBaseCount_analysis/data/chris_data/2025-07-03_aidb_add-meta-star.csv \
  --ignore-missing \
  --gcp-bucket gs://arc-ctc-screcounter/test/prodC/ \
  --database sra \
  --is-illumina yes \
  --is-single-cell yes \
  --is-paired-end yes \
  --lib-prep 10x_Genomics \
  --tech-10x 5_prime_gex \
  --cell-prep single_cell \
  --organism "Home sapiens" \
  --tissue "blood" \
  --tissue-ontology-term-id "UBERON:0000178" \
  --czi-collection-name "Asian immune diversity atlas" \
  --czi-collection-id "ced320a1-29f3-47c1-a735-513c7084d508" \
  --notes "Processed manually by Chris"


In [38]:
# submit job
!sbatch ChrisDataNoSRX2SQL-DB.sh

Submitted batch job 935510


In [41]:
# check for errors: show the end of the stderr file named by `#SBATCH --error` in the job script
!tail ChrisDataNoSRX2SQLDB.err

#### Full run

In [None]:
slurm_job = f"""#!/bin/bash
#SBATCH --job-name=ChrisDataNoSRX2SQLDB  
#SBATCH --output=ChrisDataNoSRX2SQLDB.out
#SBATCH --error=ChrisDataNoSRX2SQLDB.err
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=1
#SBATCH --mem=12G
#SBATCH --time=72:00:00

{exe} \\
  --tenant prod \\
  --samples-table {csv_file} \\
  --ignore-missing \\
  --gcp-bucket gs://arc-ctc-screcounter/prodC/ \\
  --database sra \\
  --is-illumina yes \\
  --is-single-cell yes \\
  --is-paired-end yes \\
  --lib-prep 10x_Genomics \\
  --tech-10x 5_prime_gex \\
  --cell-prep single_cell \\
  --organism "Home sapiens" \\
  --tissue "blood" \\
  --tissue-ontology-term-id "UBERON:0000178" \\
  --czi-collection-name "Asian immune diversity atlas" \\
  --czi-collection-id "ced320a1-29f3-47c1-a735-513c7084d508" \\
  --notes "Processed manually by Chris"
"""

job_file = "ChrisDataNoSRX2SQL-DB.sh"
with open(job_file, "w") as f:
    f.write(slurm_job)

!chmod +x ChrisDataNoSRX2SQL-DB.sh
!cat ChrisDataNoSRX2SQL-DB.sh

In [None]:
# submit job
!sbatch ChrisDataNoSRX2SQL-DB.sh

In [None]:
# check for errors: show the last 20 lines of the stderr file named by `#SBATCH --error` in the job script
!tail -n 20 ChrisDataNoSRX2SQLDB.err