# Goals

* Reprocess Replogle et al., 2022 dataset
  * Just cell count data
* Reprocess will make the data consistent with the scBaseCount dataset
* Reprocessing from SRA read data
* Bioprojects
  * `PRJNA83156`
  * `PRJNA1100571`

In [1]:
import os
from glob import glob
import pandas as pd

In [2]:
# get base directory of repo
repo_base_dir = !git rev-parse --show-toplevel
repo_base_dir = repo_base_dir[0]

# `PRJNA83156`

## SRA metadata

### Get metadata

In [None]:
# get srx info for the bioproject
!{repo_base_dir}/scripts/bioproject2srx.py PRJNA83156 > {repo_base_dir}/data/Replogle2022/PRJNA83156_SRX-SRR.csv

In [8]:
meta_file = os.path.join(repo_base_dir, "data", "Replogle2022", "PRJNA83156_SRX-SRR.csv")
meta_file

'/home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR.csv'

### Summarize

In [9]:
meta = pd.read_csv(meta_file)
meta

Unnamed: 0,exp_name,srx,srr
0,KD6_seq1_essential_mRNA_lane_1_S49_L004,SRX15390082,SRR19330645
1,KD6_seq1_essential_mRNA_lane_1_S49_L001,SRX15390083,SRR19330644
2,KD6_seq1_essential_sgRNA_lane_1_S1_L002,SRX15390084,SRR19330643
3,KD6_seq1_essential_mRNA_lane_9_S33_L004,SRX15390085,SRR19330642
4,KD6_seq1_essential_mRNA_lane_9_S33_L003,SRX15390086,SRR19330641
...,...,...,...
7878,KD6_6_essential,SRX15703621,SRR19653804
7879,KD6_7_essential,SRX15703622,SRR19653803
7880,KD6_8_essential,SRX15703623,SRR19653802
7881,KD6_9_essential,SRX15703624,SRR19653801


In [10]:
# repeat names?
df = meta['exp_name'].value_counts().reset_index()
df[df['count'] > 1]

Unnamed: 0,exp_name,count


In [11]:
# checking RPE1
df = meta[meta['exp_name'].str.contains("RPE1")].sort_values("exp_name")
df

Unnamed: 0,exp_name,srx,srr
5666,RD7_seq1_RPE1_mRNA_10_1_S10_L001,SRX15542269,SRR19489445
5141,RD7_seq1_RPE1_mRNA_10_1_S10_L002,SRX15541744,SRR19488970
5670,RD7_seq1_RPE1_mRNA_10_1_S10_L003,SRX15542273,SRR19489441
5661,RD7_seq1_RPE1_mRNA_10_1_S10_L004,SRX15542264,SRR19489450
5143,RD7_seq1_RPE1_mRNA_10_2_S66_L001,SRX15541746,SRR19488968
...,...,...,...
5657,RD7_seq1_RPE1_sgRNA_9_3_S149_L004,SRX15542260,SRR19489454
5650,RD7_seq1_RPE1_sgRNA_9_4_S205_L001,SRX15542253,SRR19489461
5117,RD7_seq1_RPE1_sgRNA_9_4_S205_L002,SRX15541720,SRR19488994
5648,RD7_seq1_RPE1_sgRNA_9_4_S205_L003,SRX15542251,SRR19489463


In [12]:
# format the mRNA samples
meta_f = meta.copy()
meta_f[['exp_name', 'lane', 'X']] = meta_f['exp_name'].str.extract(r"(.+)_((L[0-9]+))$")
meta_f = meta_f[meta_f['lane'].notna()]
meta_f = meta_f[meta_f['exp_name'].str.contains('mRNA')].reset_index(drop=True)
meta_f = meta_f.drop(columns=['X']).sort_values(['exp_name', 'srx'])
meta_f 

Unnamed: 0,exp_name,srx,srr,lane
10,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390098,SRR19330629,L004
11,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390099,SRR19330628,L001
12,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390100,SRR19330627,L003
13,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390101,SRR19330626,L002
6,KD6_seq1_essential_mRNA_lane_10_S58,SRX15390093,SRR19330634,L001
...,...,...,...,...
2912,RD7_seq1_RPE1_mRNA_9_3_S121,SRX15542257,SRR19489457,L002
2646,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541700,SRR19489014,L002
2647,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541701,SRR19489013,L003
2656,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541711,SRR19489003,L001


In [13]:
# number of SRR per SRX
meta_f['exp_name'].value_counts().reset_index().describe()

Unnamed: 0,count
count,894.0
mean,4.0
std,0.0
min,4.0
25%,4.0
50%,4.0
75%,4.0
max,4.0


In [14]:
# use first srx for each exp_name
meta_f['srx'] = meta_f.groupby('exp_name')['srx'].transform('first')
meta_f

Unnamed: 0,exp_name,srx,srr,lane
10,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390098,SRR19330629,L004
11,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390098,SRR19330628,L001
12,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390098,SRR19330627,L003
13,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390098,SRR19330626,L002
6,KD6_seq1_essential_mRNA_lane_10_S58,SRX15390093,SRR19330634,L001
...,...,...,...,...
2912,RD7_seq1_RPE1_mRNA_9_3_S121,SRX15541702,SRR19489457,L002
2646,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541700,SRR19489014,L002
2647,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541700,SRR19489013,L003
2656,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541700,SRR19489003,L001


In [15]:
# format
meta_f['organism'] = 'Homo sapiens'
meta_f = meta_f.rename(columns={'srx' : 'sample', 'srr' : 'accession' }).sort_values('sample')
meta_f

Unnamed: 0,exp_name,sample,accession,lane,organism
0,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330645,L004,Homo sapiens
1,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330644,L001,Homo sapiens
50,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330566,L003,Homo sapiens
594,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19331148,L002,Homo sapiens
2,KD6_seq1_essential_mRNA_lane_9_S33,SRX15390085,SRR19330642,L004,Homo sapiens
...,...,...,...,...,...
3474,RD7_seq1_RPE1_mRNA_54_4_S194,SRX15550255,SRR19497729,L004,Homo sapiens
3563,RD7_seq1_RPE1_mRNA_47_4_S187,SRX15550444,SRR19497533,L002,Homo sapiens
3560,RD7_seq1_RPE1_mRNA_47_4_S187,SRX15550444,SRR19497536,L003,Homo sapiens
3556,RD7_seq1_RPE1_mRNA_47_4_S187,SRX15550444,SRR19497540,L004,Homo sapiens


In [None]:
outfile = os.path.join(repo_base_dir, 'data', 'Replogle2022', 'PRJNA83156_SRX-SRR_mRNA.csv')
meta_f.to_csv(outfile, index=False)

## scRecounter: essential

> Ran in a tmux session

```bash
nextflow run main.nf \
  -ansi-log false \
  -profile conda,trace,report,dev,chimera \
  --use_database false \
  --keep_raw_h5ad false \
  --accessions /home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR_mRNA_essential.csv \
  --output_dir /processed_datasets/scRecount/scRecounter/Replogle2022_essential \
  > runs/Replogle2022_essential.log 2>&1
```

### Summarize results

In [65]:
nextflow_out_dir = os.path.join("/processed_datasets/scRecount/scRecounter/Replogle2022_essential", 'STAR')

In [66]:
# filter to 'essential'
meta_ff = meta_f[meta_f['exp_name'].str.contains('essential')].reset_index(drop=True)
meta_ff

Unnamed: 0,exp_name,sample,accession,lane,organism
0,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330645,L004,Homo sapiens
1,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330644,L001,Homo sapiens
2,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330566,L003,Homo sapiens
3,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19331148,L002,Homo sapiens
4,KD6_seq1_essential_mRNA_lane_9_S33,SRX15390085,SRR19330642,L004,Homo sapiens
...,...,...,...,...,...
379,KD6_seq1_essential_mRNA_lane_17_S65,SRX15391059,SRR19331153,L001,Homo sapiens
380,KD6_seq1_essential_mRNA_lane_17_S41,SRX15391063,SRR19331151,L003,Homo sapiens
381,KD6_seq1_essential_mRNA_lane_17_S41,SRX15391063,SRR19331152,L001,Homo sapiens
382,KD6_seq1_essential_mRNA_lane_17_S41,SRX15391063,SRR19331149,L004,Homo sapiens


In [67]:
# number of unique srx
len(meta_ff['sample'].unique())

96

In [None]:
# find all star output dirs
star_out_dirs = glob(os.path.join(nextflow_out_dir, "SRX*"))
len(star_out_dirs)

79

In [69]:
# missing from essential
srx = [os.path.basename(x) for x in star_out_dirs]
meta_fff = meta_ff[~meta_ff['sample'].isin(srx)].reset_index(drop=True)
# number of unique srx
print(f"Missing SRX count: {len(meta_fff['sample'].unique())}")

Missing SRX count: 17


In [73]:
# missing samples
sorted(meta_fff['sample'].unique().tolist())

['SRX15390337',
 'SRX15390349',
 'SRX15390362',
 'SRX15390375',
 'SRX15390384',
 'SRX15390421',
 'SRX15390427',
 'SRX15390429',
 'SRX15390554',
 'SRX15390611',
 'SRX15390615',
 'SRX15390637',
 'SRX15390645',
 'SRX15390654',
 'SRX15390667',
 'SRX15390696',
 'SRX15390909']

In [74]:
# write to file
outfile = os.path.join(repo_base_dir, 'data', 'Replogle2022', 'PRJNA83156_SRX-SRR_mRNA_essential-missing.csv')
meta_fff.to_csv(outfile, index=False)

#### Essential-missing run

```bash
nextflow run main.nf \
  -ansi-log false \
  -profile conda,trace,report,dev,chimera \
  --use_database false \
  --keep_raw_h5ad false \
  --accessions /home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR_mRNA_essential-missing.csv \
  --output_dir /processed_datasets/scRecount/scRecounter/Replogle2022_essential-missing \
  > runs/Replogle2022_essential-missing.log 2>&1
```

> Manually merged the missing samples into the essential run

In [76]:
# check for all h5ad files
h5ad_files = glob(os.path.join(nextflow_out_dir, "*", "h5ad", "filtered", "Gene.h5ad"))
len(h5ad_files)

92

In [80]:
# missing h5ad files
srx = [x.split("/")[-4] for x in h5ad_files]
meta_fff = meta_ff[~meta_ff['sample'].isin(srx)]
meta_fff

Unnamed: 0,exp_name,sample,accession,lane,organism
60,KD6_seq1_essential_mRNA_lane_26_S74,SRX15390163,SRR19330561,L001,Homo sapiens
61,KD6_seq1_essential_mRNA_lane_26_S74,SRX15390163,SRR19330563,L003,Homo sapiens
62,KD6_seq1_essential_mRNA_lane_26_S74,SRX15390163,SRR19330564,L004,Homo sapiens
63,KD6_seq1_essential_mRNA_lane_26_S74,SRX15390163,SRR19330562,L002,Homo sapiens
96,KD6_seq1_essential_mRNA_lane_47_S71,SRX15390206,SRR19330518,L001,Homo sapiens
97,KD6_seq1_essential_mRNA_lane_47_S71,SRX15390206,SRR19330519,L003,Homo sapiens
98,KD6_seq1_essential_mRNA_lane_47_S71,SRX15390206,SRR19330520,L002,Homo sapiens
99,KD6_seq1_essential_mRNA_lane_47_S71,SRX15390206,SRR19330521,L004,Homo sapiens
116,KD6_seq1_essential_mRNA_lane_7_S31,SRX15390275,SRR19330441,L002,Homo sapiens
117,KD6_seq1_essential_mRNA_lane_7_S31,SRX15390275,SRR19330452,L003,Homo sapiens


In [81]:
meta_fff['sample'].unique()

array(['SRX15390163', 'SRX15390206', 'SRX15390275', 'SRX15390401'],
      dtype=object)

In [82]:
# write missing samples to file
outfile = os.path.join(repo_base_dir, 'data', 'Replogle2022', 'PRJNA83156_SRX-SRR_mRNA_essential-missing2.csv')
meta_fff.to_csv(outfile, index=False)

#### Essential-missing2 run

```bash
nextflow run main.nf \
  -ansi-log false \
  -profile conda,trace,report,dev,chimera \
  --use_database false \
  --keep_raw_h5ad false \
  --accessions /home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR_mRNA_essential-missing2.csv \
  --output_dir /processed_datasets/scRecount/scRecounter/Replogle2022_essential-missing2 \
  > runs/Replogle2022_essential-missing2.log 2>&1
```

> Manually merged the missing samples into the essential-missing run

In [84]:
# check for all h5ad files
h5ad_files = glob(os.path.join(nextflow_out_dir, "*", "h5ad", "filtered", "Gene.h5ad"))
len(h5ad_files)

96

## scRecounter: non-essential (other)

> Ran in a tmux session

```bash
nextflow run main.nf \
  -ansi-log false \
  -profile conda,trace,report,dev,chimera \
  --use_database false \
  --keep_raw_h5ad false \
  --accessions /home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR_mRNA_other.csv \
  --output_dir /processed_datasets/scRecount/scRecounter/Replogle2022_other \
  > runs/Replogle2022_other.log 2>&1
```

### Summarize results

In [24]:
nextflow_out_dir = os.path.join("/processed_datasets/scRecount/scRecounter/Replogle2022_other", 'STAR')

In [16]:
# filter to non-essential
meta_ff = meta_f[~meta_f['exp_name'].str.contains('essential')].reset_index(drop=True)
meta_ff

Unnamed: 0,exp_name,sample,accession,lane,organism
0,KD8_seq1_p1_mRNA_4_S5,SRX15390228,SRR19330499,L001,Homo sapiens
1,KD8_seq1_p1_mRNA_4_S5,SRX15390228,SRR19330498,L002,Homo sapiens
2,KD8_seq1_p1_mRNA_4_S5,SRX15390228,SRR19330497,L003,Homo sapiens
3,KD8_seq1_p1_mRNA_4_S5,SRX15390228,SRR19330496,L004,Homo sapiens
4,KD8_seq2_p1_mRNA_4_S5,SRX15390232,SRR19330492,L001,Homo sapiens
...,...,...,...,...,...
3187,RD7_seq1_RPE1_mRNA_54_4_S194,SRX15550255,SRR19497729,L004,Homo sapiens
3188,RD7_seq1_RPE1_mRNA_47_4_S187,SRX15550444,SRR19497533,L002,Homo sapiens
3189,RD7_seq1_RPE1_mRNA_47_4_S187,SRX15550444,SRR19497536,L003,Homo sapiens
3190,RD7_seq1_RPE1_mRNA_47_4_S187,SRX15550444,SRR19497540,L004,Homo sapiens


In [17]:
# number of unique srx
len(meta_ff['sample'].unique())

798

In [21]:
# find all star output dirs
star_out_dirs = glob(os.path.join(nextflow_out_dir, "SRX*"))
len(star_out_dirs)

798

In [22]:
# missing from essential
srx = [os.path.basename(x) for x in star_out_dirs]
meta_fff = meta_ff[~meta_ff['sample'].isin(srx)].reset_index(drop=True)
# number of unique srx
print(f"Missing SRX count: {len(meta_fff['sample'].unique())}")

Missing SRX count: 0


In [25]:
# check for all h5ad files
h5ad_files = glob(os.path.join(nextflow_out_dir, "*", "h5ad", "filtered", "Gene.h5ad"))
len(h5ad_files)

798

# `PRJNA1100571`

In [4]:
bioproject_id = 'PRJNA1100571'

In [14]:
# get srx info for the bioproject
!{repo_base_dir}/scripts/bioproject2srx.py --email nick.youngblut@arcinstitute.org {bioproject_id} > {repo_base_dir}/data/Replogle2022/{bioproject_id}_SRX-SRR.csv

esearch of bioproject for: PRJNA1100571
elink of bioproject for: PRJNA1100571
  Total SRA records: 2240
efetch of sra for: 32614908
efetch of sra for: 32614907
efetch of sra for: 32614906
efetch of sra for: 32614905
efetch of sra for: 32614904
efetch of sra for: 32614903
efetch of sra for: 32614902
efetch of sra for: 32614901
efetch of sra for: 32614900
efetch of sra for: 32614899
efetch of sra for: 32614898
efetch of sra for: 32614897
efetch of sra for: 32614896
efetch of sra for: 32614895
efetch of sra for: 32614894
efetch of sra for: 32614893
efetch of sra for: 32614892
efetch of sra for: 32614891
efetch of sra for: 32614890
efetch of sra for: 32614889
efetch of sra for: 32614888
efetch of sra for: 32614887
efetch of sra for: 32614886
efetch of sra for: 32614885
efetch of sra for: 32614884
efetch of sra for: 32614883
efetch of sra for: 32614882
efetch of sra for: 32614881
efetch of sra for: 32614880
efetch of sra for: 32614879
efetch of sra for: 32614878
efetch of sra for: 32614877


## Get metadata

In [15]:
meta_file = os.path.join(repo_base_dir, "data", "Replogle2022", f"{bioproject_id}_SRX-SRR.csv")
meta_file

'/home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA1100571_SRX-SRR.csv'

### Summarize

In [25]:
meta = pd.read_csv(meta_file)
meta

Unnamed: 0,exp_name,srx,srr
0,jurkat_mRNA_28_4_L002,SRX24257841,SRR28690026
1,jurkat_sgRNA_28_4_L002,SRX24257842,SRR28690025
2,jurkat_mRNA_28_4_L003,SRX24257843,SRR28690024
3,jurkat_sgRNA_28_4_L003,SRX24257844,SRR28690023
4,jurkat_mRNA_28_4_L004,SRX24257845,SRR28690022
...,...,...,...
2235,hepg2_mRNA_11_L004,SRX24302170,SRR28736022
2236,hepg2_sgRNA_11_L004,SRX24302171,SRR28736021
2237,hepg2_mRNA_12_L001,SRX24302172,SRR28736020
2238,hepg2_sgRNA_12_L001,SRX24302173,SRR28736019


In [26]:
# repeat names?
df = meta['exp_name'].value_counts().reset_index()
df[df['count'] > 1]

Unnamed: 0,exp_name,count


In [27]:
# format the mRNA samples
meta_f = meta.copy()
meta_f[['exp_name', 'lane', 'X']] = meta_f['exp_name'].str.extract(r"(.+)_((L[0-9]+))$")
meta_f = meta_f[meta_f['lane'].notna()]
meta_f = meta_f[meta_f['exp_name'].str.contains('mRNA')].reset_index(drop=True)
meta_f = meta_f.drop(columns=['X']).sort_values(['exp_name', 'srx']).reset_index(drop=True)
meta_f 

Unnamed: 0,exp_name,srx,srr,lane
0,hepg2_mRNA_1,SRX24301727,SRR28736017,L001
1,hepg2_mRNA_1,SRX24301935,SRR28735809,L002
2,hepg2_mRNA_1,SRX24302013,SRR28736179,L003
3,hepg2_mRNA_1,SRX24302059,SRR28736133,L004
4,hepg2_mRNA_10,SRX24302155,SRR28736037,L001
...,...,...,...,...
1115,jurkat_mRNA_9_3,SRX24258573,SRR28690314,L004
1116,jurkat_mRNA_9_4,SRX24258576,SRR28690311,L001
1117,jurkat_mRNA_9_4,SRX24258578,SRR28690309,L002
1118,jurkat_mRNA_9_4,SRX24258580,SRR28690307,L003


In [28]:
# number of SRR per SRX
meta_f['exp_name'].value_counts().reset_index().describe()

Unnamed: 0,count
count,280.0
mean,4.0
std,0.0
min,4.0
25%,4.0
50%,4.0
75%,4.0
max,4.0


In [29]:
# use first srx for each exp_name
meta_f['srx'] = meta_f.groupby('exp_name')['srx'].transform('first')
meta_f

Unnamed: 0,exp_name,srx,srr,lane
0,hepg2_mRNA_1,SRX24301727,SRR28736017,L001
1,hepg2_mRNA_1,SRX24301727,SRR28735809,L002
2,hepg2_mRNA_1,SRX24301727,SRR28736179,L003
3,hepg2_mRNA_1,SRX24301727,SRR28736133,L004
4,hepg2_mRNA_10,SRX24302155,SRR28736037,L001
...,...,...,...,...
1115,jurkat_mRNA_9_3,SRX24258370,SRR28690314,L004
1116,jurkat_mRNA_9_4,SRX24258576,SRR28690311,L001
1117,jurkat_mRNA_9_4,SRX24258576,SRR28690309,L002
1118,jurkat_mRNA_9_4,SRX24258576,SRR28690307,L003


In [30]:
# format
meta_f['organism'] = 'Homo sapiens'
meta_f = meta_f.rename(columns={'srx' : 'sample', 'srr' : 'accession' }).sort_values('sample').reset_index(drop=True)
meta_f

Unnamed: 0,exp_name,sample,accession,lane,organism
0,jurkat_mRNA_28_4,SRX24257841,SRR28690024,L003,Homo sapiens
1,jurkat_mRNA_28_4,SRX24257841,SRR28690026,L002,Homo sapiens
2,jurkat_mRNA_28_4,SRX24257841,SRR28690022,L004,Homo sapiens
3,jurkat_mRNA_28_4,SRX24257841,SRR28690029,L001,Homo sapiens
4,jurkat_mRNA_3_4,SRX24257848,SRR28690038,L001,Homo sapiens
...,...,...,...,...,...
1115,hepg2_mRNA_10,SRX24302155,SRR28736037,L001,Homo sapiens
1116,hepg2_mRNA_11,SRX24302164,SRR28736026,L002,Homo sapiens
1117,hepg2_mRNA_11,SRX24302164,SRR28736028,L001,Homo sapiens
1118,hepg2_mRNA_11,SRX24302164,SRR28736022,L004,Homo sapiens


In [33]:
# number of unique srx (sample)
meta_f['sample'].unique().shape

(280,)

In [32]:
# write to file
outfile = os.path.join(repo_base_dir, 'data', 'Replogle2022', f'{bioproject_id}_SRX-SRR_mRNA.csv')
meta_f.to_csv(outfile, index=False)

## scRecounter

```bash
nextflow run main.nf \
  -ansi-log false \
  -profile conda,trace,report,dev,chimera \
  --use_database false \
  --keep_raw_h5ad false \
  --accessions /home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA1100571_SRX-SRR_mRNA.csv \
  --output_dir /processed_datasets/scRecount/scRecounter/Replogle2022_PRJNA1100571 \
  > runs/Replogle2022_PRJNA1100571.log 2>&1
```

### Summarize results

In [6]:
nextflow_out_dir = os.path.join("/processed_datasets/scRecount/scRecounter/Replogle2022_PRJNA1100571", 'STAR')

In [7]:
# find all star output dirs
star_out_dirs = glob(os.path.join(nextflow_out_dir, "SRX*"))
len(star_out_dirs)

280

In [None]:
# missing from essential
srx = [os.path.basename(x) for x in star_out_dirs]
meta_fff = meta_ff[~meta_ff['sample'].isin(srx)].reset_index(drop=True)
# number of unique srx
print(f"Missing SRX count: {len(meta_fff['sample'].unique())}")

Missing SRX count: 17


In [None]:
# missing samples
sorted(meta_fff['sample'].unique().tolist())

['SRX15390337',
 'SRX15390349',
 'SRX15390362',
 'SRX15390375',
 'SRX15390384',
 'SRX15390421',
 'SRX15390427',
 'SRX15390429',
 'SRX15390554',
 'SRX15390611',
 'SRX15390615',
 'SRX15390637',
 'SRX15390645',
 'SRX15390654',
 'SRX15390667',
 'SRX15390696',
 'SRX15390909']

In [None]:
# write to file
outfile = os.path.join(repo_base_dir, 'data', 'Replogle2022', 'PRJNA83156_SRX-SRR_mRNA_essential-missing.csv')
meta_fff.to_csv(outfile, index=False)

# Dataset summary

In [9]:
# set options to just print 4 rows
pd.set_option('display.max_rows', 4)

In [10]:
datasets = ["Replogle2022_essential", "Replogle2022_other", "Replogle2022_PRJNA1100571"]
summary_dfs = []
for dataset in datasets:
    nextflow_out_dir = os.path.join("/processed_datasets/scRecount/scRecounter", dataset, "STAR")
    # find all star output dirs
    summary_files = glob(os.path.join(nextflow_out_dir, "SRX*", "summary", "combined.csv"))
    #print(len(summary_files))
    # read in summary files
    summary_dfs.append(
        pd.concat([pd.read_csv(x) for x in summary_files])
    )

# combine all summary dataframes
summary_df = pd.concat(summary_dfs)
summary_df

Unnamed: 0,feature,estimated_number_of_cells,fraction_of_unique_reads_in_cells,mean_reads_per_cell,mean_umi_per_cell,mean_feature_per_cell,median_reads_per_cell,median_umi_per_cell,median_feature_per_cell,number_of_reads,...,reads_mapped_to_velocyto__unique_multiple_velocyto,reads_with_valid_barcodes,sequencing_saturation,total_feature_detected,umis_in_cells,unique_reads_in_cells_mapped_to_gene,unique_reads_in_cells_mapped_to_genefull,unique_reads_in_cells_mapped_to_genefull_ex50pas,unique_reads_in_cells_mapped_to_genefull_exonoverintron,sample
0,Gene,12288,0.906709,8342.0,7467.0,2586.0,8000.0,7156.0,2633.0,185374541,...,,0.976479,0.103122,23037.0,91764373,102508409.0,,,,SRX15390667
1,GeneFull,12450,0.918815,10803.0,9668.0,3650.0,10453.0,9348.0,3750.0,185374541,...,,0.975909,0.103335,28034.0,120375815,,134506074.0,,,SRX15390667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,GeneFull_ExonOverIntron,12409,0.956585,4413.0,3280.0,1988.0,3912.0,2907.0,1877.0,69284062,...,,0.981650,0.255467,27456.0,40705287,,,,54766485.0,SRX24257896
4,Velocyto,0,,,,,,,,69284062,...,0.0,0.983232,-inf,,0,,,,,SRX24257896


In [11]:
# filter to just `GeneFull_Ex50pAS` for `feature` column
summary_df_f = summary_df[summary_df['feature'] == 'GeneFull_Ex50pAS']
summary_df_f

Unnamed: 0,feature,estimated_number_of_cells,fraction_of_unique_reads_in_cells,mean_reads_per_cell,mean_umi_per_cell,mean_feature_per_cell,median_reads_per_cell,median_umi_per_cell,median_feature_per_cell,number_of_reads,...,reads_mapped_to_velocyto__unique_multiple_velocyto,reads_with_valid_barcodes,sequencing_saturation,total_feature_detected,umis_in_cells,unique_reads_in_cells_mapped_to_gene,unique_reads_in_cells_mapped_to_genefull,unique_reads_in_cells_mapped_to_genefull_ex50pas,unique_reads_in_cells_mapped_to_genefull_exonoverintron,sample
2,GeneFull_Ex50pAS,12456,0.918303,11016.0,9858.0,3681.0,10664.0,9539.0,3781.0,185374541,...,,0.975953,0.103431,29035.0,122796317,,,137225560.0,,SRX15390667
2,GeneFull_Ex50pAS,13616,0.929253,10734.0,9583.0,3620.0,10326.0,9225.0,3717.0,190845115,...,,0.977768,0.105701,28924.0,130488348,,,146157149.0,,SRX15390616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,GeneFull_Ex50pAS,12070,0.957497,5100.0,3701.0,2117.0,4572.0,3317.0,2024.0,77097683,...,,0.982754,0.273061,27232.0,44671802,,,61568574.0,,SRX24258398
2,GeneFull_Ex50pAS,12406,0.956609,4399.0,3270.0,1981.0,3903.0,2898.0,1870.0,69284062,...,,0.981678,0.255459,27353.0,40571726,,,54586397.0,,SRX24257896


In [16]:
pd.set_option('display.max_rows', 10)

In [17]:
# summarize number of cells
print(summary_df_f['estimated_number_of_cells'].sum())
summary_df_f['estimated_number_of_cells'].describe()

15434942


count     1174.000000
mean     13147.310051
std       4672.226274
min        421.000000
25%      11004.000000
50%      12572.500000
75%      14663.250000
max      49764.000000
Name: estimated_number_of_cells, dtype: float64

In [21]:
# summarize reads
reads_bil = summary_df_f['number_of_reads'].sum() / 1e9
print(f"{reads_bil:.2f} billion reads")
summary_df_f['number_of_reads'].describe()

185.83 billion reads


count    1.174000e+03
mean     1.582907e+08
std      8.351201e+07
min      1.382970e+05
25%      8.178377e+07
50%      1.542568e+08
75%      2.239959e+08
max      4.250947e+08
Name: number_of_reads, dtype: float64

# session info

In [85]:
!conda list

# packages in environment at /home/nickyoungblut/miniforge3/envs/asmbl:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
aiohappyeyeballs          2.4.6              pyhd8ed1ab_0    conda-forge
aiohttp                   3.11.12         py312h178313f_0    conda-forge
aiosignal                 1.3.2              pyhd8ed1ab_0    conda-forge
alsa-lib                  1.2.11               hd590300_1    conda-forge
anyio                     4.8.0              pyhd8ed1ab_0    conda-forge
appdirs                   1.4.4              pyhd8ed1ab_1    conda-forge
asttokens                 2.4.1              pyhd8ed1ab_0    conda-forge
attrs                     25.1.0             pyh71513ae_0    conda-forge
bash                      5.2.21               h7f99829_0    conda-forge
bash_kernel               0.9.3              pyh4f82c71_