# Goals

* Reprocess Replogle et al., 2022 dataset
  * Just cell count data
* Reprocessing will make the data consistent with the scBaseCount dataset
* Reprocessing from SRA read data

In [4]:
import os
import subprocess

import pandas as pd

# SRA metadata

In [5]:
# get base directory of repo
# check=True makes a failing `git` call (e.g. the notebook is run outside a
# git checkout) raise CalledProcessError instead of leaving repo_base_dir
# without a valid path
repo_base_dir = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()

### Get metadata

In [6]:
# path of the SRX/SRR accession table inside the repo's data directory
meta_file = os.path.join(repo_base_dir, "data", "Replogle2022", "PRJNA83156_SRX-SRR.csv")
meta_file

'/home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR.csv'

In [7]:
# command that writes the accession table at the meta_file path; left
# commented out — presumably a one-time step, TODO confirm
#!{repo_base_dir}/scripts/bioproject2srx.py PRJNA83156 > {repo_base_dir}/data/Replogle2022/PRJNA83156_SRX-SRR.csv

### Summarize

In [58]:
# load the accession table (columns: exp_name, srx, srr)
meta = pd.read_csv(meta_file)
meta

Unnamed: 0,exp_name,srx,srr
0,KD6_seq1_essential_mRNA_lane_1_S49_L004,SRX15390082,SRR19330645
1,KD6_seq1_essential_mRNA_lane_1_S49_L001,SRX15390083,SRR19330644
2,KD6_seq1_essential_sgRNA_lane_1_S1_L002,SRX15390084,SRR19330643
3,KD6_seq1_essential_mRNA_lane_9_S33_L004,SRX15390085,SRR19330642
4,KD6_seq1_essential_mRNA_lane_9_S33_L003,SRX15390086,SRR19330641
...,...,...,...
7878,KD6_6_essential,SRX15703621,SRR19653804
7879,KD6_7_essential,SRX15703622,SRR19653803
7880,KD6_8_essential,SRX15703623,SRR19653802
7881,KD6_9_essential,SRX15703624,SRR19653801


In [59]:
# repeat names?
# name the columns explicitly so the 'count' lookup does not depend on the
# pandas version (value_counts() only labels its result 'count' from pandas 2.0)
df = (
    meta['exp_name']
    .value_counts()
    .rename_axis('exp_name')
    .reset_index(name='count')
)
# an empty frame means every exp_name occurs exactly once
df[df['count'] > 1]

Unnamed: 0,exp_name,count


In [60]:
# checking RPE1
# rows whose experiment name mentions RPE1, ordered by name
is_rpe1 = meta['exp_name'].str.contains("RPE1")
df = meta.loc[is_rpe1].sort_values("exp_name")
df

Unnamed: 0,exp_name,srx,srr
5666,RD7_seq1_RPE1_mRNA_10_1_S10_L001,SRX15542269,SRR19489445
5141,RD7_seq1_RPE1_mRNA_10_1_S10_L002,SRX15541744,SRR19488970
5670,RD7_seq1_RPE1_mRNA_10_1_S10_L003,SRX15542273,SRR19489441
5661,RD7_seq1_RPE1_mRNA_10_1_S10_L004,SRX15542264,SRR19489450
5143,RD7_seq1_RPE1_mRNA_10_2_S66_L001,SRX15541746,SRR19488968
...,...,...,...
5657,RD7_seq1_RPE1_sgRNA_9_3_S149_L004,SRX15542260,SRR19489454
5650,RD7_seq1_RPE1_sgRNA_9_4_S205_L001,SRX15542253,SRR19489461
5117,RD7_seq1_RPE1_sgRNA_9_4_S205_L002,SRX15541720,SRR19488994
5648,RD7_seq1_RPE1_sgRNA_9_4_S205_L003,SRX15542251,SRR19489463


In [61]:
# format the mRNA samples
# split a trailing "_L<digits>" lane suffix off exp_name; names without that
# suffix get NaN in both extracted columns
name_parts = meta['exp_name'].str.extract(r"(?P<exp_name>.+)_(?P<lane>L[0-9]+)$")
meta_f = meta.assign(exp_name=name_parts['exp_name'], lane=name_parts['lane'])
# keep only the records that carry a lane suffix
meta_f = meta_f[meta_f['lane'].notna()]
# keep only the mRNA libraries (this drops e.g. the sgRNA libraries)
meta_f = meta_f[meta_f['exp_name'].str.contains('mRNA')].reset_index(drop=True)
# group the lanes of each experiment together, ordered by SRX within a group
meta_f = meta_f.sort_values(['exp_name', 'srx'])
meta_f

Unnamed: 0,exp_name,srx,srr,lane
10,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390098,SRR19330629,L004
11,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390099,SRR19330628,L001
12,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390100,SRR19330627,L003
13,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390101,SRR19330626,L002
6,KD6_seq1_essential_mRNA_lane_10_S58,SRX15390093,SRR19330634,L001
...,...,...,...,...
2912,RD7_seq1_RPE1_mRNA_9_3_S121,SRX15542257,SRR19489457,L002
2646,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541700,SRR19489014,L002
2647,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541701,SRR19489013,L003
2656,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541711,SRR19489003,L001


In [62]:
# number of lane-level records (one SRX/SRR pair each) per exp_name
records_per_exp = meta_f['exp_name'].value_counts().reset_index()
records_per_exp.describe()

Unnamed: 0,count
count,894.0
mean,4.0
std,0.0
min,4.0
25%,4.0
50%,4.0
75%,4.0
max,4.0


In [63]:
# use first srx for each exp_name
# every row of an exp_name group takes the SRX of the group's first row
first_srx = meta_f.groupby('exp_name')['srx'].transform('first')
meta_f = meta_f.assign(srx=first_srx)
meta_f

Unnamed: 0,exp_name,srx,srr,lane
10,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390098,SRR19330629,L004
11,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390098,SRR19330628,L001
12,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390098,SRR19330627,L003
13,KD6_seq1_essential_mRNA_lane_10_S34,SRX15390098,SRR19330626,L002
6,KD6_seq1_essential_mRNA_lane_10_S58,SRX15390093,SRR19330634,L001
...,...,...,...,...
2912,RD7_seq1_RPE1_mRNA_9_3_S121,SRX15541702,SRR19489457,L002
2646,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541700,SRR19489014,L002
2647,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541700,SRR19489013,L003
2656,RD7_seq1_RPE1_mRNA_9_4_S177,SRX15541700,SRR19489003,L001


In [64]:
# format
# add the organism column, rename the accession columns, and order by sample
meta_f = (
    meta_f
    .assign(organism='Homo sapiens')
    .rename(columns={'srx': 'sample', 'srr': 'accession'})
    .sort_values('sample')
)
meta_f

Unnamed: 0,exp_name,sample,accession,lane,organism
0,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330645,L004,Homo sapiens
1,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330644,L001,Homo sapiens
50,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330566,L003,Homo sapiens
594,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19331148,L002,Homo sapiens
2,KD6_seq1_essential_mRNA_lane_9_S33,SRX15390085,SRR19330642,L004,Homo sapiens
...,...,...,...,...,...
3474,RD7_seq1_RPE1_mRNA_54_4_S194,SRX15550255,SRR19497729,L004,Homo sapiens
3563,RD7_seq1_RPE1_mRNA_47_4_S187,SRX15550444,SRR19497533,L002,Homo sapiens
3560,RD7_seq1_RPE1_mRNA_47_4_S187,SRX15550444,SRR19497536,L003,Homo sapiens
3556,RD7_seq1_RPE1_mRNA_47_4_S187,SRX15550444,SRR19497540,L004,Homo sapiens


In [None]:
# save the formatted mRNA accession table (row index omitted)
outfile = os.path.join(repo_base_dir, 'data', 'Replogle2022', 'PRJNA83156_SRX-SRR_mRNA.csv')
meta_f.to_csv(outfile, index=False)

# scRecounter

> Ran in a tmux session

#### Mini run

```bash
nextflow run main.nf \
  -ansi-log false \
  -profile conda,trace,report,dev,chimera \
  --use_database false \
  --keep_raw_h5ad false \
  --accessions /home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR_mRNA_mini.csv \
  --output_dir /processed_datasets/scRecount/scRecounter/Replogle2022_mini \
  > runs/Replogle2022_mini.log 2>&1
```

#### Essential run

```bash
nextflow run main.nf \
  -ansi-log false \
  -profile conda,trace,report,dev,chimera \
  --use_database false \
  --keep_raw_h5ad false \
  --accessions /home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR_mRNA_essential.csv \
  --output_dir /processed_datasets/scRecount/scRecounter/Replogle2022_essential \
  > runs/Replogle2022_essential.log 2>&1
```

#### Non-essential (other) run

```bash
nextflow run main.nf \
  -ansi-log false \
  -profile conda,trace,report,dev,chimera \
  --use_database false \
  --keep_raw_h5ad false \
  --accessions /home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR_mRNA_other.csv \
  --output_dir /processed_datasets/scRecount/scRecounter/Replogle2022_other \
  > runs/Replogle2022_other.log 2>&1
```

## Summarize results

In [26]:
from glob import glob

### Essential

In [65]:
# 'STAR' subdirectory of the --output_dir used for the "Essential run"
nextflow_out_dir = os.path.join("/processed_datasets/scRecount/scRecounter/Replogle2022_essential", 'STAR')

In [66]:
# filter to 'essential'
# keep the records whose exp_name contains 'essential', with a fresh 0..n-1 index
is_essential = meta_f['exp_name'].str.contains('essential')
meta_ff = meta_f.loc[is_essential].reset_index(drop=True)
meta_ff

Unnamed: 0,exp_name,sample,accession,lane,organism
0,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330645,L004,Homo sapiens
1,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330644,L001,Homo sapiens
2,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19330566,L003,Homo sapiens
3,KD6_seq1_essential_mRNA_lane_1_S49,SRX15390082,SRR19331148,L002,Homo sapiens
4,KD6_seq1_essential_mRNA_lane_9_S33,SRX15390085,SRR19330642,L004,Homo sapiens
...,...,...,...,...,...
379,KD6_seq1_essential_mRNA_lane_17_S65,SRX15391059,SRR19331153,L001,Homo sapiens
380,KD6_seq1_essential_mRNA_lane_17_S41,SRX15391063,SRR19331151,L003,Homo sapiens
381,KD6_seq1_essential_mRNA_lane_17_S41,SRX15391063,SRR19331152,L001,Homo sapiens
382,KD6_seq1_essential_mRNA_lane_17_S41,SRX15391063,SRR19331149,L004,Homo sapiens


In [67]:
# number of unique srx
essential_samples = meta_ff['sample'].unique()
len(essential_samples)

96

In [None]:
# find all star output dirs
# (entries directly under nextflow_out_dir whose names start with "SRX")
star_out_dirs = glob(os.path.join(nextflow_out_dir, "SRX*"))
len(star_out_dirs)

79

In [69]:
# missing from essential
# SRX accessions that already have a STAR output directory
srx = [os.path.basename(star_dir) for star_dir in star_out_dirs]
# records of the 'essential' samples that have no STAR output directory yet
has_star_output = meta_ff['sample'].isin(srx)
meta_fff = meta_ff.loc[~has_star_output].reset_index(drop=True)
# number of unique srx
print(f"Missing SRX count: {len(meta_fff['sample'].unique())}")

Missing SRX count: 17


In [73]:
# missing samples
# unique sample accessions in ascending order
missing_samples = meta_fff['sample'].unique().tolist()
missing_samples.sort()
missing_samples

['SRX15390337',
 'SRX15390349',
 'SRX15390362',
 'SRX15390375',
 'SRX15390384',
 'SRX15390421',
 'SRX15390427',
 'SRX15390429',
 'SRX15390554',
 'SRX15390611',
 'SRX15390615',
 'SRX15390637',
 'SRX15390645',
 'SRX15390654',
 'SRX15390667',
 'SRX15390696',
 'SRX15390909']

In [74]:
# write to file
# records of the samples without a STAR output directory; this file is the
# --accessions input of the "Essential-missing run"
outfile = os.path.join(repo_base_dir, 'data', 'Replogle2022', 'PRJNA83156_SRX-SRR_mRNA_essential-missing.csv')
meta_fff.to_csv(outfile, index=False)

#### Essential-missing run

```bash
nextflow run main.nf \
  -ansi-log false \
  -profile conda,trace,report,dev,chimera \
  --use_database false \
  --keep_raw_h5ad false \
  --accessions /home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR_mRNA_essential-missing.csv \
  --output_dir /processed_datasets/scRecount/scRecounter/Replogle2022_essential-missing \
  > runs/Replogle2022_essential-missing.log 2>&1
```

> Manually merged the missing samples into the essential run

In [76]:
# check for all h5ad files
# matches <nextflow_out_dir>/<sample dir>/h5ad/filtered/Gene.h5ad
h5ad_files = glob(os.path.join(nextflow_out_dir, "*", "h5ad", "filtered", "Gene.h5ad"))
len(h5ad_files)

92

In [80]:
# missing h5ad files
# each path is <nextflow_out_dir>/<SRX>/h5ad/filtered/Gene.h5ad, so the first
# component relative to nextflow_out_dir is the SRX accession; os.sep avoids
# hard-coding "/" as the path separator
srx = [os.path.relpath(x, nextflow_out_dir).split(os.sep)[0] for x in h5ad_files]
# records of the 'essential' samples that have no filtered Gene.h5ad yet
meta_fff = meta_ff[~meta_ff['sample'].isin(srx)]
meta_fff

Unnamed: 0,exp_name,sample,accession,lane,organism
60,KD6_seq1_essential_mRNA_lane_26_S74,SRX15390163,SRR19330561,L001,Homo sapiens
61,KD6_seq1_essential_mRNA_lane_26_S74,SRX15390163,SRR19330563,L003,Homo sapiens
62,KD6_seq1_essential_mRNA_lane_26_S74,SRX15390163,SRR19330564,L004,Homo sapiens
63,KD6_seq1_essential_mRNA_lane_26_S74,SRX15390163,SRR19330562,L002,Homo sapiens
96,KD6_seq1_essential_mRNA_lane_47_S71,SRX15390206,SRR19330518,L001,Homo sapiens
97,KD6_seq1_essential_mRNA_lane_47_S71,SRX15390206,SRR19330519,L003,Homo sapiens
98,KD6_seq1_essential_mRNA_lane_47_S71,SRX15390206,SRR19330520,L002,Homo sapiens
99,KD6_seq1_essential_mRNA_lane_47_S71,SRX15390206,SRR19330521,L004,Homo sapiens
116,KD6_seq1_essential_mRNA_lane_7_S31,SRX15390275,SRR19330441,L002,Homo sapiens
117,KD6_seq1_essential_mRNA_lane_7_S31,SRX15390275,SRR19330452,L003,Homo sapiens


In [81]:
# unique sample accessions that have no filtered Gene.h5ad
meta_fff['sample'].unique()

array(['SRX15390163', 'SRX15390206', 'SRX15390275', 'SRX15390401'],
      dtype=object)

In [82]:
# write missing samples to file
# this file is the --accessions input of the "Essential-missing2 run"
outfile = os.path.join(repo_base_dir, 'data', 'Replogle2022', 'PRJNA83156_SRX-SRR_mRNA_essential-missing2.csv')
meta_fff.to_csv(outfile, index=False)

#### Essential-missing2 run

```bash
nextflow run main.nf \
  -ansi-log false \
  -profile conda,trace,report,dev,chimera \
  --use_database false \
  --keep_raw_h5ad false \
  --accessions /home/nickyoungblut/dev/python/scBaseCount_analysis/data/Replogle2022/PRJNA83156_SRX-SRR_mRNA_essential-missing2.csv \
  --output_dir /processed_datasets/scRecount/scRecounter/Replogle2022_essential-missing2 \
  > runs/Replogle2022_essential-missing2.log 2>&1
```

> Manually merged the missing samples into the essential run

In [84]:
# check for all h5ad files
# same glob as before the essential-missing2 run, repeated to confirm the count
h5ad_files = glob(os.path.join(nextflow_out_dir, "*", "h5ad", "filtered", "Gene.h5ad"))
len(h5ad_files)

96

# session info

In [85]:
# record the packages (name, version, build, channel) of the conda environment
!conda list

# packages in environment at /home/nickyoungblut/miniforge3/envs/asmbl:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
aiohappyeyeballs          2.4.6              pyhd8ed1ab_0    conda-forge
aiohttp                   3.11.12         py312h178313f_0    conda-forge
aiosignal                 1.3.2              pyhd8ed1ab_0    conda-forge
alsa-lib                  1.2.11               hd590300_1    conda-forge
anyio                     4.8.0              pyhd8ed1ab_0    conda-forge
appdirs                   1.4.4              pyhd8ed1ab_1    conda-forge
asttokens                 2.4.1              pyhd8ed1ab_0    conda-forge
attrs                     25.1.0             pyh71513ae_0    conda-forge
bash                      5.2.21               h7f99829_0    conda-forge
bash_kernel               0.9.3              pyh4f82c71_