Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

major code changes and logical improvements in analysis steps #61

Merged
merged 27 commits into from
May 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion screenpro/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@
from .ngs import Counter
from .assays import PooledScreens, GImaps

__version__ = "0.3.1"
__version__ = "0.3.2"
__author__ = "Abe Arab"
__email__ = 'abea@arcinstitute.org' # "abarbiology@gmail.com"
55 changes: 42 additions & 13 deletions screenpro/assays.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc

from pydeseq2 import preprocessing
from .phenoscore import runPhenoScore, runPhenoScoreForReplicate
from .utils import ann_score_df
from copy import copy
Expand All @@ -16,16 +18,16 @@ class PooledScreens(object):
pooledScreens class for processing CRISPR screen datasets
"""

def __init__(self, adata, transformation='log2(x+1)', test='ttest', n_reps=3):
def __init__(self, adata, fc_transformation='log2(x+1)', test='ttest', n_reps=3):
"""
Args:
adata (AnnData): AnnData object with adata.X as a matrix of sgRNA counts
transformation (str): transformation to apply to the data before calculating phenotype scores
fc_transformation (str): fold change transformation to apply for calculating phenotype scores
test (str): statistical test to use for calculating phenotype scores
"""
self.adata = adata
self.pdata = None
self.transformation = transformation
self.fc_transformation = fc_transformation
self.test = test
self.n_reps = n_reps
self.phenotypes = {}
Expand All @@ -41,7 +43,7 @@ def __init__(self, adata, transformation='log2(x+1)', test='ttest', n_reps=3):

def copy(self):
return copy(self)

def _add_phenotype_results(self, phenotype_name):
if phenotype_name in self.phenotype_names:
raise ValueError(f"Phenotype '{phenotype_name}' already exists in self.phenotype_names!")
Expand Down Expand Up @@ -75,7 +77,20 @@ def _calculateGrowthFactor(self, untreated, treated, db_rate_col):

return out

def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated, score_level, db_rate_col='pop_doublings', run_name=None):
def countNormalization(self):
"""
Normalize the counts data in adata.X
"""
self.adata.layers['raw_counts'] = self.adata.X.copy()

# normalize counts by sequencing depth
norm_counts, size_factors = preprocessing.deseq2_norm(self.adata.X)
# update adata object
self.adata.obs['size_factors'] = size_factors
self.adata.layers['seq_depth_norm'] = norm_counts
self.adata.X = self.adata.layers['seq_depth_norm']

def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated, score_level, db_rate_col='pop_doublings', run_name=None, **kwargs):
"""
Calculate `gamma`, `rho`, and `tau` phenotype scores for a drug screen dataset in a given `score_level`.
To normalize by growth rate, the doubling rate of the untreated and treated conditions are required.
Expand All @@ -89,23 +104,27 @@ def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated,
score_level (str): name of the score level
db_rate_col (str): column name for the doubling rate, default is 'pop_doublings'
run_name (str): name for the phenotype calculation run
**kwargs: additional arguments to pass to runPhenoScore
"""
# calculate phenotype scores: gamma, tau, rho
gamma_name, gamma = runPhenoScore(
self.adata, cond1=t0, cond2=untreated, growth_rate=db_untreated,
n_reps=self.n_reps,
transformation=self.transformation, test=self.test, score_level=score_level
transformation=self.fc_transformation, test=self.test, score_level=score_level,
**kwargs
)
tau_name, tau = runPhenoScore(
self.adata, cond1=t0, cond2=treated, growth_rate=db_treated,
n_reps=self.n_reps,
transformation=self.transformation, test=self.test, score_level=score_level
transformation=self.fc_transformation, test=self.test, score_level=score_level,
**kwargs
)
# TO-DO: warning / error if db_untreated and db_treated are too close, i.e. growth_rate ~= 0.
rho_name, rho = runPhenoScore(
self.adata, cond1=untreated, cond2=treated, growth_rate=np.abs(db_untreated - db_treated),
n_reps=self.n_reps,
transformation=self.transformation, test=self.test, score_level=score_level
transformation=self.fc_transformation, test=self.test, score_level=score_level,
**kwargs
)

if not run_name: run_name = score_level
Expand All @@ -125,9 +144,17 @@ def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated,

# get replicate level phenotype scores
pdata_df = pd.concat([
runPhenoScoreForReplicate(self,'T0', untreated,'gamma',growth_factor_table).add_prefix('gamma_'),
runPhenoScoreForReplicate(self,'T0', treated,'tau',growth_factor_table).add_prefix('tau_'),
runPhenoScoreForReplicate(self ,untreated,treated,'rho',growth_factor_table).add_prefix('rho_')
runPhenoScoreForReplicate(
self.adata, x_label = x_label, y_label = y_label, score = score_label,
transformation=self.fc_transformation,
growth_factor_table=growth_factor_table
).add_prefix(f'{score_label}_')

for x_label, y_label, score_label in [
('T0', untreated, 'gamma'),
('T0', treated, 'tau'),
(untreated, treated, 'rho')
]
],axis=1).T
# add .pdata
self.pdata = ad.AnnData(
Expand All @@ -136,7 +163,7 @@ def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated,
var=self.adata.var
)

def calculateFlowBasedScreen(self, low_bin, high_bin, score_level, run_name=None):
def calculateFlowBasedScreen(self, low_bin, high_bin, score_level, run_name=None, **kwargs):
"""
Calculate phenotype scores for a flow-based screen dataset.

Expand All @@ -145,11 +172,13 @@ def calculateFlowBasedScreen(self, low_bin, high_bin, score_level, run_name=None
high_bin (str): name of the high bin condition
score_level (str): name of the score level
run_name (str): name for the phenotype calculation run
**kwargs: additional arguments to pass to runPhenoScore
"""
# calculate phenotype scores
delta_name, delta = runPhenoScore(
self.adata, cond1=low_bin, cond2=high_bin, n_reps=self.n_reps,
transformation=self.transformation, test=self.test, score_level=score_level
transformation=self.fc_transformation, test=self.test, score_level=score_level,
**kwargs
)

if not run_name: run_name = score_level
Expand Down
8 changes: 8 additions & 0 deletions screenpro/ngs/counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ def load_library(self, library_path, sep='\t', index_col=0, protospacer_length=1
elif self.cas_type == 'cas12':
raise NotImplementedError("Cas12 library is not yet implemented.")

# Check if the library has duplicate sequences and remove them
if library.duplicated('sequence').any():
shape_before_dedup = library.shape[0]
library = library.drop_duplicates(subset='sequence', keep='first')
shape_after_dedup = library.shape[0]
if verbose:
print(f"Warning: {shape_before_dedup - shape_after_dedup} duplicate sgRNA sequences found and removed.")

# covert to polar DataFrame
library = pl.from_pandas(library)

Expand Down
Loading
Loading