Skip to content

Commit

Permalink
Merge pull request #37 from ArcInstitute/single-guide-design
Browse files Browse the repository at this point in the history
add features for processing single guide libraries
  • Loading branch information
abearab committed Mar 4, 2024
2 parents b712685 + 64d6618 commit a9546c3
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 49 deletions.
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies:
- pyarrow
- ipykernel
- mscorefonts
- rust
- rust>=1.72
- pip
- pip:
- click
Expand Down
1 change: 1 addition & 0 deletions screenpro/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from . import plotting as pl
from . import phenoScore as ps
from . import utils
from . import ngs

from .__version__ import __version__
from copy import copy
Expand Down
138 changes: 110 additions & 28 deletions screenpro/phenoScore.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,43 @@
import pandas as pd
from pydeseq2 import preprocessing
from .phenoStats import matrixStat, getFDR
from .utils import find_low_counts


def calculateDelta(x, y, math, ave):
def calculateDelta(x, y, math, level):
"""
Calculate log ratio of y / x.
`ave` == 'all' (i.e. averaged across all values, oligo and replicates)
`ave` == 'col' (i.e. averaged across columns, replicates)
`level` == 'all' (i.e. averaged across all values, oligo and replicates)
`level` == 'col' (i.e. averaged across columns, replicates)
Args:
x (np.array): array of values
y (np.array): array of values
math (str): math to use for calculating score
ave (str): average method
level (str): level to use for calculating score
Returns:
np.array: array of log ratio values
"""
if ave == 'all':
# check if math is implemented
if math not in ['log2(x+1)', 'log10', 'log1p']:
raise ValueError(f'math "{math}" not recognized')

if level == 'all':
# average across all values
if math == 'log2(x+1)':
return np.mean(np.log2(y+1) - np.log2(x+1))
elif math == 'log10':
return np.mean(np.log10(y) - np.log10(x))
elif math == 'log1p':
return np.mean(np.log1p(y) - np.log1p(x))
elif ave == 'row':
elif level == 'row':
# average across rows
if math == 'log2(x+1)':
return np.mean(np.log2(y+1) - np.log2(x+1), axis=0)
elif math == 'log10':
return np.mean(np.log10(y) - np.log10(x), axis=0)
elif math == 'log1p':
return np.mean(np.log1p(y) - np.log1p(x), axis=0)
elif ave == 'col':
elif level == 'col':
# average across columns
if math == 'log2(x+1)':
return np.mean(np.log2(y+1) - np.log2(x+1), axis=1)
Expand All @@ -47,7 +52,7 @@ def calculateDelta(x, y, math, ave):
return np.mean(np.log1p(y) - np.log1p(x), axis=1)


def calculatePhenotypeScore(x, y, x_ctrl, y_ctrl, growth_rate, math, ave):
def calculatePhenotypeScore(x, y, x_ctrl, y_ctrl, growth_rate, math, level):
"""
Calculate phenotype score normalized by negative control and growth rate.
Args:
Expand All @@ -57,21 +62,21 @@ def calculatePhenotypeScore(x, y, x_ctrl, y_ctrl, growth_rate, math, ave):
y_ctrl (np.array): array of values
growth_rate (int): growth rate
math (str): math to use for calculating score
ave (str): average method
level (str): level to use for calculating score
Returns:
np.array: array of scores
"""
# calculate control median and std
ctrl_median = np.median(calculateDelta(x=x_ctrl, y=y_ctrl, math=math, ave=ave))
ctrl_median = np.median(calculateDelta(x=x_ctrl, y=y_ctrl, math=math, level=level))

# calculate delta
delta = calculateDelta(x=x, y=y, math=math, ave=ave)
delta = calculateDelta(x=x, y=y, math=math, level=level)

# calculate score
return (delta - ctrl_median) / growth_rate


def matrixTest(x, y, x_ctrl, y_ctrl, math, ave_reps, test = 'ttest', growth_rate = 1):
def matrixTest(x, y, x_ctrl, y_ctrl, math, level, test = 'ttest', growth_rate = 1):
"""
Calculate phenotype score and p-values comparing `y` vs `x` matrices.
Args:
Expand All @@ -80,31 +85,28 @@ def matrixTest(x, y, x_ctrl, y_ctrl, math, ave_reps, test = 'ttest', growth_rate
x_ctrl (np.array): array of values
y_ctrl (np.array): array of values
math (str): math to use for calculating score
ave_reps (bool): average replicates
level (str): level to use for calculating score and p-value
test (str): test to use for calculating p-value
growth_rate (int): growth rate
Returns:
np.array: array of scores
np.array: array of p-values
"""
# check if average across replicates
ave = 'col' if ave_reps else 'all'

# calculate growth score
scores = calculatePhenotypeScore(
x = x, y = y, x_ctrl = x_ctrl, y_ctrl = y_ctrl,
growth_rate = growth_rate, math = math,
ave = ave
level = level
)

# compute p-value
p_values = matrixStat(x, y, test=test, ave_reps=ave_reps)
p_values = matrixStat(x, y, test=test, level = level)

return scores, p_values


def runPhenoScore(adata, cond1, cond2, math, test, score_level,
growth_rate=1, n_reps=2,
def runPhenoScore(adata, cond1, cond2, math, score_level, test,
growth_rate=1, n_reps=2, keep_top_n = None,num_pseudogenes=None,
get_z_score=False,ctrl_label='negCtrl'):
"""
Calculate phenotype score and p-values when comparing `cond2` vs `cond1`.
Expand All @@ -131,6 +133,12 @@ def runPhenoScore(adata, cond1, cond2, math, test, score_level,
# check if count_layer exists
if 'seq_depth_norm' not in adata.layers.keys():
seqDepthNormalization(adata)

# evaluate library table to get targets and riase error if not present
required_columns = ['target', 'sequence']
missing_columns = list(set(required_columns) - set(adata.var.columns))
if len(missing_columns) > 0:
raise ValueError(f"Missing required columns in library table: {missing_columns}")

# calc phenotype score and p-value
if score_level == 'compare_reps':
Expand All @@ -149,33 +157,107 @@ def runPhenoScore(adata, cond1, cond2, math, test, score_level,
# calculate growth score and p_value
scores, p_values = matrixTest(
x=x, y=y, x_ctrl=x_ctrl, y_ctrl=y_ctrl,
math=math, ave_reps=True, test=test,
math=math, level='col', test=test,
growth_rate=growth_rate
)
# get adjusted p-values
adj_p_values = getFDR(p_values)

# get targets
targets = adata.var.index.str.split('_[-,+]_').str[0].to_list()
targets = adata.var['target'].to_list()

# combine results into a dataframe
result = pd.concat([
pd.Series(targets, index=adata.var.index, name='target'),
pd.Series(scores, index=adata.var.index, name='score'),
], axis=1)

if get_z_score:
# z-score normalization
result['z_score'] = calculateZScorePhenotypeScore(result, ctrl_label=ctrl_label)
# add p-values
result[f'{test} pvalue'] = p_values
result['BH adj_pvalue'] = adj_p_values

return result_name, result

elif score_level in ['compare_oligos', 'compare_oligos_and_reps']:
return None
elif score_level in ['compare_guides']:
if n_reps == 2:
pass
else:
raise ValueError(f'n_reps "{n_reps}" not recognized')

find_low_counts(adata)
adata = adata[:,~adata.var.low_count].copy()

adata.var.drop(columns='low_count',inplace=True)

# generate pseudo gene labels
generatePseudoGeneLabels(adata, num_pseudogenes=num_pseudogenes, ctrl_label=ctrl_label)
# drop categories from target column!
adata.var['target'] = adata.var['target'].to_list()
# replace values in target columns for negative control with pseudogene label
adata.var.loc[
adata.var.targetType.eq(ctrl_label), 'target'
] = adata.var.loc[adata.var.targetType.eq(ctrl_label), 'pseudoLabel']
# drop pseudoLabel column
adata.var.drop(columns='pseudoLabel', inplace=True)

# prep counts for phenoScore calculation
df_cond1 = adata[adata.obs.query(f'condition=="{cond1}"').index].to_df().T
df_cond2 = adata[adata.obs.query(f'condition=="{cond2}"').index].to_df().T # get control values

# get control values
x_ctrl = df_cond1[adata.var.targetType.eq(ctrl_label)].to_numpy()
y_ctrl = df_cond2[adata.var.targetType.eq(ctrl_label)].to_numpy()

targets = []
scores = []
p_values = []

# group by target genes or pseudogenes to aggregate counts for score calculation
for target_name, target_group in adata.var.groupby('target'):
# convert to numpy arrays
x = df_cond1.loc[target_group.index,:].to_numpy()
y = df_cond2.loc[target_group.index,:].to_numpy()
# Sort and find top n guide per target, see #18
if keep_top_n:
x.sort()
y.sort()
x = x[:keep_top_n]
y = y[:keep_top_n]

# calculate growth score and p_value
target_scores, target_p_values = matrixTest(
x=x, y=y, x_ctrl=x_ctrl, y_ctrl=y_ctrl,
math=math, level='row', test=test,
growth_rate=growth_rate
)

scores.append(target_scores)
p_values.append(target_p_values)
targets.append(target_name)

# combine results into a dataframe
result = pd.concat({
'replicate_1':pd.concat([
pd.Series([s1 for s1,_ in scores], index=targets, name='score'),
pd.Series([p1 for p1,_ in p_values], index=targets, name=f'{test} pvalue'),

],axis=1),
'replicate_2':pd.concat([
pd.Series([s2 for _,s2 in scores], index=targets, name='score'),
pd.Series([p2 for _,p2 in p_values], index=targets, name=f'{test} pvalue'),
],axis=1),
'replicate_ave':pd.concat([
pd.Series([np.mean([s1,s2]) for s1,s2 in scores], index=targets, name='score'),
pd.Series([np.mean([p1,p2]) for p1,p2 in p_values], index=targets, name=f'{test} pvalue'),
],axis=1)
},axis=1)

else:
raise ValueError(f'score_level "{score_level}" not recognized')


return result_name, result


def seqDepthNormalization(adata):
Expand Down Expand Up @@ -259,7 +341,7 @@ def runPhenoScoreForReplicate(screen, x_label, y_label, score, growth_factor_tab

growth_rate=growth_factor_table.query(f'score=="{score}" & replicate=={replicate}')['growth_factor'].values[0],
math=screen.math,
ave='row' # there is only one column so `row` option here is equivalent to the value before averaging.
level='row' # there is only one column so `row` option here is equivalent to the value before averaging.
)

if get_z_score:
Expand Down
15 changes: 9 additions & 6 deletions screenpro/phenoStats.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,32 @@
from statsmodels.stats.multitest import multipletests


def matrixStat(x, y, test, ave_reps):
def matrixStat(x, y, test, level):
"""
Get p-values comparing `y` vs `x` matrices.
Args:
x (np.array): array of values
y (np.array): array of values
test (str): test to use for calculating p-value
ave_reps (bool): average replicates
level (str): level at which to calculate p-value
Returns:
np.array: array of p-values
"""
# calculate p-values
if test == 'MW':
# run Mann-Whitney U rank test
pass
raise ValueError('Mann-Whitney U rank test not implemented')
elif test == 'ttest':
# run ttest
if ave_reps:
# average across replicates
if level == 'col':
p_value = ttest_rel(y, x, axis=1)[1]
else:
elif level == 'row':
p_value = ttest_rel(y, x, axis=0)[1]
elif level == 'all':
# average across all values
p_value = ttest_rel(y, x)[1]
else:
raise ValueError(f'Level "{level}" not recognized')
return p_value
else:
raise ValueError(f'Test "{test}" not recognized')
Expand Down
37 changes: 23 additions & 14 deletions screenpro/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,31 +63,34 @@ def draw_threshold(x, threshold, pseudo_sd):
return threshold * pseudo_sd * (1 if x > 0 else -1) / abs(x)


def prep_data(df_in, threshold):
def prep_data(df_in, threshold, ctrl_label):
df = df_in.copy()

df = ann_score_df(df, threshold=threshold)
df = ann_score_df(df, threshold=threshold, ctrl_label = ctrl_label)

df['-log10(pvalue)'] = np.log10(df.pvalue) * -1

return df


def plot_volcano(ax, df_in, threshold, up_hit='resistance_hit', down_hit='sensitivity_hit', xlim_l=-5, xlim_r=5,
def plot_volcano(ax, df_in, threshold, up_hit='resistance_hit', down_hit='sensitivity_hit',
ctrl_label = 'no-targeting',
dot_size=1,
xlim_l=-5, xlim_r=5,
ylim=6):
df = prep_data(df_in, threshold)
df = prep_data(df_in, threshold, ctrl_label)

# Scatter plot for each category
ax.scatter(df.loc[df['label'] == 'target_non_hit', 'score'],
df.loc[df['label'] == 'target_non_hit', '-log10(pvalue)'],
alpha=0.1, s=1, c='black', label='target_non_hit')
alpha=0.1, s=dot_size, c='black', label='target_non_hit')
ax.scatter(df.loc[df['label'] == up_hit, 'score'], df.loc[df['label'] == up_hit, '-log10(pvalue)'],
alpha=0.9, s=1, c='#fcae91', label=up_hit)
alpha=0.9, s=dot_size, c='#fcae91', label=up_hit)
ax.scatter(df.loc[df['label'] == down_hit, 'score'], df.loc[df['label'] == down_hit, '-log10(pvalue)'],
alpha=0.9, s=1, c='#bdd7e7', label=down_hit)
alpha=0.9, s=dot_size, c='#bdd7e7', label=down_hit)
ax.scatter(df.loc[df['label'] == 'non-targeting', 'score'],
df.loc[df['label'] == 'non-targeting', '-log10(pvalue)'],
alpha=0.1, s=1, c='gray', label='non-targeting')
alpha=0.1, s=dot_size, c='gray', label='non-targeting')

# Set x-axis and y-axis labels
ax.set_xlabel('phenotype score')
Expand All @@ -103,8 +106,10 @@ def plot_volcano(ax, df_in, threshold, up_hit='resistance_hit', down_hit='sensit
ax.legend()


def label_as_black(ax, df_in, label, threshold, size=2, size_txt=None, t_x=.5, t_y=-0.1):
df = prep_data(df_in, threshold)
def label_as_black(ax, df_in, label, threshold, size=2, size_txt=None,
ctrl_label = 'no-targeting',
t_x=.5, t_y=-0.1):
df = prep_data(df_in, threshold, ctrl_label)

target_data = df[df['target'] == label]

Expand All @@ -122,8 +127,10 @@ def label_as_black(ax, df_in, label, threshold, size=2, size_txt=None, t_x=.5, t
color='black', size=size_txt)


def label_sensitivity_hit(ax, df_in, label, threshold, size=2, size_txt=None, t_x=-.5, t_y=-0.1):
df = prep_data(df_in, threshold)
def label_sensitivity_hit(ax, df_in, label, threshold, size=2, size_txt=None,
ctrl_label = 'no-targeting',
t_x=.5, t_y=-0.1):
df = prep_data(df_in, threshold, ctrl_label)

target_data = df[df['target'] == label]

Expand All @@ -141,8 +148,10 @@ def label_sensitivity_hit(ax, df_in, label, threshold, size=2, size_txt=None, t_
color='black', size=size_txt)


def label_resistance_hit(ax, df_in, label, threshold, size=2, size_txt=None, t_x=.5, t_y=-0.1):
df = prep_data(df_in, threshold)
def label_resistance_hit(ax, df_in, label, threshold, size=2, size_txt=None,
ctrl_label = 'no-targeting',
t_x=.5, t_y=-0.1):
df = prep_data(df_in, threshold, ctrl_label)

target_data = df[df['target'] == label]

Expand Down

0 comments on commit a9546c3

Please sign in to comment.