Merge pull request #37 from ArcInstitute/single-guide-design

add features for processing single guide libraries
ArcInstitute · Mar 4, 2024 · a9546c3 · a9546c3
2 parents b712685 + 64d6618
commit a9546c3
Show file tree

Hide file tree

Showing 5 changed files with 144 additions and 49 deletions.
diff --git a/environment.yml b/environment.yml
@@ -17,7 +17,7 @@ dependencies:
   - pyarrow
   - ipykernel
   - mscorefonts
-  - rust
+  - rust>=1.72
   - pip
   - pip:
       - click

diff --git a/screenpro/__init__.py b/screenpro/__init__.py
@@ -3,6 +3,7 @@
 from . import plotting as pl
 from . import phenoScore as ps
 from . import utils
+from . import ngs
 
 from .__version__ import __version__
 from copy import copy

diff --git a/screenpro/phenoScore.py b/screenpro/phenoScore.py
@@ -6,38 +6,43 @@
 import pandas as pd
 from pydeseq2 import preprocessing
 from .phenoStats import matrixStat, getFDR
+from .utils import find_low_counts
 
 
-def calculateDelta(x, y, math, ave):
+def calculateDelta(x, y, math, level):
     """
     Calculate log ratio of y / x.
-    `ave` == 'all' (i.e. averaged across all values, oligo and replicates)
-    `ave` == 'col' (i.e. averaged across columns, replicates)
+    `level` == 'all' (i.e. averaged across all values, oligo and replicates)
+    `level` == 'col' (i.e. averaged across columns, replicates)
     Args:
         x (np.array): array of values
         y (np.array): array of values
         math (str): math to use for calculating score
-        ave (str): average method
+        level (str): level to use for calculating score
     Returns:
         np.array: array of log ratio values
     """
-    if ave == 'all':
+    # check if math is implemented
+    if math not in ['log2(x+1)', 'log10', 'log1p']:
+        raise ValueError(f'math "{math}" not recognized')
+
+    if level == 'all':
         # average across all values
         if math == 'log2(x+1)':
             return np.mean(np.log2(y+1) - np.log2(x+1))
         elif math == 'log10':
             return np.mean(np.log10(y) - np.log10(x))
         elif math == 'log1p':
             return np.mean(np.log1p(y) - np.log1p(x))
-    elif ave == 'row':
+    elif level == 'row':
         # average across rows
         if math == 'log2(x+1)':
             return np.mean(np.log2(y+1) - np.log2(x+1), axis=0)
         elif math == 'log10':
             return np.mean(np.log10(y) - np.log10(x), axis=0)
         elif math == 'log1p':
             return np.mean(np.log1p(y) - np.log1p(x), axis=0)
-    elif ave == 'col':
+    elif level == 'col':
         # average across columns
         if math == 'log2(x+1)':
             return np.mean(np.log2(y+1) - np.log2(x+1), axis=1)
@@ -47,7 +52,7 @@ def calculateDelta(x, y, math, ave):
             return np.mean(np.log1p(y) - np.log1p(x), axis=1)
 
 
-def calculatePhenotypeScore(x, y, x_ctrl, y_ctrl, growth_rate, math, ave):
+def calculatePhenotypeScore(x, y, x_ctrl, y_ctrl, growth_rate, math, level):
     """
     Calculate phenotype score normalized by negative control and growth rate.
     Args:
@@ -57,21 +62,21 @@ def calculatePhenotypeScore(x, y, x_ctrl, y_ctrl, growth_rate, math, ave):
         y_ctrl (np.array): array of values
         growth_rate (int): growth rate
         math (str): math to use for calculating score
-        ave (str): average method
+        level (str): level to use for calculating score
     Returns:
         np.array: array of scores
     """
     # calculate control median and std
-    ctrl_median = np.median(calculateDelta(x=x_ctrl, y=y_ctrl, math=math, ave=ave))
+    ctrl_median = np.median(calculateDelta(x=x_ctrl, y=y_ctrl, math=math, level=level))
 
     # calculate delta
-    delta = calculateDelta(x=x, y=y, math=math, ave=ave)
+    delta = calculateDelta(x=x, y=y, math=math, level=level)
 
     # calculate score
     return (delta - ctrl_median) / growth_rate
 
 
-def matrixTest(x, y, x_ctrl, y_ctrl, math, ave_reps, test = 'ttest', growth_rate = 1):
+def matrixTest(x, y, x_ctrl, y_ctrl, math, level, test = 'ttest', growth_rate = 1):
     """
     Calculate phenotype score and p-values comparing `y` vs `x` matrices.
     Args:
@@ -80,31 +85,28 @@ def matrixTest(x, y, x_ctrl, y_ctrl, math, ave_reps, test = 'ttest', growth_rate
         x_ctrl (np.array): array of values
         y_ctrl (np.array): array of values
         math (str): math to use for calculating score
-        ave_reps (bool): average replicates
+        level (str): level to use for calculating score and p-value
         test (str): test to use for calculating p-value
         growth_rate (int): growth rate
     Returns:
         np.array: array of scores
         np.array: array of p-values
     """
-    # check if average across replicates
-    ave = 'col' if ave_reps else 'all'
-
     # calculate growth score
     scores = calculatePhenotypeScore(
         x = x, y = y, x_ctrl = x_ctrl, y_ctrl = y_ctrl,
         growth_rate = growth_rate, math = math,
-        ave = ave
+        level = level
     )
 
     # compute p-value
-    p_values = matrixStat(x, y, test=test, ave_reps=ave_reps)
+    p_values = matrixStat(x, y, test=test, level = level)
 
     return scores, p_values
 
 
-def runPhenoScore(adata, cond1, cond2, math, test, score_level,
-                  growth_rate=1, n_reps=2, 
+def runPhenoScore(adata, cond1, cond2, math, score_level, test,
+                  growth_rate=1, n_reps=2, keep_top_n = None,num_pseudogenes=None,
                   get_z_score=False,ctrl_label='negCtrl'):
     """
     Calculate phenotype score and p-values when comparing `cond2` vs `cond1`.
@@ -131,6 +133,12 @@ def runPhenoScore(adata, cond1, cond2, math, test, score_level,
     # check if count_layer exists
     if 'seq_depth_norm' not in adata.layers.keys():
         seqDepthNormalization(adata)
+
+    # evaluate library table to get targets and riase error if not present
+    required_columns = ['target', 'sequence']
+    missing_columns = list(set(required_columns) - set(adata.var.columns))
+    if len(missing_columns) > 0:
+        raise ValueError(f"Missing required columns in library table: {missing_columns}")
 
     # calc phenotype score and p-value
     if score_level == 'compare_reps':
@@ -149,33 +157,107 @@ def runPhenoScore(adata, cond1, cond2, math, test, score_level,
         # calculate growth score and p_value
         scores, p_values = matrixTest(
             x=x, y=y, x_ctrl=x_ctrl, y_ctrl=y_ctrl,
-            math=math, ave_reps=True, test=test, 
+            math=math, level='col', test=test, 
             growth_rate=growth_rate
         )
         # get adjusted p-values
         adj_p_values = getFDR(p_values)
+
         # get targets
-        targets = adata.var.index.str.split('_[-,+]_').str[0].to_list()
-        
+        targets = adata.var['target'].to_list()
+
         # combine results into a dataframe
         result = pd.concat([
             pd.Series(targets, index=adata.var.index, name='target'),
             pd.Series(scores, index=adata.var.index, name='score'),
         ], axis=1)
+
         if get_z_score:
             # z-score normalization
             result['z_score'] = calculateZScorePhenotypeScore(result, ctrl_label=ctrl_label)
         # add p-values
         result[f'{test} pvalue'] = p_values
         result['BH adj_pvalue'] = adj_p_values
-
-        return result_name, result
 
-    elif score_level in ['compare_oligos', 'compare_oligos_and_reps']:
-        return None
+    elif score_level in ['compare_guides']:
+        if n_reps == 2:
+            pass
+        else:
+            raise ValueError(f'n_reps "{n_reps}" not recognized')
+
+        find_low_counts(adata)
+        adata = adata[:,~adata.var.low_count].copy()
+
+        adata.var.drop(columns='low_count',inplace=True)
+
+        # generate pseudo gene labels
+        generatePseudoGeneLabels(adata, num_pseudogenes=num_pseudogenes, ctrl_label=ctrl_label)
+        # drop categories from target column!
+        adata.var['target'] = adata.var['target'].to_list()
+        # replace values in target columns for negative control with pseudogene label
+        adata.var.loc[
+            adata.var.targetType.eq(ctrl_label), 'target'
+        ] = adata.var.loc[adata.var.targetType.eq(ctrl_label), 'pseudoLabel']
+        # drop pseudoLabel column 
+        adata.var.drop(columns='pseudoLabel', inplace=True)
+
+        # prep counts for phenoScore calculation
+        df_cond1 = adata[adata.obs.query(f'condition=="{cond1}"').index].to_df().T
+        df_cond2 = adata[adata.obs.query(f'condition=="{cond2}"').index].to_df().T        # get control values
+
+        # get control values
+        x_ctrl = df_cond1[adata.var.targetType.eq(ctrl_label)].to_numpy()
+        y_ctrl = df_cond2[adata.var.targetType.eq(ctrl_label)].to_numpy()
+
+        targets = []
+        scores = []
+        p_values = []
+
+        # group by target genes or pseudogenes to aggregate counts for score calculation
+        for target_name, target_group in adata.var.groupby('target'):
+            # convert to numpy arrays
+            x = df_cond1.loc[target_group.index,:].to_numpy()
+            y = df_cond2.loc[target_group.index,:].to_numpy()
+            # Sort and find top n guide per target, see #18
+            if keep_top_n:
+                x.sort()
+                y.sort()
+                x = x[:keep_top_n]
+                y = y[:keep_top_n]
+
+            # calculate growth score and p_value
+            target_scores, target_p_values = matrixTest(
+                x=x, y=y, x_ctrl=x_ctrl, y_ctrl=y_ctrl,
+                math=math, level='row', test=test,
+                growth_rate=growth_rate
+            )
+
+            scores.append(target_scores)
+            p_values.append(target_p_values)
+            targets.append(target_name)
+
+        # combine results into a dataframe
+        result = pd.concat({
+            'replicate_1':pd.concat([
+                pd.Series([s1 for s1,_ in scores], index=targets, name='score'),
+                pd.Series([p1 for p1,_ in p_values], index=targets, name=f'{test} pvalue'),
+
+            ],axis=1),
+            'replicate_2':pd.concat([
+                pd.Series([s2 for _,s2 in scores], index=targets, name='score'),
+                pd.Series([p2 for _,p2 in p_values], index=targets, name=f'{test} pvalue'),
+            ],axis=1),
+            'replicate_ave':pd.concat([
+                pd.Series([np.mean([s1,s2]) for s1,s2 in scores], index=targets, name='score'),
+                pd.Series([np.mean([p1,p2]) for p1,p2 in p_values], index=targets, name=f'{test} pvalue'),
+            ],axis=1)
+        },axis=1)
 
     else:
         raise ValueError(f'score_level "{score_level}" not recognized')
+
+
+    return result_name, result
 
 
 def seqDepthNormalization(adata):
@@ -259,7 +341,7 @@ def runPhenoScoreForReplicate(screen, x_label, y_label, score, growth_factor_tab
 
             growth_rate=growth_factor_table.query(f'score=="{score}" & replicate=={replicate}')['growth_factor'].values[0],
             math=screen.math,
-            ave='row'  # there is only one column so `row` option here is equivalent to the value before averaging.
+            level='row'  # there is only one column so `row` option here is equivalent to the value before averaging.
         )
 
         if get_z_score:

diff --git a/screenpro/phenoStats.py b/screenpro/phenoStats.py
@@ -7,29 +7,32 @@
 from statsmodels.stats.multitest import multipletests
 
 
-def matrixStat(x, y, test, ave_reps):
+def matrixStat(x, y, test, level):
     """
     Get p-values comparing `y` vs `x` matrices.
     Args:
         x (np.array): array of values
         y (np.array): array of values
         test (str): test to use for calculating p-value
-        ave_reps (bool): average replicates
+        level (str): level at which to calculate p-value
     Returns:
         np.array: array of p-values
     """
     # calculate p-values
     if test == 'MW':
         # run Mann-Whitney U rank test
-        pass
+        raise ValueError('Mann-Whitney U rank test not implemented')
     elif test == 'ttest':
         # run ttest
-        if ave_reps:
-            # average across replicates
+        if level == 'col':
             p_value = ttest_rel(y, x, axis=1)[1]
-        else:
+        elif level == 'row':
+            p_value = ttest_rel(y, x, axis=0)[1]
+        elif level == 'all':
             # average across all values
             p_value = ttest_rel(y, x)[1]
+        else:
+            raise ValueError(f'Level "{level}" not recognized')
         return p_value
     else:
         raise ValueError(f'Test "{test}" not recognized')

diff --git a/screenpro/plotting.py b/screenpro/plotting.py
@@ -63,31 +63,34 @@ def draw_threshold(x, threshold, pseudo_sd):
     return threshold * pseudo_sd * (1 if x > 0 else -1) / abs(x)
 
 
-def prep_data(df_in, threshold):
+def prep_data(df_in, threshold, ctrl_label):
     df = df_in.copy()
 
-    df = ann_score_df(df, threshold=threshold)
+    df = ann_score_df(df, threshold=threshold, ctrl_label = ctrl_label)
 
     df['-log10(pvalue)'] = np.log10(df.pvalue) * -1
 
     return df
 
 
-def plot_volcano(ax, df_in, threshold, up_hit='resistance_hit', down_hit='sensitivity_hit', xlim_l=-5, xlim_r=5,
+def plot_volcano(ax, df_in, threshold, up_hit='resistance_hit', down_hit='sensitivity_hit', 
+                 ctrl_label = 'no-targeting',
+                 dot_size=1,
+                 xlim_l=-5, xlim_r=5,
                  ylim=6):
-    df = prep_data(df_in, threshold)
+    df = prep_data(df_in, threshold, ctrl_label)
 
     # Scatter plot for each category
     ax.scatter(df.loc[df['label'] == 'target_non_hit', 'score'],
                df.loc[df['label'] == 'target_non_hit', '-log10(pvalue)'],
-               alpha=0.1, s=1, c='black', label='target_non_hit')
+               alpha=0.1, s=dot_size, c='black', label='target_non_hit')
     ax.scatter(df.loc[df['label'] == up_hit, 'score'], df.loc[df['label'] == up_hit, '-log10(pvalue)'],
-               alpha=0.9, s=1, c='#fcae91', label=up_hit)
+               alpha=0.9, s=dot_size, c='#fcae91', label=up_hit)
     ax.scatter(df.loc[df['label'] == down_hit, 'score'], df.loc[df['label'] == down_hit, '-log10(pvalue)'],
-               alpha=0.9, s=1, c='#bdd7e7', label=down_hit)
+               alpha=0.9, s=dot_size, c='#bdd7e7', label=down_hit)
     ax.scatter(df.loc[df['label'] == 'non-targeting', 'score'],
                df.loc[df['label'] == 'non-targeting', '-log10(pvalue)'],
-               alpha=0.1, s=1, c='gray', label='non-targeting')
+               alpha=0.1, s=dot_size, c='gray', label='non-targeting')
 
     # Set x-axis and y-axis labels
     ax.set_xlabel('phenotype score')
@@ -103,8 +106,10 @@ def plot_volcano(ax, df_in, threshold, up_hit='resistance_hit', down_hit='sensit
     ax.legend()
 
 
-def label_as_black(ax, df_in, label, threshold, size=2, size_txt=None, t_x=.5, t_y=-0.1):
-    df = prep_data(df_in, threshold)
+def label_as_black(ax, df_in, label, threshold, size=2, size_txt=None, 
+                   ctrl_label = 'no-targeting',
+                   t_x=.5, t_y=-0.1):
+    df = prep_data(df_in, threshold, ctrl_label)
 
     target_data = df[df['target'] == label]
 
@@ -122,8 +127,10 @@ def label_as_black(ax, df_in, label, threshold, size=2, size_txt=None, t_x=.5, t
                     color='black', size=size_txt)
 
 
-def label_sensitivity_hit(ax, df_in, label, threshold, size=2, size_txt=None, t_x=-.5, t_y=-0.1):
-    df = prep_data(df_in, threshold)
+def label_sensitivity_hit(ax, df_in, label, threshold, size=2, size_txt=None,
+                          ctrl_label = 'no-targeting',
+                          t_x=.5, t_y=-0.1):
+    df = prep_data(df_in, threshold, ctrl_label)
 
     target_data = df[df['target'] == label]
 
@@ -141,8 +148,10 @@ def label_sensitivity_hit(ax, df_in, label, threshold, size=2, size_txt=None, t_
                     color='black', size=size_txt)
 
 
-def label_resistance_hit(ax, df_in, label, threshold, size=2, size_txt=None, t_x=.5, t_y=-0.1):
-    df = prep_data(df_in, threshold)
+def label_resistance_hit(ax, df_in, label, threshold, size=2, size_txt=None, 
+                         ctrl_label = 'no-targeting',
+                         t_x=.5, t_y=-0.1):
+    df = prep_data(df_in, threshold, ctrl_label)
 
     target_data = df[df['target'] == label]