# Differential expression: Post-Hoc Analysis Using Pairwise Tukey HSD
An F-test will tell us whether there is a significant difference across several conditions, but it doesn't tell us which conditions are different. Typically we want to do a post hoc test to tell us which conditions are significantly different from one another.

In [1]:
import json

import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## 1. Load Data and Experimental Groups

In [2]:
# Expression table; the first CSV column is used as the row index.
present_transcripts_df = pd.read_csv('../../data/expression_by_probe.csv', index_col=0)

# Transcripts to run the post hoc test on. The file is opened in a context
# manager so the handle is closed as soon as the JSON has been parsed.
with open('../../results/diff_exp_genes_list.json') as diff_exp_genes_file:
    diff_exp_genes_list = json.load(diff_exp_genes_file)

## 2. Significant Comparisons Extraction Function
Parses the results of `statsmodels.stats.multicomp.pairwise_tukeyhsd` for smoother processing of many
comparisons at once. Returns a pandas DataFrame with the adjusted p-value, the reject decision, and the confidence interval for each pair of conditions.

In [3]:
def tukeyhsd_get_significant_comparisons(tukeyhsd_results, only_sig_comparisons=True):
    '''
    Flatten a pairwise Tukey HSD result into one row per pair of groups.

    tukeyhsd_results - output of statsmodels.stats.multicomp.pairwise_tukeyhsd
    only_sig_comparisons - boolean value for whether to only include comparisons where we can
                            reject the null hypothesis; pass False to keep every comparison

    Returns a pandas DataFrame indexed by '<group1>_<group2>' with the columns
    'p-adj', 'reject' and 'CI', where 'CI' holds a [lower, upper] list.
    '''
    # NOTE: _results_table is a private statsmodels attribute. Its first row
    # holds the column headers and the remaining rows hold one comparison each.
    table_rows = tukeyhsd_results._results_table.data
    summary_df = pd.DataFrame(table_rows[1:], columns=table_rows[0])

    desired_comparisons = [True] if only_sig_comparisons else [True, False]

    # Work on an explicit copy so adding a column below never writes through
    # to (or warns about) summary_df.
    comparisons = summary_df.loc[:, ['p-adj', 'reject']].copy()

    # Add the confidence interval as one [lower, upper] list per comparison.
    comparisons['CI'] = summary_df.loc[:, ['lower', 'upper']].values.tolist()

    # Format the labels instead of concatenating them so that non-string
    # group labels (e.g. integers) do not raise a TypeError.
    comparisons.index = [
        f'{group1}_{group2}'
        for group1, group2 in zip(summary_df['group1'], summary_df['group2'])
    ]

    # Positional boolean mask: the index was just replaced, so select by
    # position rather than by label alignment.
    keep = summary_df['reject'].isin(desired_comparisons).to_numpy()
    return comparisons.loc[keep]

## 3. Get the Significant Results

In [4]:
# comparison -> {transcript: adjusted p-value}, significant comparisons only
post_hoc_diff_dict = {}
# comparison -> {transcript: adjusted p-value}, every comparison (for later volcano plots)
post_hoc_dict = {}

# Group label for each value in a transcript's row; built once because it is
# the same for every transcript.
# NOTE(review): assumes the expression table has exactly 12 columns ordered as
# 3x Control, 3x IL2, 3x IL15, 3x IL21 - confirm against the CSV.
group_labels = ['Control'] * 3 + ['IL2'] * 3 + ['IL15'] * 3 + ['IL21'] * 3

for transcript in diff_exp_genes_list:
    # Use Statsmodels to do the actual math
    tukey_summary = pairwise_tukeyhsd(
        present_transcripts_df.loc[transcript].to_numpy(),
        group_labels,
        alpha=0.01)

    # Parse the statsmodels result once, keeping every comparison; the
    # non-significant ones are needed for the volcano plot later.
    tukey_results_w_rejected = tukeyhsd_get_significant_comparisons(
        tukey_summary, only_sig_comparisons=False)
    # The significant subset is the rows whose null hypothesis was rejected,
    # so it can be taken from the full table instead of re-parsing the result.
    tukey_results = tukey_results_w_rejected.loc[
        tukey_results_w_rejected['reject'].astype(bool).to_numpy()]

    # Make dict with only significant results so we can look at them
    for comparison, p_adj in tukey_results['p-adj'].items():
        post_hoc_diff_dict.setdefault(comparison, {})[transcript] = p_adj

    # Make a dict with rejected transcripts included for later volcano plots
    for comparison, p_adj in tukey_results_w_rejected['p-adj'].items():
        post_hoc_dict.setdefault(comparison, {})[transcript] = p_adj

### Significantly differentially expressed genes after post hoc analysis:

In [5]:
# Report how many transcripts were significant for each pairwise comparison.
for condition, significant_genes in post_hoc_diff_dict.items():
    gene_count = len(significant_genes)
    print(f'{condition}: {gene_count} genes')

Control_IL15: 1678 genes
IL15_IL21: 1446 genes
Control_IL2: 1737 genes
IL2_IL21: 1519 genes
IL15_IL2: 44 genes
Control_IL21: 192 genes


## 4. Save Results for Further Use

In [6]:
# Write both dictionaries inside context managers so each file is flushed and
# closed as soon as it is written, rather than whenever the handle is collected.
with open('../../results/post_hoc_diff_dict.json', 'w') as post_hoc_diff_file:
    json.dump(post_hoc_diff_dict, post_hoc_diff_file, indent=2)

with open('../../results/post_hoc_dict.json', 'w') as post_hoc_file:
    json.dump(post_hoc_dict, post_hoc_file, indent=2)