https://www.pythonfordatascience.org/factorial-anova-python/

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from itertools import permutations, product, combinations
from scipy.stats import pearsonr, spearmanr, mannwhitneyu, ks_2samp

from itertools import permutations
from itertools import combinations

In [None]:
#https://www.scribbr.com/statistics/two-way-anova/
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multitest as multi

In [None]:
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning, ValueWarning
# Silence these warning categories for the whole session. RuntimeWarning is
# the built-in category, so this is not limited to statsmodels messages.
for _category in (ConvergenceWarning, HessianInversionWarning, ValueWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_category)

In [None]:
def FC(group1, group2):
    """Return the difference of the group means scaled by their absolute sum.

    Computes ``(mean(group2) - mean(group1)) / |mean(group2) + mean(group1)|``.
    The result is positive when ``group2`` has the larger mean.  No guard is
    applied for a zero denominator.
    """
    mean1 = np.mean(group1)
    mean2 = np.mean(group2)
    difference = mean2 - mean1
    total = mean2 + mean1
    return difference / abs(total)

## Settings

In [None]:
# Analysis label. It is interpolated into the input file name and into every
# output file name, so it selects which data set is processed.
# Exactly one of the assignments below should be active.
#analysis = "Fastcore"
analysis = "iMAT"

#analysis = "gimme"
#analysis = "init"
#analysis = "tinit"

In [None]:
# Analysis type label. Together with `analysis` it forms the
# "<analysis_type>_<analysis>" stem of the input and output file names.
# Exactly one of the assignments below should be active.
#analysis_type = "FVA"
#analysis_type = "pFBA"
analysis_type = "sampling"

### Sum of squares type

In [None]:
ss_type = 3

# Full three-factor model with Sum (deviation) contrasts, chosen when
# ss_type is 3.
_SUM_CODED_FORMULA = (
    "activity ~ C(gender, Sum) + C(genotype, Sum) + C(diet, Sum)"
    " + C(gender, Sum)*C(genotype, Sum)"
    " + C(gender, Sum)*C(diet, Sum)"
    " + C(genotype, Sum)*C(diet, Sum)"
    " + C(gender, Sum)*C(genotype, Sum)*C(diet, Sum)"
)
# Main-effects-only alternative:
#formula = "activity ~ C(gender, Sum) + C(genotype, Sum) + C(diet, Sum)"

# Same model with the default categorical coding, used for any other ss_type.
_DEFAULT_CODED_FORMULA = (
    "activity ~ C(gender) + C(genotype) + C(diet)"
    " + C(gender)*C(genotype)"
    " + C(gender)*C(diet)"
    " + C(genotype)*C(diet)"
    " + C(gender)*C(genotype)*C(diet)"
)

formula = _SUM_CODED_FORMULA if ss_type == 3 else _DEFAULT_CODED_FORMULA

### Number of reactions to observe 
Keep only the first `n_reactions` reactions; set `n_reactions` to zero to keep all of them.

In [None]:
# Number of leading reactions to keep after filtering.
# A falsy value (0) disables the truncation.
n_reactions = 0 # all reactions will be included
#n_reactions = 10

## Read the data

In [None]:
from pathlib import Path

# Input table: data/<analysis_type>_<analysis>.csv, semicolon-separated.
# The path is assembled with pathlib because a literal "data\\" prefix is a
# directory separator only on Windows; elsewhere it would be read as part of
# the file name and the file would not be found.
df = pd.read_csv(Path("data") / (analysis_type + "_" + analysis + ".csv"), sep=";")

In [None]:
# Every column after the first one is treated as a model (sample) column.
models = df.columns[1:].tolist()
#models = list(map(lambda x: x.split("_")[1].split(".")[0], models))

In [None]:
# Re-apply the header: identifier column first, then the model names.
# (Only changes anything when `models` was rewritten above.)
df.columns = [df.columns[0], *models]

#### Convert values to float and replace nans with zeros

In [None]:
# Cast every model column to float, then replace missing values by 0.
as_float = df[models].astype(float)
df[models] = as_float
df = df.fillna(value=0)

### Filter the reactions

#### Remove the reactions that are always the same

In [None]:
#df[models].eq(df[models].iloc[:, 0], axis=0).all(axis=1)

In [None]:
# A reaction is constant when every model column equals the first model column.
first_model = df[models].iloc[:, 0]
is_constant = df[models].eq(first_model, axis=0).all(axis=1)
df = df[~is_constant]

In [None]:
#df = df.loc[~(df[df.columns[1:]]==0).all(axis=1)]
#df = df.loc[~(df[df.columns[1:]]==1).all(axis=1)]

#### If `n_reactions` is not zero, retain only the first `n_reactions` reactions

In [None]:
# Truncate to the leading rows only when a non-zero limit was configured.
df = df.head(n_reactions) if n_reactions else df

# Groups

## Grouping by genotype

In [None]:
# Sample IDs split by genotype. The IDs are matched against the column
# names of the input table further down.
# WT group
genotype0 = ["GSM1405493","GSM1405505","GSM1405517", 
              "GSM1405489","GSM1405501","GSM1405513",
              "GSM1405485","GSM1405497","GSM1405509",
              "GSM1405494","GSM1405506","GSM1405518",
              "GSM1405490","GSM1405502","GSM1405514",
              "GSM1405486","GSM1405498","GSM1405510"]
# KO group
genotype1 = ["GSM1405495","GSM1405507","GSM1405519",
              "GSM1405491","GSM1405503","GSM1405515",
              "GSM1405487","GSM1405499","GSM1405511",
              "GSM1405496","GSM1405508","GSM1405520",
              "GSM1405492","GSM1405504","GSM1405516",
              "GSM1405488","GSM1405500","GSM1405512"]
# Order matters: it is zipped with the ("WT", "KO") labels later on.
genotype = (genotype0, genotype1)

## Grouping by diet

In [None]:
# Sample IDs split by diet.
# LFnC group
diet0 = ["GSM1405485","GSM1405497","GSM1405509","GSM1405487","GSM1405499","GSM1405511",
         "GSM1405486","GSM1405498","GSM1405510","GSM1405488","GSM1405500","GSM1405512"]

# HFnC group
diet1 = ["GSM1405489","GSM1405501","GSM1405513","GSM1405491","GSM1405503","GSM1405515",
         "GSM1405490","GSM1405502","GSM1405514","GSM1405492","GSM1405504","GSM1405516"]

# HFC group
diet2 = ["GSM1405493","GSM1405505","GSM1405517","GSM1405495","GSM1405507","GSM1405519",
         "GSM1405494","GSM1405506","GSM1405518","GSM1405496","GSM1405508","GSM1405520"]

# Order matters: it is zipped with the ("LFnC", "HFnC", "HFC") labels later on.
diet = (diet0, diet1, diet2)

## Grouping by gender

In [None]:
# Sample IDs split by gender. The two lists together also define the set of
# observed samples that is kept from the input table.
# F group
gender0 = ["GSM1405493","GSM1405505","GSM1405517",
           "GSM1405489","GSM1405501","GSM1405513",
           "GSM1405485","GSM1405497","GSM1405509",
           "GSM1405495","GSM1405507","GSM1405519",
           "GSM1405491","GSM1405503","GSM1405515",
           "GSM1405487","GSM1405499","GSM1405511"]

# M group
gender1 = ["GSM1405494","GSM1405506","GSM1405518",
           "GSM1405490","GSM1405502","GSM1405514",
           "GSM1405486","GSM1405498","GSM1405510",
           "GSM1405496","GSM1405508","GSM1405520",
           "GSM1405492","GSM1405504","GSM1405516",
           "GSM1405488","GSM1405500","GSM1405512"]

# Order matters: it is zipped with the ("F", "M") labels later on.
gender = (gender0, gender1)

## Groups

In [None]:
# Factor name -> tuple of sample-ID lists, one list per level.
groups = {
    "genotype": genotype,
    "diet": diet,
    "gender": gender,
}
# Factor name -> level labels, in the same order as the lists above.
labels = {
    "genotype": ("WT","KO"),
    "diet": ("LFnC", "HFnC", "HFC"),
    "gender": ("F","M"),
}

In [None]:
# Re-key the groups by level label: {factor: {label: [sample IDs]}}.
d = {
    factor: dict(zip(labels[factor], groups[factor]))
    for factor in groups
}
groups = d

## Retain only observed models

In [None]:
# All samples that belong to either gender group, F first.
observed = [*gender0, *gender1]

In [None]:
# Keep the identifier column plus the observed sample columns, in that order.
kept_columns = [df.columns[0], *observed]
df = df[kept_columns]

In [None]:
# Preview the first rows of the filtered table.
df.head()

# Organize the data

In [None]:
# Reshape from wide (one column per model) to long (one row per
# reaction/model pair). The identifier column is addressed by position, as
# in the rest of the notebook, instead of by the literal name "rxns", so an
# input file with a differently named first column no longer fails here.
df2 = pd.melt(df, id_vars=[df.columns[0]])
df2.columns = ['rxn', 'model', 'activity']

# already did this
## convert activities to float
#df2['activity'] = df2['activity'].str.replace(",",".")
#df2['activity'] = df2['activity'].astype(float)

## replace nans with zero
#df2['activity'] = df2['activity'].fillna(0)

In [None]:
# Add one column per factor holding the level label of each row's model.
for factor_label in groups:
    levels_of_factor = groups[factor_label]
    for group_label in levels_of_factor:
        in_group = df2['model'].isin(levels_of_factor[group_label])
        df2.loc[in_group, factor_label] = group_label
        

In [None]:
# Distinct reaction identifiers, in order of first appearance.
rxns = df2['rxn'].unique()
len(rxns)

# Test pairs

In [None]:
# Level labels of each factor, in definition order.
genotypes = list(groups['genotype'])
genders = list(groups['gender'])
diets = list(groups['diet'])

In [None]:
# Every unordered pair of levels within each factor.
genotype_pairs, gender_pairs, diet_pairs = (
    list(combinations(levels, 2))
    for levels in (genotypes, genders, diets)
)

In [None]:
# Spot check: activities of reaction '34DHPHEt_b' over the F models.
df2[(df2['rxn'] == '34DHPHEt_b')&(df2['gender'] == 'F')].activity.values

In [None]:
# Spot check: activities of reaction '34DHPHEt_b' over the M models.
df2[(df2['rxn'] == '34DHPHEt_b')&(df2['gender'] == 'M')].activity.values

## Everything together

In [None]:
"""
GENOTYPE
"""
# ---------------------------------------------------------------------------
# Pairwise comparison tables for all three factors (genotype, gender, diet).
#
# For a compared factor, every pair of its levels is tested per reaction,
# once inside each combination of the two remaining factors' levels and once
# with each of them left unstratified (the "" level).  Each comparison adds
# five columns to the factor's table, named after
#     <outer level>_<inner level>_(<g1>:<g2>)
#   <name>       Mann-Whitney U p-value
#   <name>[ks]   two-sample Kolmogorov-Smirnov p-value
#   ...(g1<g2)   directional p-value for "g1 below g2"
#   ...(g1>g2)   directional p-value for "g1 above g2"
#   <name>(FC)   FC(g1 values, g2 values)
#
# The three factors share one loop, so the loop variables no longer overwrite
# the `gender`, `genotype` and `diet` tuples defined in earlier cells.
# ---------------------------------------------------------------------------


def _activities_by_reaction(frame):
    """Map every reaction in *frame* to its activities, sorted ascending.

    Grouping once per subset avoids a full boolean scan of the frame for
    every single reaction.
    """
    return {
        rxn: sorted(values.to_numpy())
        for rxn, values in frame.groupby('rxn', sort=False)['activity']
    }


def _directional_p_values(u1, u2):
    """Return ``(p_less, p_greater)`` for two arrays of distinct sorted values.

    With a single distinct value on each side the direction is reported
    directly as 0/1.  Otherwise the values are concatenated and correlated
    (Spearman) with their position; a negative correlation means the tested
    ordering does not hold, so that p-value is forced to 1.
    """
    if len(u1) == 1 and len(u2) == 1:
        if u1[0] < u2[0]:
            return 0, 1
        if u1[0] > u2[0]:
            return 1, 0
        return 1, 1

    positions = np.arange(len(u1) + len(u2))

    rho_less, p_less = spearmanr(positions, np.concatenate((u1, u2)))
    if rho_less < 0:
        p_less = 1

    rho_greater, p_greater = spearmanr(positions, np.concatenate((u2, u1)))
    if rho_greater < 0:
        p_greater = 1

    return p_less, p_greater


factor_levels = {"genotype": genotypes, "gender": genders, "diet": diets}
factor_pairs = {"genotype": genotype_pairs, "gender": gender_pairs, "diet": diet_pairs}
pair_tables = {}

# (compared factor, outer stratifying factor, inner stratifying factor)
for target, outer_factor, inner_factor in (("genotype", "gender", "diet"),
                                           ("gender", "genotype", "diet"),
                                           ("diet", "gender", "genotype")):
    table_columns = {'rxn': rxns}

    # "" stands for "do not stratify by this factor"
    for outer_level in factor_levels[outer_factor] + [""]:
        outer_rows = df2[df2[outer_factor] == outer_level] if outer_level else df2

        for inner_level in factor_levels[inner_factor] + [""]:
            rows = outer_rows[outer_rows[inner_factor] == inner_level] if inner_level else outer_rows

            prefix = ""
            if outer_level:
                prefix += outer_level + "_"
            if inner_level:
                prefix += inner_level + "_"

            sorted_activities = {
                level: _activities_by_reaction(rows[rows[target] == level])
                for level in factor_levels[target]
            }

            for g1, g2 in factor_pairs[target]:
                stats = []

                for reaction in rxns:
                    r1 = sorted_activities[g1].get(reaction, [])
                    r2 = sorted_activities[g2].get(reaction, [])

                    if r1 == r2:
                        # identical samples: nothing to test
                        stats.append((1, 1, 1, 1, 0))
                        continue

                    mw = mannwhitneyu(r1, r2)[1]
                    ks = ks_2samp(r1, r2)[1]
                    fc = FC(r1, r2)

                    # omit repeats of the same values within a group
                    # (r1/r2 stay module-level names on purpose: a scratch
                    # cell further down inspects the last compared pair)
                    r1 = np.unique(r1)
                    r2 = np.unique(r2)
                    p_R1, p_R2 = _directional_p_values(r1, r2)

                    stats.append((mw, ks, p_R1, p_R2, fc))

                # float dtype throughout, also when every entry is a plain 0/1
                stats = np.array(stats, dtype=float).reshape(-1, 5)

                column = prefix + "(" + g1 + ":" + g2 + ")"
                table_columns[column] = stats[:, 0]
                table_columns[column + "[ks]"] = stats[:, 1]
                table_columns[column.replace(":", "<")] = stats[:, 2]
                table_columns[column.replace(":", ">")] = stats[:, 3]
                table_columns[column + "(FC)"] = stats[:, 4]

    # build each table in one go instead of writing it cell by cell
    pair_tables[target] = pd.DataFrame(table_columns)

df_genotype = pair_tables["genotype"]
df_gender = pair_tables["gender"]
df_diet = pair_tables["diet"]
                
                
    
        

In [None]:
# Display the genotype comparison table.
df_genotype

## Save the results

In [None]:
from pathlib import Path

# Write the unadjusted comparison tables to
# results_pairs/<analysis_type>_<analysis>_<factor>_compare.csv.
# pathlib keeps the location portable (a literal "results_pairs\\" prefix is
# a directory only on Windows), and the directory is created when missing so
# the computed results are not lost to a failed write.
results_dir = Path("results_pairs")
results_dir.mkdir(parents=True, exist_ok=True)
file_stem = analysis_type + "_" + analysis

df_genotype.to_csv(results_dir / (file_stem + "_genotype_compare.csv"), index=False)
df_gender.to_csv(results_dir / (file_stem + "_gender_compare.csv"), index=False)
df_diet.to_csv(results_dir / (file_stem + "_diet_compare.csv"), index=False)

In [None]:
def _fdr_adjust(table):
    """Return a copy of *table* with Benjamini-Hochberg adjusted p-values.

    The first column (reaction identifier) is left alone, and so are the
    "(FC)" columns: they hold FC() effect sizes rather than p-values, so a
    multiple-testing correction must not be applied to them.
    """
    adjusted = table.copy()
    for name in adjusted.columns[1:]:
        if name.endswith("(FC)"):
            continue
        adjusted[name] = multi.multipletests(adjusted[name], method='fdr_bh')[1]
    return adjusted


df_genotype_q = _fdr_adjust(df_genotype)
df_gender_q = _fdr_adjust(df_gender)
df_diet_q = _fdr_adjust(df_diet)

In [None]:
from pathlib import Path

# Write the FDR-adjusted tables to
# results_pairs/<analysis_type>_<analysis>_<factor>_compare_q.csv.
# pathlib keeps the location portable (a literal "results_pairs\\" prefix is
# a directory only on Windows); the directory is created when missing.
results_dir = Path("results_pairs")
results_dir.mkdir(parents=True, exist_ok=True)
file_stem = analysis_type + "_" + analysis

df_genotype_q.to_csv(results_dir / (file_stem + "_genotype_compare_q.csv"), index=False)
df_gender_q.to_csv(results_dir / (file_stem + "_gender_compare_q.csv"), index=False)
df_diet_q.to_csv(results_dir / (file_stem + "_diet_compare_q.csv"), index=False)

In [None]:
# Reactions with at least one value below 0.05 among the p-value columns.
# The "(FC)" columns are excluded from the threshold test: they hold effect
# sizes, and a zero or negative FC would otherwise mark the row as a hit.
p_value_columns = [c for c in df_genotype_q.columns[1:] if not c.endswith("(FC)")]
df_genotype_q[(df_genotype_q[p_value_columns] < 0.05).any(axis=1)]

In [None]:
# Scratch check: Spearman correlation between position and the values of r1
# followed by r2.
# NOTE(review): r1 and r2 are not defined in this cell; it uses whatever an
# earlier cell left in the session - confirm before relying on the output.
spearmanr(np.arange(len(r1) + len(r2)), np.concatenate((r1, r2)))

In [None]:
# Scratch experiment: walk through the directional test on a hand-made pair
# of groups. Note that this overwrites the session-level r1 and r2.
r1 = np.array([0, 1])
r2 = np.array([3])

r1 = sorted(r1)
r2 = sorted(r2)

# drop repeated values within each group
r1 = np.unique(r1)
r2 = np.unique(r2)

print(r1,r2)

# a single distinct value per group: print the direction directly
if len(r1)==1 and len(r2)==1:
    if r1 < r2:
        print((0,1))
    else:
        print((1,0))
        
# position-vs-value correlation for both concatenation orders
print("Spearman")
print(np.concatenate((r1, r2)), end=": ")
print(spearmanr(np.arange(len(r1) + len(r2)), np.concatenate((r1, r2))))
print(np.concatenate((r2, r1)), end=": ")
print(spearmanr(np.arange(len(r1) + len(r2)), np.concatenate((r2, r1))))

# same comparison with Pearson correlation
print("Pearson")
print(np.concatenate((r1, r2)), end=": ")
print(pearsonr(np.arange(len(r1) + len(r2)), np.concatenate((r1, r2))))
print(np.concatenate((r2, r1)), end=": ")
print(pearsonr(np.arange(len(r1) + len(r2)), np.concatenate((r2, r1))))



In [None]:
# Scratch check: Spearman correlation of two short, strictly increasing lists.
spearmanr([0,1,3],[2,3,4])