In [2]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm, AnovaRM
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [3]:
# import data
data = pd.ExcelFile('/Users/jessebecklevisohn/Documents/GitHub/group3-project/Cleaned_data.xlsx')

In [43]:
#loading all sheets as seperate dataframe objects

mcav_control = pd.read_excel(data, sheet_name ='Mcav Control')

mcav_elevated_PCO2 = pd.read_excel(data, sheet_name ='Mcav Elevated PCO2')

ofav_control = pd.read_excel(data, sheet_name ='Ofav Control')

ofav_elevated_PCO2 = pd.read_excel(data, sheet_name = 'Ofav Elevated P CO2')

pcli_control = pd.read_excel(data, sheet_name = 'Pcli Control')

pcli_elevated_PCO2 = pd.read_excel(data, sheet_name = 'Pcli Elevated PCO2')

In [121]:
# Combines control and elevated pCO2 sheets into one pandas dataframe
mcav = pd.concat([mcav_control, mcav_elevated_PCO2], axis=0).reset_index(drop=True)
ofav = pd.concat([ofav_control, ofav_elevated_PCO2], axis=0).reset_index(drop=True)
pcli = pd.concat([pcli_control, pcli_elevated_PCO2], axis=0).reset_index(drop=True)

In [122]:
# Adding all the sheets to a list
species = {"mcav":mcav, "ofav":ofav, "pcli":pcli}

In [123]:
# Rename the columns of the sheet
for key in species:
    species[key] = species[key].rename(columns={"Percent change Surface Area Density": "SAD", 
                                  "Calcification rate (mgCaCO3/cm2/day)": "calcification_rate", 
                                  "Tissue growth (mm2/day)": "tissue_growth",
                                  "Exposure Period": "exposure_period"})

In [124]:
# Get rid of all null values in the columns that we care about
for key in species:
    species[key] = species[key].loc[
        (species[key]["calcification_rate"].notnull()) & 
        (species[key]["tissue_growth"].notnull()) & 
        (species[key]["SAD"].notnull()) &
        (species[key]["exposure_period"].notnull()),:]

In [125]:
# Set up an empty dictionary
models = {"mcav":[], "ofav": [], "pcli": []}

# Create a list of the formulas for the regressions
formulas = ["calcification_rate ~ -1 + Elevated + exposure_period", "tissue_growth ~ -1 + Elevated + exposure_period", "SAD ~ -1 + Elevated + exposure_period"]

# Iterate over each species and each dependant variable that we care about
for key in species:
    for f in formulas:
        model = ols(formula=f, data=species[key]).fit() # Fit a ols regression for it 
        models[key].append(anova_lm(model)) # Append the ols to a list for each species in the dictionary

In [126]:
dependent_variables = ["calcification_rate", "tissue_growth", "SAD"]
tukeys = {"mcav":{}, "ofav": {}, "pcli": {}}

# Iterate over each species and each dependant variable that we care about
for key in species:
    for var in dependent_variables:
        tukey = pairwise_tukeyhsd(endog=species[key][var], groups=species[key]["Elevated"], alpha=0.05) # Conduct a Tukey Test across the pCO2 levels 
        tukeys[key][var] = tukey # Save the result of the tukey test to a dictionary of dictionaries

print(tukeys["mcav"]["calcification_rate"])

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower  upper  reject
--------------------------------------------------
     0      1   0.0935   0.9 -1.6122 1.7991  False
--------------------------------------------------


In [136]:
print(AnovaRM(data=species["mcav"], depvar='SAD', subject='Sample', within=['Elevated', 'exposure_period']).fit())

dependent_variables = ["calcification_rate", "tissue_growth", "SAD"]
rms = {"mcav":{}, "ofav": {}, "pcli": {}}

# Iterate over each species and each dependant variable that we care about
for key in species:
    for var in dependent_variables:
        # Conduct a repeated measures ANOVA and save the result to a dictionary of dictionaries
        rms[key][var] = AnovaRM(data=species[key], depvar=var, subject='Sample', within=['Elevated', 'exposure_period']).fit() 

                        Anova
                         F Value Num DF  Den DF Pr > F
------------------------------------------------------
Elevated                  0.6998 1.0000  9.0000 0.4245
exposure_period          72.4414 2.0000 18.0000 0.0000
Elevated:exposure_period  0.6440 2.0000 18.0000 0.5369

                        Anova
                         F Value Num DF  Den DF Pr > F
------------------------------------------------------
Elevated                  0.0253 1.0000  9.0000 0.8770
exposure_period          13.0731 2.0000 18.0000 0.0003
Elevated:exposure_period  0.1015 2.0000 18.0000 0.9040

                        Anova
                         F Value Num DF  Den DF Pr > F
------------------------------------------------------
Elevated                  1.6910 1.0000  9.0000 0.2258
exposure_period          93.3113 2.0000 18.0000 0.0000
Elevated:exposure_period  0.8247 2.0000 18.0000 0.4543

                        Anova
                         F Value Num DF  Den DF Pr >

ValueError: Data is unbalanced.