In [10]:
import pandas as pd
from scipy.stats import barnard_exact, ttest_ind
import numpy as np
import statsmodels.formula.api as smf

In [11]:
def read_data():
    df = pd.read_excel('data.xlsx')
    df = pd.merge(df, pd.get_dummies(df['bed_type'], drop_first=True, dtype=float), left_index=True, right_index=True)
    display(df.head())
    display(df.info())
    
    return df

df = read_data()

OSError: [Errno 22] Invalid argument: 'data.xlsx'

# Survival to harvest

In [None]:
def survive_to_harvest_analysis(df):
    survival_contingency = pd.crosstab(df.bed_type, df.survived_to_harvest)
    display(survival_contingency)
    # Barnard becuase it is more powerful than Fisher and only one margin is fixed
    # unpooled as unknown if variences are the same for each group. Pooling assumes the same varience. https://cran.r-project.org/web/packages/Exact/Exact.pdf , https://stats.stackexchange.com/questions/169864/which-test-for-cross-table-analysis-boschloo-or-barnard
    display(barnard_exact(survival_contingency, alternative='two-sided', pooled=False, n=32))

survive_to_harvest_analysis(df)

survived_to_harvest,0,1
bed_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Dig,2,7
No-dig,3,6


BarnardExactResult(statistic=-0.5344572305152208, pvalue=0.7294983244707437)

# Yield

## Simple hypothesis testing

In [None]:
def yield_analysis(df):
    print(f'There are {len(df)} samples.')
    mask = df['yield_g'].isna()
    print(f'Excluding {sum(mask)} samples with nan yield.')
    df = df[mask==False]
    print(f'{len(df)} samples remain.')

    a = df[df['bed_type']=='Dig']['yield_g']
    b = df[df['bed_type']=='No-dig']['yield_g']

    print()
    print(f'The Dig yield mean is {a.mean()} g')
    print(f'The No-dig yield mean is {b.mean()} g')

    display(ttest_ind(a=a, b=b, equal_var=True, nan_policy='raise', permutations=None, alternative='two-sided', trim=0))

yield_analysis(df)

There are 18 samples.
Excluding 5 samples with nan yield.
13 samples remain.

The Dig yield mean is 90.21428571428571 g
The No-dig yield mean is 67.55 g


TtestResult(statistic=0.5179628986230862, pvalue=0.614741985581841, df=11.0)

## Linear regression

In [12]:
def drop_na(df):
    print(f'There are {len(df)} samples.')
    mask = df['yield_g'].isna()
    print(f'Excluding {sum(mask)} samples with nan yield.')
    df = df[mask==False]
    print(f'{len(df)} samples remain.')
    return df


def yield_linear_regression(df):
    df = drop_na(df)

    mod = smf.ols('yield_g ~ bed_type + wall_distance_e_mm + wall_distance_s_mm', data=df)
    res = mod.fit()

    print(res.summary())
    
yield_linear_regression(df)

There are 18 samples.
Excluding 5 samples with nan yield.
13 samples remain.
                            OLS Regression Results                            
Dep. Variable:                yield_g   R-squared:                       0.201
Model:                            OLS   Adj. R-squared:                 -0.066
Method:                 Least Squares   F-statistic:                    0.7540
Date:                Sun, 28 Jul 2024   Prob (F-statistic):              0.547
Time:                        20:57:53   Log-Likelihood:                -72.805
No. Observations:                  13   AIC:                             153.6
Df Residuals:                       9   BIC:                             155.9
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------

