In [7]:
import pandas as pd
from scipy.stats import barnard_exact, ttest_ind
import statsmodels.api as sm
import numpy as np

In [6]:
def read_data():
    df = pd.read_excel('data.xlsx')
    df = pd.merge(df, pd.get_dummies(df['bed_type'], drop_first=True, dtype=float), left_index=True, right_index=True)
    display(df.head())
    display(df.info())
    
    return df

df = read_data()

Unnamed: 0,plant_id,bed_id,bed_type,height_30d_mm,height_30d_date,survived_to_harvest,yield_g,harvest_date,wall_distance_e_mm,wall_distance_s_mm,no-dig
0,1,A,dig,300.0,2024-04-25,1,151.9,2024-07-06,3700,600,0.0
1,2,A,dig,120.0,2024-04-25,1,6.5,2024-07-06,3550,600,0.0
2,3,A,dig,280.0,2024-04-25,1,87.9,2024-07-06,3400,600,0.0
3,4,A,dig,230.0,2024-04-25,1,59.2,2024-07-06,3250,600,0.0
4,5,A,dig,160.0,2024-04-25,1,10.3,2024-07-06,3100,600,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   plant_id             18 non-null     int64         
 1   bed_id               18 non-null     object        
 2   bed_type             18 non-null     object        
 3   height_30d_mm        14 non-null     float64       
 4   height_30d_date      18 non-null     datetime64[ns]
 5   survived_to_harvest  18 non-null     int64         
 6   yield_g              13 non-null     float64       
 7   harvest_date         13 non-null     datetime64[ns]
 8   wall_distance_e_mm   18 non-null     int64         
 9   wall_distance_s_mm   18 non-null     int64         
 10  no-dig               18 non-null     float64       
dtypes: datetime64[ns](2), float64(3), int64(4), object(2)
memory usage: 1.7+ KB


None

# Survival to harvest

In [3]:
def survive_to_harvest_analysis(df):
    survival_contingency = pd.crosstab(df.bed_type, df.survived_to_harvest)
    display(survival_contingency)
    # Barnard becuase it is more powerful than Fisher and only one margin is fixed
    # unpooled as unknown if variences are the same for each group. Pooling assumes the same varience. https://cran.r-project.org/web/packages/Exact/Exact.pdf , https://stats.stackexchange.com/questions/169864/which-test-for-cross-table-analysis-boschloo-or-barnard
    display(barnard_exact(survival_contingency, alternative='two-sided', pooled=False, n=32))

survive_to_harvest_analysis(df)

survived_to_harvest,0,1
bed_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Dig,2,7
No-dig,3,6


BarnardExactResult(statistic=-0.5344572305152208, pvalue=0.7294983244707437)

# Yield

## Simple hypothesis testing

In [4]:
def yield_analysis(df):
    print(f'There are {len(df)} samples.')
    mask = df['yield_g'].isna()
    print(f'Excluding {sum(mask)} samples with nan yield.')
    df = df[mask==False]
    print(f'{len(df)} samples remain.')

    a = df[df['bed_type']=='Dig']['yield_g']
    b = df[df['bed_type']=='No-dig']['yield_g']

    print()
    print(f'The Dig yield mean is {a.mean()} g')
    print(f'The No-dig yield mean is {b.mean()} g')

    display(ttest_ind(a=a, b=b, equal_var=True, nan_policy='raise', permutations=None, alternative='two-sided', trim=0))

yield_analysis(df)

There are 18 samples.
Excluding 5 samples with nan yield.
13 samples remain.

The Dig yield mean is 90.21428571428571 g
The No-dig yield mean is 67.55 g


TtestResult(statistic=0.5179628986230862, pvalue=0.614741985581841, df=11.0)

## Linear regression

In [8]:
def drop_na(df):
    print(f'There are {len(df)} samples.')
    mask = df['yield_g'].isna()
    print(f'Excluding {sum(mask)} samples with nan yield.')
    df = df[mask==False]
    print(f'{len(df)} samples remain.')
    return df


def yield_linear_regression(df):
    df = drop_na(df)
    X = np.array(df[['no-dig', 'wall_distance_e_mm', 'wall_distance_s_mm']])
    y = np.array(df[['yield_g']])
    X = sm.add_constant(X)
    print(y)
    print(X)

    mod = sm.OLS(y, X)
    res = mod.fit()

    print(res.summary())
    
yield_linear_regression(df)

There are 18 samples.
Excluding 5 samples with nan yield.
13 samples remain.
[[151.9]
 [  6.5]
 [ 87.9]
 [ 59.2]
 [ 10.3]
 [ 31.2]
 [ 55.4]
 [ 34.1]
 [281.6]
 [ 65.5]
 [ 55.1]
 [156. ]
 [ 42.1]]
[[1.00e+00 0.00e+00 3.70e+03 6.00e+02]
 [1.00e+00 0.00e+00 3.55e+03 6.00e+02]
 [1.00e+00 0.00e+00 3.40e+03 6.00e+02]
 [1.00e+00 0.00e+00 3.25e+03 6.00e+02]
 [1.00e+00 0.00e+00 3.10e+03 6.00e+02]
 [1.00e+00 1.00e+00 2.80e+03 6.00e+02]
 [1.00e+00 1.00e+00 2.65e+03 6.00e+02]
 [1.00e+00 0.00e+00 3.40e+03 4.00e+02]
 [1.00e+00 0.00e+00 3.10e+03 4.00e+02]
 [1.00e+00 1.00e+00 2.80e+03 4.00e+02]
 [1.00e+00 1.00e+00 2.65e+03 4.00e+02]
 [1.00e+00 1.00e+00 2.50e+03 4.00e+02]
 [1.00e+00 1.00e+00 2.35e+03 4.00e+02]]
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.201
Model:                            OLS   Adj. R-squared:                 -0.066
Method:                 Least Squares   F-statistic:        

