# CBEE 213 Midterm Group Project GROUP COMPONENT  
Samuel Perkins  
Rachel Strelow  
Trey Stephens-Cherry  

In [17]:
import numpy as np
from matplotlib import pyplot as plt
import scipy.stats as st
import pandas as pd
from scipy.stats import t
import os
df = pd.read_excel("Biocharcoals and Soil Nitrates_SIMPLIFIED.xlsx")

### Part 5: Comparing Confidence intervals

Here is the confidence intervals calculated in the individual components:

In [13]:
# Locating file path; note that your data file must be in the same directory as this file
def path_finder(file_name):
    
    directory = os.path.dirname('__file__')
    file_directory = os.path.join(directory, file_name)
    
    # print(f'Found directory: {file_directory}')
    
    return file_directory


# Reading the found file
found_data = pd.read_excel("Biocharcoals and Soil Nitrates_SIMPLIFIED.xlsx")


# This function creates a dictionary that groups the data by Soil and Biochar
# It works by iterating through all rows in the DataFrame, and creating a key
# based on the row contents.
def data_sorter(data):
    
    grouped_data = {}
    
    for _, row in data.iterrows():
        
        soil_type = row['Soil']
        biochar_content = row['Biochar']
        
        key = (soil_type, biochar_content)
    
        if key not in grouped_data:
            grouped_data[key] = []
            
        grouped_data[key].append(row.to_dict())
        
    return grouped_data


# Finds the relevant statistics data, mean/median/standard dev/... etc
def data_analyzer(data):
    
    warnings.filterwarnings('ignore', category=RuntimeWarning)
    
    analysis_results = {}
    
    for key, rows in data.items():
        
        soil_type, biochar_content = key
        key = (soil_type, biochar_content if not pd.isna(biochar_content) else 'Unknown')

        
        df = pd.DataFrame(rows)
        
        stats = {
            'Average FractionCO2': df['FractionCO2'].mean(skipna=True),
            'Median FractionCO2': df['FractionCO2'].median(skipna=True),
            'Std FractionCO2': df['FractionCO2'].std(skipna=True),
            'Average Grav': df['Grav'].mean(skipna=True),
            'Median Grav': df['Grav'].median(skipna=True),
            'Std Grav': df['Grav'].std(skipna=True),
            'Sample Size': len(df)
        }
        
        stats = {k: float(v) if isinstance(v, np.float64) else v for k, v in stats.items()}
        
        analysis_results[key] = stats
        
    return analysis_results


# Formats the analyzed data into a cleaner output that is more legible
def data_formatter(data):
    
    formatted_data = pd.DataFrame.from_dict(data, orient='index')

    formatted_data.reset_index(inplace=True)

    if 'level_0' in formatted_data.columns and 'level_1' in formatted_data.columns:
        formatted_data['Group'] = formatted_data['level_0'].astype(str) + "-" + formatted_data['level_1'].astype(str)
        formatted_data.drop(['level_0', 'level_1'], axis=1, inplace=True)
    else:
        print("Expected 'level_0' and 'level_1' columns not found. Current columns:", formatted_data.columns)
   
    return formatted_data


# Calculates the confidence interval for a given column
def calculate_confidence_interval(data, column, confidence=0.95):
    
    if column not in data.columns:
        raise ValueError(f"Column '{column}' not found in the dataset.")

    mean = data[column].mean()
    std_dev = data[column].std()
    n = len(data[column])

    if n < 2:
        raise ValueError('Not enough data points to calculate a confidence interval.')
    
    t_critical = t.ppf((1 + confidence) / 2, df=n - 1)

    margin_of_error = t_critical * (std_dev / (n ** 0.5))

    lower_bound = float(mean - margin_of_error)
    upper_bound = float(mean + margin_of_error)

    return lower_bound, upper_bound


# Calculates the confidence interval for a single replicate of a specific treatment
def single_replicate_confidence_interval(data, soil, biochar, 
                                         replicate, column, 
                                         confidence=0.95):
    
    filtered_data = data[
        (data['Soil'] == soil) &
        (data['Biochar'] == biochar) &
        (data['Replicate'] == replicate)
    ]

    return calculate_confidence_interval(filtered_data, column, confidence)


# Confidence interval for all replicates
def combined_replicates_confidence_interval(data, soil, 
                                            biochar, column, 
                                            confidence=0.95):

    filtered_data = data[
        (data['Soil'] == soil) &
        (data['Biochar'] == biochar)
    ]

    return calculate_confidence_interval(filtered_data, column, confidence)


# Function to calculate the probability of samples being within the confidence interval
def test_samples_within_confidence_interval(data, column, soil, 
                                            biochar, confidence=0.95,
                                            sample_size=None, 
                                            iterations=None):
 
    filtered_data = data[
                (data['Soil'] == soil) &
                (data['Biochar'] == biochar)
                ]
    
    if column not in filtered_data.columns:
        raise ValueError(f"Column '{column}' not found in the dataset.")

    lower_bound, upper_bound = calculate_confidence_interval(filtered_data, column, confidence)

    within_interval_count = 0

    for _ in range(iterations):
        sample = filtered_data[column].dropna().sample(n=sample_size, replace=True)

        sample_mean = sample.mean()

        if lower_bound <= sample_mean <= upper_bound:
            within_interval_count += 1

    probability = within_interval_count / iterations

    return probability

In [34]:
soil_of_interest = "Chehalis"
biochar_amount_of_interest = 350
i = ["Temp", "DeltaCO2", "FractionCO2", "Grav", "Nitrate"]
for data_point_of_interest in i:
    print(f"Confidence interval analysis for {data_point_of_interest}")
    for R in [1, 2, 3]:
        try:
            biochar_amount_of_interest = float(biochar_amount_of_interest)
        except ValueError:
            print('Error, biochar must be a number')
            exit()
        
        if data_point_of_interest not in found_data.columns:
            print(f"Error: Data point '{data_point_of_interest}' not found in the dataset.")
            exit()
        
        try:
            ci_single = single_replicate_confidence_interval( 
                data=found_data, soil=soil_of_interest, biochar=biochar_amount_of_interest,
                replicate=R, column=data_point_of_interest)
        
            # Gets the confidence interval for all replicates
            ci_combined = combined_replicates_confidence_interval(
                data=found_data, soil=soil_of_interest,
                biochar=biochar_amount_of_interest, column=data_point_of_interest)
        
            # Finding the probability that the sample average lies within the confidence interval
            probability = test_samples_within_confidence_interval(
                data=found_data, column=data_point_of_interest, soil=soil_of_interest,
                biochar=biochar_amount_of_interest, confidence=0.95,
                sample_size=5, iterations=10000
            )
        
            #print("\n--- Confidence Interval Results ---")
            print(f"95% Confidence Interval for Replicate {R} of {soil_of_interest}-{biochar_amount_of_interest}: {np.round(ci_single, 4)}")
            print(f"Probability of a sample being within the confidence interval: {probability:.2f}%")
            print("-")
        except Exception as e:
            print(f'Error: {e}')
    print(f"95% Confidence Interval for All Replicates Combined: {np.round(ci_combined, 2)}")
    print("-------------------------------------------------------------")

Confidence interval analysis for Temp
95% Confidence Interval for Replicate 1 of Chehalis-350.0: [24.7848 25.4518]
Probability of a sample being within the confidence interval: 0.52%
-
95% Confidence Interval for Replicate 2 of Chehalis-350.0: [24.7848 25.4518]
Probability of a sample being within the confidence interval: 0.52%
-
95% Confidence Interval for Replicate 3 of Chehalis-350.0: [24.7848 25.4518]
Probability of a sample being within the confidence interval: 0.52%
-
95% Confidence Interval for All Replicates Combined: [24.94 25.29]
-------------------------------------------------------------
Confidence interval analysis for DeltaCO2
95% Confidence Interval for Replicate 1 of Chehalis-350.0: [0.0065 0.0129]
Probability of a sample being within the confidence interval: 0.53%
-
95% Confidence Interval for Replicate 2 of Chehalis-350.0: [0.0067 0.014 ]
Probability of a sample being within the confidence interval: 0.53%
-
95% Confidence Interval for Replicate 3 of Chehalis-350.0: [

1. What do these confidence intervals calculated in 3a. tell you?  
They tell us the interval at which a random sample's mean will be included in 95% of the time. 

2. In a markdown cell, explain broadly how each group member’s single-replicate confidence intervals within the “Chehalis.500” treatment may have differed. How did the single-replicates confidence intervals broadly compare to the all-replicates confidence intervals? Which would you use when communicating these data?  
For most categories, each replicate's CIs are pretty close together, and their differences are probably due to the natrual variability within the data. The single-replicate CIs are also similar to the all-replicate CIs, although the all-replicate CIs seem to be slightly wider. When communicating data, the all-replicate CIs would be used since they communicate the most well-rounded and complete data. 

3. Count how many measurements fall outside of each confidence inteval?  
   The probability measurements in the output above gives the answer for this. 

In [11]:
# Count how many of the values fall outside of the confidence intervals

# ~~~~~ NOTE: I do not know wether this section refers to just the overall measurements or the measurements for each replicate ~~~~~

# Replicate 1
# Index to isolate the Replicate 1 of Chehalis.500 data
idx = ( (df["Treatment"] == "Chehalis.500") & (df["Replicate"] == 1) )
# This loops through each type of measurement and prints a summary about the data's relation to the CIs
for i in R_key:
    CI_index = R_key.index(i)
    # NaN values always return False when comparing to a number, so by checking for numbers
    # below the low CI and above the high CI, false positives from the NaN cells can be avoided
    below_CI = sum(df[idx][i] < R1_CI_low[CI_index])
    above_CI = sum(df[idx][i] > R1_CI_high[CI_index])
    outside_CI = below_CI + above_CI
    value_count = df[idx][i].count() # this counts the number of non-NaN values in the set
    print(f"Out of {value_count} total values for {i}, there are {outside_CI} values that are outside the confidence interval.")


# Replicate 2



# Replicate 3



# Overall

Out of 12 total values for Temp, there are 7 values that are outside the confidence interval.
Out of 12 total values for DeltaCO2, there are 7 values that are outside the confidence interval.
Out of 12 total values for FractionCO2, there are 6 values that are outside the confidence interval.
Out of 10 total values for Grav, there are 1 values that are outside the confidence interval.
Out of 10 total values for Nitrate, there are 5 values that are outside the confidence interval.


### Part 6
1. In a markdown cell, write your relationship prediction in sentence form  
   Hypothesis one: The Willamette-350 treatment will have grater Fractional CO2 than the Chehalis-350 treatment.  
   Hypothesis two: The Willamette-500 treatment will have a differnent Nitrate content Willamette-700 treatment.  
   Hypothesis three: The Chehalis-350 treatment will have less Nitrate than the Chehalis-500 treatment.
   
2. In a markdown cell, write the mathematical hypotheses for each prediction  
   Hypothesis 1:  H₀: μ₁=μ₂ ; Hₐ: μ₁>μ₂  
   Hypothesis 2:  H₀: μ₁=μ₂ ; Hₐ: μ₁≠μ₂  
   Hypothesis 3:  H₀: μ₁=μ₂ ; Hₐ: μ₁<μ₂

### Part 7
1. Choose a level of uncertainty (alpha), calculate the p-value (show your work), and compare your p-value to your alpha  
   We will be testing hypothesis 2. alpha = 0.05 (the p value will be doubled to compare against this.    

In [25]:
# function to calculate p value
def two_sample_t_test(sample_1,sample_2):
    """
    This function performs compares the means of two
    independent samples using a two-tailed t-test.
    
    It takes two samples as input.
    """
    
    # calculate the mean of each sample
    x_bar1 = np.mean(sample_1)
    x_bar2 = np.mean(sample_2)
    
    # calculate the standard deviation of each 
    # sample using (n-1) in the denominator
    s1 = np.std(sample_1, ddof=1)
    s2 = np.std(sample_2, ddof=1)
    
    # get number of measurements in each sample
    n1 = len(sample_1)
    n2 = len(sample_2)
    
    # calculate the t-statistic
    t = -1*abs( (x_bar1-x_bar2) / np.sqrt( ((s1**2)/n1) + ((s2**2)/n2) ) )
    
    # calculate degree of freedom
    nu = n1+n2-2    
    
    # choose a method from above for calculating the t-statistic and p-value
    # method 4
    p = 2*st.t.cdf(t, nu)
    
    print("p-value: {}".format(p))
    
    return t,p

In [28]:
# create the indexes for each sample
idx_1 = df["Treatment"] == "Willamette.500"
idx_2 = df["Treatment"] == "Willamette.700"

sample_1 = df[idx_1]["Nitrate"]
sample_2 = df[idx_2]["Nitrate"]
t,p = two_sample_t_test(sample_1, sample_2)

p-value: 0.9202684418205275


Based on this high p value, we fail to reject the null hypothesis (0.92>0.05). 