In [21]:
#MRF 16* 3.33

In [None]:
# Based on this explanation: after calculating HA, add a column named HAsqf that converts the HA value of each row into square feet (HAsqf = HA value * 10.7639). Then create a column named FinalHA and use this formula to calculate the final value for HA:

# FinalHA = HAsqf - (HAsqf * 0.25)
# Then create a new column named newlength and calculate it for each row:

# newlength = FinalHA / 3.33
# Finally, to calculate CorrectedYield, use this formula:
# CorrectedYield = if([PLOTWT] <= 0, -9, [PLOTWT] / 453.6 / (3.33 * newlength / 43560) * ((100 - [MOIST]) / (100 - 18)) / 100)

In [7]:
import pandas as pd
import os
import glob
import numpy as np

# 1. Setup Paths
# Only the uncommented configuration below is used by the rest of this cell;
# the commented ones are alternative datasets that can be swapped in by hand.
#---------------------------2021MRC--------------------------
# base_file_path = r"D:\MSU\2025\Evan\MRF2021\2116.xlsx"
# trials_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\2021_\excel"
# output_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\2021_\excel\Joined"


# #---------------------------2025Canada--------------------------
# base_file_path = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\EVAN\Canada2025trials.xlsx"
# trials_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\Canada25\excel"
# output_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\Canada25\excel\Joined"

# #---------------------------2024Canada--------------------------
base_file_path = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\EVAN\CanadaMagic24.xlsx"
trials_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\Canada24\excel"
output_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\Canada24\excel\Joined"

# Create the output folder (and any missing parents). exist_ok=True replaces the
# separate os.path.exists() check, so there is no gap between checking and creating.
os.makedirs(output_folder, exist_ok=True)

# 2. Read the Base File
base_df = pd.read_excel(base_file_path)
base_df.columns = base_df.columns.str.strip()

# Normalise 'Experiment Name' into a text ID: cast to string, keep only the part
# before the first '.', trim whitespace and left-pad with zeros to 4 characters.
experiment_ids = base_df['Experiment Name'].astype(str)
experiment_ids = experiment_ids.str.split('.').str[0]
experiment_ids = experiment_ids.str.strip()
base_df['Experiment Name'] = experiment_ids.str.zfill(4)

# Columns kept in the exported files, in output order (includes the derived ones)
columns_to_keep = [
    "PlotID",
    "Join_Count",
    "ENTRY",
    "REP",
    "IBLK",
    "Rng",
    "Pas",
    "Name",
    "Pedigree",
    "Experiment Name",
    "Origin",
    "Year",
    "Location",
    "PLOTWT",
    "MOIST",
    "CWT_A",
    "HA",
    "SHAPE_Area",
    "HAsqf",
    "FinalHA",
    "newlength",
    "CorrectedYield",
]

# 3. Constants of the yield formula, named so that each factor can be audited.
HA_TO_SQFT = 10.7639      # sq ft per square metre, so HA is presumably in m² (TODO confirm)
HA_TRIM_FRACTION = 0.25   # FinalHA = HAsqf - (HAsqf * 0.25); the code previously used 0.15,
                          # contradicting both the formula notes and its own "minus 25%" comment
ROW_WIDTH = 3.33          # width factor of the formula (presumably feet; TODO confirm)
GRAMS_PER_POUND = 453.6   # divisor applied to PLOTWT (presumably grams; TODO confirm)
SQFT_PER_ACRE = 43560
BASE_MOISTURE = 18        # moisture percentage the yield is standardised to
MISSING = -9              # sentinel written when PLOTWT is not usable
REQUIRED_COLUMNS = ("HA", "PLOTWT", "MOIST")

# List to store all dataframes for the final merge
all_trials_list = []

# 4. Search for .xls trial files (sorted so the processing order is reproducible)
search_path = os.path.join(trials_folder, "*.xls")
trial_files = sorted(glob.glob(search_path))

print(f"Found {len(trial_files)} .xls trial files.")

for file_path in trial_files:
    file_name = os.path.basename(file_path)
    if file_name.startswith('~$'):
        continue  # file names starting with '~$' are ignored

    # The trial ID is taken from the last five characters of the file name
    name_only = os.path.splitext(file_name)[0]
    trial_id = name_only[-5:].strip()

    filtered_base = base_df[base_df['Experiment Name'] == trial_id].copy()

    if filtered_base.empty:
        print(f"Skipping {file_name}: No match for ID {trial_id} in Base File.")
        continue

    # Load Trial file
    trial_df = pd.read_excel(file_path)
    trial_df.columns = trial_df.columns.str.strip()

    # Compare PlotID as trimmed text on both sides so the join keys line up
    trial_df['PlotID'] = trial_df['PlotID'].astype(str).str.strip()
    filtered_base['PlotID'] = filtered_base['PlotID'].astype(str).str.strip()

    # 5. Join (inner merge; duplicated trial-side column names get a '_drop' suffix)
    merged_df = pd.merge(filtered_base, trial_df, on="PlotID", suffixes=('', '_drop'))

    if merged_df.empty:
        print(f"Skipping {file_name}: No matching PlotIDs found.")
        continue

    # 6. Cleaning: Remove rows where CWT_A is -9
    if 'CWT_A' in merged_df.columns:
        # .copy() so the column assignments below write to an independent frame
        # instead of a filtered view (avoids SettingWithCopyWarning)
        merged_df = merged_df[merged_df['CWT_A'] != MISSING].copy()

    # Report missing inputs by name instead of surfacing a bare KeyError
    missing_cols = [c for c in REQUIRED_COLUMNS if c not in merged_df.columns]
    if missing_cols:
        print(f"   Error calculating yield for {file_name}: missing column(s) {missing_cols}")
        continue

    # 7. Calculations for HA and Corrected Yield
    try:
        # HAsqf: Convert HA into square feet
        merged_df['HAsqf'] = merged_df['HA'] * HA_TO_SQFT

        # FinalHA: HAsqf minus 25%
        merged_df['FinalHA'] = merged_df['HAsqf'] - (merged_df['HAsqf'] * HA_TRIM_FRACTION)

        # newlength: FinalHA / 3.33
        merged_df['newlength'] = merged_df['FinalHA'] / ROW_WIDTH

        # CorrectedYield: Conditional logic based on PLOTWT
        # If PLOTWT <= 0 then -9, else run the yield formula
        merged_df['CorrectedYield'] = np.where(
            merged_df['PLOTWT'] <= 0,
            MISSING,
            (merged_df['PLOTWT'] / GRAMS_PER_POUND) /
            (ROW_WIDTH * merged_df['newlength'] / SQFT_PER_ACRE)
            * ((100 - merged_df['MOIST']) / (100 - BASE_MOISTURE)) / 100
        )

        # 8. Refine columns (only those actually present in the merged frame)
        existing_cols = [c for c in columns_to_keep if c in merged_df.columns]
        final_df = merged_df[existing_cols].copy()

        # Add to our list for the master file
        all_trials_list.append(final_df)

        # 9. Save individual file
        new_file_name = name_only + ".xlsx"
        output_path = os.path.join(output_folder, new_file_name)
        final_df.to_excel(output_path, index=False, engine='openpyxl')
        print(f"Done: {new_file_name}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next trial file
        print(f"   Error calculating yield for {file_name}: {e}")

# --- MASTER MERGE ---
# Stack every processed trial into one workbook; if nothing was processed,
# report that and write no file.
if not all_trials_list:
    print("\nNo data was processed, Master file not created.")
else:
    print("\nCreating Master Combined File...")
    master_output_path = os.path.join(output_folder, "All_Trials_Combined.xlsx")
    master_df = pd.concat(all_trials_list, ignore_index=True)
    master_df.to_excel(master_output_path, index=False, engine='openpyxl')
    print(f"⭐ Master file saved successfully: {master_output_path}")

print("\nProcessing complete.")

Found 1 .xls trial files.
Done: N24116.xlsx

Creating Master Combined File...
⭐ Master file saved successfully: C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\Canada24\excel\Joined\All_Trials_Combined.xlsx

Processing complete.


In [8]:
import pandas as pd
import os
import glob
import numpy as np

# 1. Setup MRC2021
# Only the uncommented configuration at the bottom is used by the rest of this
# cell; the commented ones are alternative datasets that can be swapped in by hand.
# base_file_path = r"D:\MSU\2025\Evan\MRF2021\2115.xlsx"
# trials_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\2021_\excel"
# output_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\2021_\excel\Joined"

# # #---------------------------2022MRC--------------------------
# base_file_path = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\EVAN\MRF2022Trials.xlsx"
# trials_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\MRC2022_\EXCEL"
# output_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\MRC2022_\EXCEL\Joined"

#---------------------------2024 sevrec--------------------------
# base_file_path = r"C:/Users/bazrafka/Desktop/counting/DiscussionPaperData/EVAN/2024SVRECTrialsrawdata.xlsx"
# trials_folder = r"C:/Users/bazrafka/Desktop/counting/DiscussionPaperData/Outputs/2024_/excell"
# output_folder = r"C:/Users/bazrafka/Desktop/counting/DiscussionPaperData/Outputs/2024_/excell/Joined"


#---------------------------2022sevrec--------------------------
# NOTE(review): the output path below says "excell" while the trials folder says
# "excel" - confirm which spelling is intended before re-enabling this configuration.
# base_file_path = r"C:/Users/bazrafka/Desktop/counting/DiscussionPaperData/EVAN/2022SEVREC.xlsx"
# trials_folder = r"C:/Users/bazrafka/Desktop/counting/DiscussionPaperData/Outputs/SEVREC2022_/excel"
# output_folder = r"C:/Users/bazrafka/Desktop/counting/DiscussionPaperData/Outputs\SEVREC2022_/excell/Joined"

#---------------------------2025sevrec--------------------------
# base_file_path = r"C:/Users/bazrafka/Desktop/counting/DiscussionPaperData/EVAN/2025trials.xlsx"
# trials_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\sevrec25_\excel"
# output_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\sevrec25_\excel\Joined"

# #---------------------------2025Canada--------------------------
# base_file_path = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\EVAN\Canada2025trials.xlsx"
# trials_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\Canada25\excel"
# output_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\Canada25\excel\Joined"

# #---------------------------2024Canada--------------------------
base_file_path = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\EVAN\CanadaMagic24.xlsx"
trials_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\Canada24\excel"
output_folder = r"C:\Users\bazrafka\Desktop\counting\DiscussionPaperData\Outputs\Canada24\excel\Joined"

# Create the output folder (and any missing parents). exist_ok=True replaces the
# separate os.path.exists() check, so there is no gap between checking and creating.
os.makedirs(output_folder, exist_ok=True)

# 2. Read the Base File
base_df = pd.read_excel(base_file_path)
base_df.columns = base_df.columns.str.strip()

# Normalise 'Experiment Name' into a text ID: cast to string, keep only the part
# before the first '.', trim whitespace and left-pad with zeros to 4 characters.
experiment_ids = base_df['Experiment Name'].astype(str)
experiment_ids = experiment_ids.str.split('.').str[0]
experiment_ids = experiment_ids.str.strip()
base_df['Experiment Name'] = experiment_ids.str.zfill(4)

# Columns kept in the exported files, in output order
columns_to_keep = [
    "PlotID",
    "Join_Count",
    "ENTRY",
    "REP",
    "IBLK",
    "Rng",
    "Pas",
    "Name",
    "Pedigree",
    "Experiment Name",
    "Origin",
    "Year",
    "Location",
    "PLOTWT",
    "MOIST",
    "CWT_A",
    "HA",
    "HAsqf",
    "CorrectedYield",
]

# Constants of the yield formula, named so that each factor can be audited.
HA_TO_SQFT = 10.7639      # sq ft per square metre, so HA is presumably in m² (TODO confirm)
GRAMS_PER_POUND = 453.6   # divisor applied to PLOTWT (presumably grams; TODO confirm)
SQFT_PER_ACRE = 43560
BASE_MOISTURE = 18        # moisture percentage the yield is standardised to
MISSING = -9              # sentinel written when PLOTWT is not usable
REQUIRED_COLUMNS = ("HA", "PLOTWT", "MOIST")

all_trials_list = []
search_path = os.path.join(trials_folder, "*.xls")
# Sorted so the processing order (and the master file row order) is reproducible
trial_files = sorted(glob.glob(search_path))

print(f"Found {len(trial_files)} .xls trial files.")

for file_path in trial_files:
    file_name = os.path.basename(file_path)
    if file_name.startswith('~$'):
        continue  # file names starting with '~$' are ignored

    # The trial ID is taken from the last five characters of the file name
    name_only = os.path.splitext(file_name)[0]
    trial_id = name_only[-5:].strip()

    filtered_base = base_df[base_df['Experiment Name'] == trial_id].copy()
    if filtered_base.empty:
        # Report the skip instead of dropping the file silently
        print(f"Skipping {file_name}: No match for ID {trial_id} in Base File.")
        continue

    trial_df = pd.read_excel(file_path)
    trial_df.columns = trial_df.columns.str.strip()
    # Compare PlotID as trimmed text on both sides so the join keys line up
    trial_df['PlotID'] = trial_df['PlotID'].astype(str).str.strip()
    filtered_base['PlotID'] = filtered_base['PlotID'].astype(str).str.strip()

    # Inner merge; duplicated trial-side column names get a '_drop' suffix
    merged_df = pd.merge(filtered_base, trial_df, on="PlotID", suffixes=('', '_drop'))
    if merged_df.empty:
        print(f"Skipping {file_name}: No matching PlotIDs found.")
        continue

    if 'CWT_A' in merged_df.columns:
        # .copy() so the column assignments below write to an independent frame
        # instead of a filtered view (avoids SettingWithCopyWarning)
        merged_df = merged_df[merged_df['CWT_A'] != MISSING].copy()

    # Report missing inputs by name instead of surfacing a bare KeyError
    missing_cols = [c for c in REQUIRED_COLUMNS if c not in merged_df.columns]
    if missing_cols:
        print(f"   Error in {file_name}: missing column(s) {missing_cols}")
        continue

    # --- UPDATED CALCULATION STRATEGY ---
    try:
        # 1. Convert HA to square feet.
        # 10.7639 is the square-metre -> square-foot factor (one hectare would be
        # 107,639.1 sq ft), so HA is presumably stored in square metres, not
        # hectares - TODO confirm the unit of the HA column.
        merged_df['HAsqf'] = merged_df['HA'] * HA_TO_SQFT

        # 2. Calculate CorrectedYield
        # Replacing (3.33 * P_FT) with HAsqf
        merged_df['CorrectedYield'] = np.where(
            merged_df['PLOTWT'] <= 0,
            MISSING,
            ((merged_df['PLOTWT'] / GRAMS_PER_POUND) / (merged_df['HAsqf'] / SQFT_PER_ACRE))
            * ((100 - merged_df['MOIST']) / (100 - BASE_MOISTURE)) / 100
        )

        # Cleanup and Save (only columns actually present in the merged frame)
        existing_cols = [c for c in columns_to_keep if c in merged_df.columns]
        final_df = merged_df[existing_cols].copy()
        all_trials_list.append(final_df)

        # NOTE(review): when this cell and the previous one point at the same
        # output_folder, this write replaces the per-trial file produced there.
        output_path = os.path.join(output_folder, name_only + ".xlsx")
        final_df.to_excel(output_path, index=False)
        print(f"Done: {name_only}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next trial file
        print(f"   Error in {file_name}: {e}")

# Master Merge
# Stack every processed trial into a single workbook; nothing is written when
# no trial was processed.
if all_trials_list:
    master_output_path = os.path.join(output_folder, "All_Trials_Combined.xlsx")
    master_df = pd.concat(all_trials_list, ignore_index=True)
    master_df.to_excel(master_output_path, index=False)
    print("\n⭐ Master file saved successfully.")

print("Processing complete.")

Found 1 .xls trial files.
Done: N24116

⭐ Master file saved successfully.
Processing complete.
