## Title: Seed Count-data analysis of groundtruth data and SAM-generated feature extraction data
##### Author: Harpreet Kaur Bargota
##### Email: harpreet.bargota@agr.gc.ca
##### Date: March 17, 2025

In [1]:
# import the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')


In [2]:
# Load the seed-count tables for both SAM versions.
# The inputs are Excel workbooks ("Seed Count.xlsx"), not .csv files.
# NOTE(review): the SAM2.1 folder is spelled "ouput_FE_SAM2.1" - presumably the real directory name; confirm.
sam2_counts_path = r"C:\Users\bargotah\Downloads\FEpipeline_Scripts\FEpipeline_Results_March\ouput_FE_SAM2.1\Seed Count.xlsx"
sam1_counts_path = r"C:\Users\bargotah\Downloads\FEpipeline_Scripts\FEpipeline_Results_March\output_FE_SAM1.0\Seed Count.xlsx"
seed2 = pd.read_excel(sam2_counts_path)
seed1 = pd.read_excel(sam1_counts_path)

In [3]:
# Preview the first four rows of each table (SAM1.0 first, then SAM2.1)
for counts in (seed1, seed2):
    print(counts.head(4))

                    Class  Seed Count
0  Faba-Seed-CC_Vf196-1-2          11
1  Faba-Seed-CC_Vf127-1-2          11
2  Faba-Seed-CC_Vf309-3-1          10
3  Faba-Seed-CC_Vf308-1-1          10
                    Class  Seed Count
0  Faba-Seed-CC_Vf196-1-2          11
1  Faba-Seed-CC_Vf127-1-2          11
2  Faba-Seed-CC_Vf299-3-1          10
3  Faba-Seed-CC_Vf295-1-1          10


In [4]:
# Check the structure of the data (row count, column dtypes, non-null counts).
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
seed1.info()
seed2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Class       554 non-null    object
 1   Seed Count  554 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 8.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Class       554 non-null    object
 1   Seed Count  554 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 8.8+ KB
None


In [5]:
# Descriptive statistics of the seed counts (SAM1.0 first, then SAM2.1)
for counts in (seed1, seed2):
    print(counts.describe())

       Seed Count
count  554.000000
mean     7.648014
std      2.705016
min      1.000000
25%      5.000000
50%      9.000000
75%     10.000000
max     11.000000
       Seed Count
count  554.000000
mean     7.642599
std      2.701963
min      1.000000
25%      5.000000
50%      9.000000
75%     10.000000
max     11.000000


In [6]:
# Compare the row index of the SAM1.0 and SAM2.1 tables.
# NOTE(review): both frames carry a default RangeIndex, so this only confirms
# they have the same number of rows - it does not compare the 'Class' labels
# or their order.
same_index = seed1.index.equals(seed2.index)
same_index


True

## Comparison of seed count of groundtruth data with SAM2.1

In [7]:
# Load the ground-truth measurements (August data provided by Dr. Nicholas Larkan)
# from the "Mean S2 Seed Data" sheet and show the first five rows.
groundtruth_path = "C:/Users/bargotah/Downloads/Python Scripts/Faba Seed Analyzer Data August 2024-103.xlsx"
groundtruth_sheet = "Mean S2 Seed Data"
Aug_data = pd.read_excel(groundtruth_path, sheet_name=groundtruth_sheet)
Aug_data.head()

Unnamed: 0,ID,Main Seed,Main Seeds,Weight(g),TGW(g),Seeds/kg,"""B-Stocking""/Non-Seeds",Date,Time,ØArea,...,Max Width,Max Length,Volume(ml),Volume Weight(kg/hl),ØCircularity,Min Circularity,Max Circularity,ØL/W Ratio,Min L/W Ratio,Max L/W Ratio
0,Vf1-1-2,Vicia faba,4,4.8877,1221.925,818.380834,8,13/06/2023,14:38,219.500247,...,14.532204,21.09744,15,32.584667,1.168153,1.162423,1.17374,1.368314,1.291562,1.451772
1,Vf4-1-1,Vicia faba,6,1.9316,321.933333,3106.233175,7,13/06/2023,14:40,54.012175,...,8.224431,10.592581,15,12.877333,1.14516,1.126615,1.155499,1.238091,1.16978,1.29469
2,Vf7-1-1,Vicia faba,10,3.5681,356.81,2802.612034,9,13/06/2023,14:42,65.235907,...,8.462456,11.367768,15,23.787333,1.164687,1.135995,1.187734,1.351447,1.251236,1.429181
3,Vf8-1-1,Vicia faba,9,3.8984,433.155556,2308.639442,9,13/06/2023,14:43,70.955747,...,9.208415,12.489041,15,25.989333,1.161806,1.15172,1.179324,1.317772,1.24953,1.399287
4,Vf9-1-1,Vicia faba,10,4.5645,456.45,2190.820462,11,13/06/2023,14:45,77.492095,...,9.354255,13.39067,15,30.43,1.194822,1.167892,1.263545,1.460535,1.374786,1.620578


#### The groundtruth Aug datasheet contains the columns "ID" and "Main Seeds", which give the class ID and the number of seeds in each image, respectively.

In [8]:
# Extract a frame holding only the ID and Main Seeds columns.
# .copy() makes Org_Seeds an independent frame, so columns can be assigned on
# it later without writing into a flagged slice of Aug_data
# (SettingWithCopyWarning).
col_list = ['ID', 'Main Seeds']
Org_Seeds = Aug_data[col_list].copy()
Org_Seeds.head(2)

Unnamed: 0,ID,Main Seeds
0,Vf1-1-2,4
1,Vf4-1-1,6


#### For uniformity of the keys, add the string 'Faba-Seed-CC_' before each ID.



In [9]:
# Prefix every ground-truth ID with 'Faba-Seed-CC_' so it matches the 'Class'
# labels used in the SAM seed-count tables.
id_prefix = 'Faba-Seed-CC_'
# Work on an independent frame: assigning into a column subset of Aug_data
# raises pandas' SettingWithCopyWarning (hidden here by the global warnings filter).
Org_Seeds = Org_Seeds.copy()
# Leave IDs that already carry the prefix untouched, so re-running this cell
# cannot produce 'Faba-Seed-CC_Faba-Seed-CC_...' keys that no longer match
# anything in the merge below.
already_prefixed = Org_Seeds['ID'].str.startswith(id_prefix, na=False)
Org_Seeds['ID'] = Org_Seeds['ID'].where(already_prefixed, id_prefix + Org_Seeds['ID'])
Org_Seeds.head(4)

Unnamed: 0,ID,Main Seeds
0,Faba-Seed-CC_Vf1-1-2,4
1,Faba-Seed-CC_Vf4-1-1,6
2,Faba-Seed-CC_Vf7-1-1,10
3,Faba-Seed-CC_Vf8-1-1,9


#### Join the dfs seed2 (SAM2.1 data) and the groundtruth data using the merge function, which joins both dfs on their common key ("Class" in the SAM data, "ID" in the groundtruth data).

In [10]:
# Inner-join the SAM2.1 counts with the ground truth: 'Class' on the SAM side
# is matched against 'ID' on the ground-truth side.
merged_df = seed2.merge(Org_Seeds, how='inner', left_on='Class', right_on='ID')
merged_df.head(4)

Unnamed: 0,Class,Seed Count,ID,Main Seeds
0,Faba-Seed-CC_Vf196-1-2,11,Faba-Seed-CC_Vf196-1-2,11
1,Faba-Seed-CC_Vf127-1-2,11,Faba-Seed-CC_Vf127-1-2,11
2,Faba-Seed-CC_Vf299-3-1,10,Faba-Seed-CC_Vf299-3-1,10
3,Faba-Seed-CC_Vf295-1-1,10,Faba-Seed-CC_Vf295-1-1,10


#### The merged_df now holds the number of seeds from both the groundtruth data and the SAM-measured data. Let's check whether the two agree by subtracting one count from the other: if the difference is zero, the number of seeds is the same in the groundtruth data and the SAM-measured data.

In [11]:
# Difference between the SAM count and the ground-truth count per image
# (0 means both agree; negative means SAM found fewer seeds).
sam_count = merged_df['Seed Count']
true_count = merged_df['Main Seeds']
merged_df['Seeds_left'] = sam_count.sub(true_count)
merged_df.head(4)

Unnamed: 0,Class,Seed Count,ID,Main Seeds,Seeds_left
0,Faba-Seed-CC_Vf196-1-2,11,Faba-Seed-CC_Vf196-1-2,11,0
1,Faba-Seed-CC_Vf127-1-2,11,Faba-Seed-CC_Vf127-1-2,11,0
2,Faba-Seed-CC_Vf299-3-1,10,Faba-Seed-CC_Vf299-3-1,10,0
3,Faba-Seed-CC_Vf295-1-1,10,Faba-Seed-CC_Vf295-1-1,10,0


#### check for the class ID with unequal number of seeds.

In [12]:
# Keep only the images whose SAM count differs from the ground truth
has_mismatch = merged_df['Seeds_left'].ne(0)
filtered_df = merged_df.loc[has_mismatch]
filtered_df

Unnamed: 0,Class,Seed Count,ID,Main Seeds,Seeds_left
262,Faba-Seed-CC_Vf460-1-1,9,Faba-Seed-CC_Vf460-1-1,10,-1
263,Faba-Seed-CC_Vf456-1-1,9,Faba-Seed-CC_Vf456-1-1,10,-1
265,Faba-Seed-CC_Vf620-1-1,9,Faba-Seed-CC_Vf620-1-1,10,-1
266,Faba-Seed-CC_Vf619-1-1,9,Faba-Seed-CC_Vf619-1-1,10,-1
274,Faba-Seed-CC_Vf117-1-2,9,Faba-Seed-CC_Vf117-1-2,10,-1
276,Faba-Seed-CC_Vf198-1-2,9,Faba-Seed-CC_Vf198-1-2,10,-1
336,Faba-Seed-CC_Vf280-1-2,7,Faba-Seed-CC_Vf280-1-2,8,-1
345,Faba-Seed-CC_Vf615-1-1,7,Faba-Seed-CC_Vf615-1-1,10,-3
357,Faba-Seed-CC_Vf90-1-1,6,Faba-Seed-CC_Vf90-1-1,7,-1
360,Faba-Seed-CC_Vf139-1-1,6,Faba-Seed-CC_Vf139-1-1,5,1


### Success rate (%) for seed count for SAM2.1 generated data

In [13]:
# Success rate = share of compared images whose SAM2.1 seed count equals the
# ground-truth count.
# The denominator is the merged frame, not seed2: the mismatches are counted
# on the inner-joined data, so any image dropped by the join (no ground-truth
# row) must not be counted as a success.
n_compared = len(merged_df)
n_mismatched = len(filtered_df)
# Guard against an empty join so the cell reports NaN instead of dividing by zero.
Success_rate = 100 * (n_compared - n_mismatched) / n_compared if n_compared else float('nan')
print (f"The faba bean feature extraction pipeline leveraging SAM2.1 has {Success_rate}% success rate.")
              

The faba bean feature extraction pipeline leveraging SAM2.1 has 98.014440433213% success rate.


# Results: Pipeline with SAM2.1: 
## Errors in images -- Vf460, Vf117, Vf198, Vf619, Vf283, Vf90: no mask for 1 seed; Vf615: no masks for 3 seeds.
## Vf456 and Vf280 have bounding box coords for bbox w greater than 700.
## Vf139-small nonspecific part detected.
## Error in Vf620: Incorrect number of seeds in the Aug data (originally 9 in the image but incorrectly written as 10 in the Aug data in Excel).


In [14]:
# Visual separator between the SAM2.1 and SAM1.0 sections: the same rule printed twice
separator_rule = "-----------------------------------------------------------------------------------------------------------------------------------------------"
for _ in range(2):
    print(separator_rule)

-----------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------


## Comparison of seed count of groundtruth data with SAM1.0

In [15]:
# Inner-join the SAM1.0 counts with the ground truth: 'Class' on the SAM side
# is matched against 'ID' on the ground-truth side.
# Note: this rebinds merged_df, replacing the SAM2.1 comparison built earlier.
merged_df = seed1.merge(Org_Seeds, how='inner', left_on='Class', right_on='ID')
merged_df.head(4)

Unnamed: 0,Class,Seed Count,ID,Main Seeds
0,Faba-Seed-CC_Vf196-1-2,11,Faba-Seed-CC_Vf196-1-2,11
1,Faba-Seed-CC_Vf127-1-2,11,Faba-Seed-CC_Vf127-1-2,11
2,Faba-Seed-CC_Vf309-3-1,10,Faba-Seed-CC_Vf309-3-1,10
3,Faba-Seed-CC_Vf308-1-1,10,Faba-Seed-CC_Vf308-1-1,10


In [16]:
# Difference between the SAM count and the ground-truth count per image
# (0 means both agree; negative means SAM found fewer seeds).
sam_count = merged_df['Seed Count']
true_count = merged_df['Main Seeds']
merged_df['Seeds_left'] = sam_count.sub(true_count)
merged_df.head(4)

Unnamed: 0,Class,Seed Count,ID,Main Seeds,Seeds_left
0,Faba-Seed-CC_Vf196-1-2,11,Faba-Seed-CC_Vf196-1-2,11,0
1,Faba-Seed-CC_Vf127-1-2,11,Faba-Seed-CC_Vf127-1-2,11,0
2,Faba-Seed-CC_Vf309-3-1,10,Faba-Seed-CC_Vf309-3-1,10,0
3,Faba-Seed-CC_Vf308-1-1,10,Faba-Seed-CC_Vf308-1-1,10,0


In [17]:
# Keep only the images whose SAM count differs from the ground truth
has_mismatch = merged_df['Seeds_left'].ne(0)
filtered_df = merged_df.loc[has_mismatch]
filtered_df

Unnamed: 0,Class,Seed Count,ID,Main Seeds,Seeds_left
255,Faba-Seed-CC_Vf460-1-1,9,Faba-Seed-CC_Vf460-1-1,10,-1
258,Faba-Seed-CC_Vf620-1-1,9,Faba-Seed-CC_Vf620-1-1,10,-1
272,Faba-Seed-CC_Vf117-1-2,9,Faba-Seed-CC_Vf117-1-2,10,-1
277,Faba-Seed-CC_Vf11-1-1,9,Faba-Seed-CC_Vf11-1-1,10,-1
279,Faba-Seed-CC_Vf615-1-1,9,Faba-Seed-CC_Vf615-1-1,10,-1
280,Faba-Seed-CC_Vf619-1-1,9,Faba-Seed-CC_Vf619-1-1,10,-1
358,Faba-Seed-CC_Vf90-1-1,6,Faba-Seed-CC_Vf90-1-1,7,-1
368,Faba-Seed-CC_Vf283-3-1,6,Faba-Seed-CC_Vf283-3-1,7,-1


### Success rate (%) for seed count for SAM1.0 generated data

In [18]:
# Success rate = share of compared images whose SAM1.0 seed count equals the
# ground-truth count.
# The denominator is the merged frame, not seed1: the mismatches are counted
# on the inner-joined data, so any image dropped by the join (no ground-truth
# row) must not be counted as a success.
n_compared = len(merged_df)
n_mismatched = len(filtered_df)
# Guard against an empty join so the cell reports NaN instead of dividing by zero.
Success_rate = 100 * (n_compared - n_mismatched) / n_compared if n_compared else float('nan')
print (f"The faba bean feature extraction pipeline leveraging SAM1.0 has {Success_rate}% success rate.")

The faba bean feature extraction pipeline leveraging SAM1.0 has 98.55595667870037% success rate.


#### Pipeline with SAM1.0: Errors in 7 images (Vf615, Vf117, Vf11, Vf460, Vf619, Vf283, Vf90) due to a mask not being formed for 1 seed. Error in Vf620: Incorrect number of seeds in the Aug data (originally 9 in the image but incorrectly written as 10 in the Aug data in Excel). These errors can be reduced by fine-tuning SAM using different parameters.
