Analysis of image factors on annotation consensus-- Zooniverse
Start date: 09/26/2023

LIBRARIES

In [1]:
#Imports
import pandas as pd
from PIL import Image
import os
import ast
import numpy as np
import cv2
import pandas as pd
from skimage.feature import graycomatrix, graycoprops
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

LOAD DATA

In [None]:
#Analysis annotations
path = "E:\\imagefactors\\data\\consensusLabels_agreementIndex.csv"
with open(path) as f:
  df = pd.read_csv(f)

#Fixing how bounding boxes are read for the analysis labels
def eval_bbox_refined(row):
    if pd.notnull(row['consensus_bbox']):
        return ast.literal_eval(row['consensus_bbox'])
    else:
        return None
# Apply the function to the 'bbox_refined' column and save the results
df['consensus_bbox'] = df.apply(eval_bbox_refined, axis=1)

#Creating a base file column to match tiles to full images later
df["basefile"] = [x[:-10] for x in df['filename']]

df.head()

In [None]:
#IF RESUMING FROM A SAVED POINT
path = "E:/imagefactors/data/crowdsourced_imagefactors.csv"
with open(path) as f:
  df = pd.read_csv(f)
#Fixing how bounding boxes are read for the analysis labels
def eval_bbox_refined(row):
    if pd.notnull(row['consensus_bbox']):
        return ast.literal_eval(row['consensus_bbox'])
    else:
        return None
# Apply the function to the 'bbox_refined' column and save the results
df['consensus_bbox'] = df.apply(eval_bbox_refined, axis=1)

DERIVE IMAGE/ANNOTATION FACTORS FOR ANALYSIS

In [None]:
#BBOX AREA

def calc_area(row):
    bbox = row['consensus_bbox']
    xmin, ymin, w, h = bbox
    return w * h

df['area'] = df.apply(calc_area, axis=1)

In [None]:
# % AREA BBOX
# Percent area of the bounding box of the total image area

# Define a function to calculate percentage area
def calculate_percentage_area(image_filename, bbox_area):
    image_path = os.path.join("E:\\imagefactors\\data\\zooniverse", image_filename)
    
    try:
        image = Image.open(image_path)
    except FileNotFoundError:
        # Handle the case where the image is not found
        print(f"Image not found: {image_path}")
        return None  # You can return a special value, such as None, to indicate the image wasn't found
    
    image_width, image_height = image.size
    image_area = image_width * image_height

    percentage_area = (bbox_area / image_area) * 100
    return percentage_area

# Calculate percentage area and add it as a new
df['bbox_percent_area'] = df.apply(lambda row: calculate_percentage_area(row['filename'], row['area']), axis=1)

In [None]:
# SAME CLASS %
# % of targets of the same class as the analysis target (in the same image)

# Define a function to calculate the percentage of same-class neighbors for a given row
def calculate_same_class_percentage(row, df):
    # Get the filename and class ID of the target bounding box
    filename = row['filename']
    class_id = row['consensus_class_ID']
    
    # Filter the DataFrame to include only rows with matching filenames
    matching_rows = df[df['filename'] == filename]
    
    # Calculate the total number of neighbors in the same image
    total_neighbors = len(matching_rows) - 1  # Subtract 1 to exclude the target bounding box
    
    if total_neighbors == 0:
        return 0  # Avoid division by zero
    
    # Calculate the number of same-class neighbors
    same_class_neighbors = len(matching_rows[matching_rows['consensus_class_ID'] == class_id]) - 1  # Subtract 1 to exclude the target bounding box
    
    # Calculate the percentage of same-class neighbors
    same_class_percentage = (same_class_neighbors / total_neighbors) * 100
    
    return same_class_percentage

# Calculate the same-class percentage for each row and add the results as a new column
df['same_class_percent'] = df.apply(lambda row: calculate_same_class_percentage(row, df), axis=1)

In [None]:
#NUMBER OF NEIGHBORS 
# Number of annotations within 2x maximum of bbox width or height (to account for positional differences)

# Define a function to calculate the number of neighbors for a given row
def count_neighbors(row, df):
    # Extract 'bbox' values from the 'consensus_bbox' column as a list [xmin, ymin, width, height]
    bbox = row['consensus_bbox']  # Use ast.literal_eval() to safely evaluate the string
    
    # Define the search radius as 2 times the maximum of width and height
    search_radius = 2 * max(bbox[2], bbox[3])
    
    # Calculate the center coordinates of the bounding box
    x_center = bbox[0] + bbox[2] / 2
    y_center = bbox[1] + bbox[3] / 2
    
    # Initialize a count for neighbors
    num_neighbors = 0
    
    # Iterate through rows with matching filenames
    matching_rows = df[df['filename'] == row['filename']]
    
    for _, neighbor_row in matching_rows.iterrows():
        if neighbor_row.name != row.name:
            # Extract 'bbox' values for the neighbor as a list [xmin, ymin, width, height]
            neighbor_bbox = neighbor_row['consensus_bbox']
            
            # Calculate the center coordinates of the potential neighbor
            neighbor_x_center = neighbor_bbox[0] + neighbor_bbox[2] / 2
            neighbor_y_center = neighbor_bbox[1] + neighbor_bbox[3] / 2
            
            # Calculate the Euclidean distance between centers
            distance = np.sqrt((x_center - neighbor_x_center)**2 + (y_center - neighbor_y_center)**2)
            
            # Check if the neighbor is within the search radius
            if distance <= search_radius:
                num_neighbors += 1
    
    return num_neighbors

# Calculate the number of neighbors for each row and add the results as a new column
df['num_neighbors'] = df.apply(lambda row: count_neighbors(row, df), axis=1)

In [None]:
#TOTAL NUMBER OF BIRDS PER IMAGE
df['density'] = df.groupby('filename')['consensus_bbox'].transform('count')

In [None]:
#DISTANCE OF TARGET FROM IMAGE CENTER-- in meters

path1 = "E:\\imagefactors\\data\\crowdsourced_gsd.csv"
with open(path1) as f1:
  gsd_df = pd.read_csv(f1)

gsd_df["basefile"] = gsd_df["filename"].apply(lambda x: os.path.splitext(x)[0])

merged_df = pd.merge(df, gsd_df, on="basefile", how="left")
merged_df = merged_df.rename(columns={"filename_x": "filename"})
merged_df = merged_df.drop(columns=["filename_y", "filename_base"])

# Function to calculate distance from center
def calculate_distance_from_center(row):
    image_path = os.path.join("E:\\imagefactors\\data\\zooniverse", row["filename"])
    
    try:
        image = Image.open(image_path)
    except FileNotFoundError:
        # Handle the case where the image is not found
        print(f"Image not found: {image_path}")
        return None  # You can return a special value, such as None, to indicate the image wasn't found
    
    image_width, image_height = image.size
    center_x_px = image_width/2 
    center_y_px = image_height/2
    gsd_m = row['gsd'] / 100

    row['center_x_m'] = center_x_px * gsd_m
    row['center_y_m'] = center_y_px * gsd_m
    
    # Get the coordinates of the bounding box (x, y, width, height)
    x, y, width, height = row['consensus_bbox']

    # Calculate the center point of the bounding box in pixels
    bbox_center_x_px = x + (width / 2)
    bbox_center_y_px = y + (height / 2)

    # Calculate the center point of the bounding box in meters
    bbox_center_x_m = bbox_center_x_px * gsd_m
    bbox_center_y_m = bbox_center_y_px * gsd_m

    # Calculate the distance from the center of the image in meters
    distance_m = ((row['center_x_m'] - bbox_center_x_m)**2 + (row['center_y_m'] - bbox_center_y_m)**2)**0.5

    return distance_m

# Apply the function to the merged dataframe
merged_df['distance_from_center'] = merged_df.apply(calculate_distance_from_center, axis=1)
df = merged_df

In [50]:
#TEXTURE METRICS- GLCM
#Bounding box and "donut" (buffer region directly around bbox)

def calculate_gclm_derivatives(image, bbox):
    # Convert bounding box coordinates to integers
    x, y, width, height = map(int, bbox)
    
    # Extract the region of interest (ROI) from the image using the bounding box
    roi = image[y:y+height, x:x+width]
    
    # Check if the ROI is empty or None
    if roi is None or roi.size == 0:
        print("Warning: ROI is empty or None")
        return None, None, None, None
    
    # Convert the ROI to grayscale
    roi_gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    
    # Calculate GCLM features for the grayscale ROI
    distances = [1, 2]  # Define the distances for GCLM
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]  # Define the angles for GCLM
    gclm = graycomatrix(roi_gray, distances=distances, angles=angles, levels=256,
                        symmetric=True, normed=True)
    
    # Calculate GCLM derivatives (contrast, dissimilarity, homogeneity, energy)
    contrast = graycoprops(gclm, 'contrast').mean()
    dissimilarity = graycoprops(gclm, 'dissimilarity').mean()
    homogeneity = graycoprops(gclm, 'homogeneity').mean()
    energy = graycoprops(gclm, 'energy').mean()
    
    return contrast, dissimilarity, homogeneity, energy

def adjust_bbox_to_image(image, bbox):
    # Get image dimensions
    image_height, image_width, _ = image.shape
    
    # Adjust bounding box coordinates if they exceed image boundaries
    x, y, width, height = bbox
    
    # Ensure the bounding box does not go beyond the image boundaries
    x = max(x, 0)
    y = max(y, 0)
    width = min(width, image_width - x)
    height = min(height, image_height - y)
    
    return x, y, width, height

def calculate_texture_metrics_for_directory(image_dir, csv_file):
    # Initialize an empty dataframe to store the texture metrics
    texture_metrics_df = pd.DataFrame()
    
    # List all files in the specified directory
    image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))]
    
    for image_file in image_files:
        try:
            # Construct the full path to the image file
            image_path = os.path.join(image_dir, image_file)
            
            # Load the image
            image = cv2.imread(image_path)
            
            if image is None:
                print(f"Warning: Image '{image_path}' not found or cannot be loaded.")
                continue
            
            # Read the CSV file
            csv_data = pd.read_csv(csv_file)
            
            # Find the corresponding image filename
            image_filename = os.path.basename(image_path)
            
            # Filter annotations based on the image filename
            annotations = csv_data[csv_data['filename'] == image_filename]
            
            # Initialize lists to store the texture metrics
            bbox_contrast_list = []
            bbox_dissimilarity_list = []
            bbox_homogeneity_list = []
            bbox_energy_list = []
            donut_contrast_list = []
            donut_dissimilarity_list = []
            donut_homogeneity_list = []
            donut_energy_list = []
            
            # Iterate through annotations and calculate texture metrics
            for _, row in annotations.iterrows():
                bbox = ast.literal_eval(row['consensus_bbox'])  # Parse bbox values from string to list
                
                # Adjust bounding box to stay within image boundaries
                bbox = adjust_bbox_to_image(image, bbox)
                
                # Calculate GCLM derivatives for bounding box
                bbox_contrast, bbox_dissimilarity, bbox_homogeneity, bbox_energy = calculate_gclm_derivatives(image, bbox)
                
                donut_left = max(0, bbox[0] - 20)  # Adjust the buffer size as needed
                donut_top = max(0, bbox[1] - 20)
                donut_right = min(image.shape[1], bbox[0] + bbox[2] + 20)
                donut_bottom = min(image.shape[0], bbox[1] + bbox[3] + 20)
                donut_bbox = [donut_left, donut_top, donut_right - donut_left, donut_bottom - donut_top]
                donut_contrast, donut_dissimilarity, donut_homogeneity, donut_energy = calculate_gclm_derivatives(image, donut_bbox)

                # Append the calculated texture metrics to the lists
                bbox_contrast_list.append(bbox_contrast)
                bbox_dissimilarity_list.append(bbox_dissimilarity)
                bbox_homogeneity_list.append(bbox_homogeneity)
                bbox_energy_list.append(bbox_energy)
                donut_contrast_list.append(donut_contrast)
                donut_dissimilarity_list.append(donut_dissimilarity)
                donut_homogeneity_list.append(donut_homogeneity)
                donut_energy_list.append(donut_energy)
            
            # Add texture metrics as columns to a temporary dataframe
            temp_df = pd.DataFrame({
                'ID': annotations["id"],
                'filename': [image_filename] * len(annotations),
                'bbox_contrast': bbox_contrast_list,
                'bbox_dissimilarity': bbox_dissimilarity_list,
                'bbox_homogeneity': bbox_homogeneity_list,
                'bbox_energy': bbox_energy_list,
                'donut_contrast': donut_contrast_list,
                'donut_dissimilarity': donut_dissimilarity_list,
                'donut_homogeneity': donut_homogeneity_list,
                'donut_energy': donut_energy_list
            })
            
            # Append the temporary dataframe to the main dataframe
            texture_metrics_df = pd.concat([texture_metrics_df, temp_df], ignore_index=True)
        except Exception as e:
            print(f"Error processing image '{image_path}': {e}")
    
    # Save the main dataframe with texture metrics to a CSV file
    texture_metrics_df.to_csv('E:\\imagefactors\\data\\gclm_crowd.csv', index=False)

# Example usage with a directory containing images
image_dir = 'E:\\imagefactors\\data\\zooniverse'
csv_file = 'E:\\imagefactors\\data\\crowdsourced_imagefactors.csv'  # Replace with the actual path to your CSV file containing the annotations

calculate_texture_metrics_for_directory(image_dir, csv_file)




In [53]:
#Merge the basic GCLM to the main dataframe
path1 = "E:\\imagefactors\\data\\gclm_crowd.csv"
with open(path1) as f1:
  gclm_crowd = pd.read_csv(f1)

df = pd.merge(df, gclm_crowd, on=["id", "filename"], how="left")
df.head()

Unnamed: 0,id,filename,consensus_class_ID,consensus_bbox,pielou_index,basefile,area,bbox_percent_area,same_class_percent,num_neighbors,...,distance_from_center,density,bbox_contrast,bbox_dissimilarity,bbox_homogeneity,bbox_energy,donut_contrast,donut_dissimilarity,donut_homogeneity,donut_energy
0,1,20211201_Atrisco_0459_01_01.png,Goose,"[634.05224609375, 260.4735412597656, 49.0, 80....",0.764205,20211201_Atrisco_0459,3920.06131,1.100016,100.0,1,...,2.163344,9,708.546349,18.148248,0.086465,0.016047,669.09411,17.871651,0.08455,0.014223
1,2,20211201_Atrisco_0459_01_01.png,Goose,"[555.4261474609375, 216.25, 53.0, 69.0]",0.764205,20211201_Atrisco_0459,3657.0,1.026198,100.0,1,...,1.628098,9,658.630723,17.472776,0.089489,0.01706,650.098199,17.941989,0.081127,0.013212
2,3,20211201_Atrisco_0459_01_01.png,Goose,"[266.75, 120.83124542236328, 60.33087158203125...",0.764205,20211201_Atrisco_0459,5188.454956,1.455943,100.0,4,...,0.723215,9,624.725403,17.631852,0.077243,0.014419,553.768272,16.981959,0.073528,0.012126
3,4,20211201_Atrisco_0459_01_01.png,Goose,"[176.8125, 22.46035385131836, 52.0, 84.0]",0.764205,20211201_Atrisco_0459,4368.0,1.225713,100.0,2,...,1.630146,9,463.748687,14.682327,0.093184,0.016236,477.028835,15.441471,0.082321,0.012909
4,5,20211201_Atrisco_0459_01_01.png,Goose,"[101.36946105957031, 170.06580352783203, 62.24...",0.721928,20211201_Atrisco_0459,5135.498171,1.441082,100.0,1,...,1.459121,9,475.779931,15.135325,0.091384,0.015773,465.466953,15.48335,0.080959,0.012942


In [54]:
# Calculate the differences for each GCLM statistic
df['contrast_difference'] = df['donut_contrast'] - df['bbox_contrast']
df['energy_difference'] = df['donut_energy'] - df['bbox_energy']
df['homogeneity_difference'] = df['donut_homogeneity'] - df['bbox_homogeneity']
df['dissimilarity_difference'] = df['donut_dissimilarity'] - df['bbox_dissimilarity']

In [55]:
#Save 
df.to_csv('E:/imagefactors/data/crowdsourced_imagefactors.csv', index=False)

In [43]:
#Merge with new file that has the simplified dependent variable
import pandas as pd

path1 = "E:\\imagefactors\\data\\crowdsourced_imagefactors.csv"
with open(path1) as f1:
  img = pd.read_csv(f1)

path2 = "E:\\imagefactors\\data\\consensusLabels_simple.csv"
with open(path2) as f2:
  simple = pd.read_csv(f2)

df = pd.merge(img, simple, on=["id"], how="left")

df.head()

Unnamed: 0,id,filename_x,consensus_class_ID_x,consensus_bbox_x,pielou_index,basefile,area,bbox_percent_area,same_class_percent,num_neighbors,...,energy_difference,homogeneity_difference,dissimilarity_difference,cluster_id,filename_y,consensus_class_ID_y,consensus_bbox_y,num_annotations,consensus_guesses,correct_fraction
0,1,20211201_Atrisco_0459_01_01.png,Goose,"[634.05224609375, 260.4735412597656, 49.0, 80....",0.764205,20211201_Atrisco_0459,3920.06131,1.100016,100.0,1,...,-0.001824,-0.001915,-0.276598,0.0,20211201_Atrisco_0459_01_01.png,Goose,"[634.05224609375, 260.4735412597656, 49.0, 80....",9.0,7.0,0.777778
1,2,20211201_Atrisco_0459_01_01.png,Goose,"[555.4261474609375, 216.25, 53.0, 69.0]",0.764205,20211201_Atrisco_0459,3657.0,1.026198,100.0,1,...,-0.003848,-0.008362,0.469213,1.0,20211201_Atrisco_0459_01_01.png,Goose,"[555.4261474609375, 216.25, 53.0, 69.0]",9.0,7.0,0.777778
2,3,20211201_Atrisco_0459_01_01.png,Goose,"[266.75, 120.83124542236328, 60.33087158203125...",0.764205,20211201_Atrisco_0459,5188.454956,1.455943,100.0,4,...,-0.002293,-0.003715,-0.649892,2.0,20211201_Atrisco_0459_01_01.png,Goose,"[266.75, 120.83124542236328, 60.33087158203125...",9.0,7.0,0.777778
3,4,20211201_Atrisco_0459_01_01.png,Goose,"[176.8125, 22.46035385131836, 52.0, 84.0]",0.764205,20211201_Atrisco_0459,4368.0,1.225713,100.0,2,...,-0.003327,-0.010862,0.759144,3.0,20211201_Atrisco_0459_01_01.png,Goose,"[176.8125, 22.46035385131836, 52.0, 84.0]",9.0,7.0,0.777778
4,5,20211201_Atrisco_0459_01_01.png,Goose,"[101.36946105957031, 170.06580352783203, 62.24...",0.721928,20211201_Atrisco_0459,5135.498171,1.441082,100.0,1,...,-0.002831,-0.010425,0.348026,4.0,20211201_Atrisco_0459_01_01.png,Goose,"[101.36946105957031, 170.06580352783203, 62.24...",10.0,8.0,0.8


In [44]:
df = df.drop(columns=['consensus_class_ID_y', 'consensus_bbox_y', 'filename_y'], axis=1)
df = df.rename(columns={'filename_x': 'filename', 'consensus_class_ID_x': 'consensus_class_ID', 'consensus_bbox_x': 'consensus_bbox'})
df['n-k'] = df['num_annotations'] - df['consensus_guesses']
df.head()

Unnamed: 0,id,filename,consensus_class_ID,consensus_bbox,pielou_index,basefile,area,bbox_percent_area,same_class_percent,num_neighbors,...,donut_energy,contrast_difference,energy_difference,homogeneity_difference,dissimilarity_difference,cluster_id,num_annotations,consensus_guesses,correct_fraction,n-k
0,1,20211201_Atrisco_0459_01_01.png,Goose,"[634.05224609375, 260.4735412597656, 49.0, 80....",0.764205,20211201_Atrisco_0459,3920.06131,1.100016,100.0,1,...,0.014223,-39.452239,-0.001824,-0.001915,-0.276598,0.0,9.0,7.0,0.777778,2.0
1,2,20211201_Atrisco_0459_01_01.png,Goose,"[555.4261474609375, 216.25, 53.0, 69.0]",0.764205,20211201_Atrisco_0459,3657.0,1.026198,100.0,1,...,0.013212,-8.532524,-0.003848,-0.008362,0.469213,1.0,9.0,7.0,0.777778,2.0
2,3,20211201_Atrisco_0459_01_01.png,Goose,"[266.75, 120.83124542236328, 60.33087158203125...",0.764205,20211201_Atrisco_0459,5188.454956,1.455943,100.0,4,...,0.012126,-70.957132,-0.002293,-0.003715,-0.649892,2.0,9.0,7.0,0.777778,2.0
3,4,20211201_Atrisco_0459_01_01.png,Goose,"[176.8125, 22.46035385131836, 52.0, 84.0]",0.764205,20211201_Atrisco_0459,4368.0,1.225713,100.0,2,...,0.012909,13.280148,-0.003327,-0.010862,0.759144,3.0,9.0,7.0,0.777778,2.0
4,5,20211201_Atrisco_0459_01_01.png,Goose,"[101.36946105957031, 170.06580352783203, 62.24...",0.721928,20211201_Atrisco_0459,5135.498171,1.441082,100.0,1,...,0.012942,-10.312978,-0.002831,-0.010425,0.348026,4.0,10.0,8.0,0.8,2.0


MODELING IMPACT OF IMAGE FACTORS ON LABELING DIFFICULTY

In [53]:
#Merge with new file that has the simplified dependent variable
import pandas as pd

path1 = "E:\\imagefactors\\data\\crowdsourced_imagefactors.csv"
with open(path1) as f1:
  img = pd.read_csv(f1)

path2 = "E:\\imagefactors\\data\\filtered_crowd.csv"
with open(path2) as f2:
  simple = pd.read_csv(f2)

df = pd.merge(img, simple, on=["consensus_bbox"], how="left")

df = df.drop(columns=['consensus_class_ID_y', 'filename_y'], axis=1)
df = df.rename(columns={'filename_x': 'filename', 'consensus_class_ID_x': 'consensus_class_ID'})
#df['n-k'] = df['num_annotations'] - df['consensus_guesses']
df.head()

Unnamed: 0.1,id,filename,consensus_class_ID,consensus_bbox,pielou_index,basefile,area,bbox_percent_area,same_class_percent,num_neighbors,...,donut_energy,contrast_difference,energy_difference,homogeneity_difference,dissimilarity_difference,Unnamed: 0,cluster_id,num_annotations,consensus_guesses,correct_fraction
0,1,20211201_Atrisco_0459_01_01.png,Goose,"[634.05224609375, 260.4735412597656, 49.0, 80....",0.764205,20211201_Atrisco_0459,3920.06131,1.100016,100.0,1,...,0.014223,-39.452239,-0.001824,-0.001915,-0.276598,1.0,0.0,9.0,7.0,0.777778
1,2,20211201_Atrisco_0459_01_01.png,Goose,"[555.4261474609375, 216.25, 53.0, 69.0]",0.764205,20211201_Atrisco_0459,3657.0,1.026198,100.0,1,...,0.013212,-8.532524,-0.003848,-0.008362,0.469213,2.0,1.0,9.0,7.0,0.777778
2,3,20211201_Atrisco_0459_01_01.png,Goose,"[266.75, 120.83124542236328, 60.33087158203125...",0.764205,20211201_Atrisco_0459,5188.454956,1.455943,100.0,4,...,0.012126,-70.957132,-0.002293,-0.003715,-0.649892,3.0,2.0,9.0,7.0,0.777778
3,4,20211201_Atrisco_0459_01_01.png,Goose,"[176.8125, 22.46035385131836, 52.0, 84.0]",0.764205,20211201_Atrisco_0459,4368.0,1.225713,100.0,2,...,0.012909,13.280148,-0.003327,-0.010862,0.759144,4.0,3.0,9.0,7.0,0.777778
4,5,20211201_Atrisco_0459_01_01.png,Goose,"[101.36946105957031, 170.06580352783203, 62.24...",0.721928,20211201_Atrisco_0459,5135.498171,1.441082,100.0,1,...,0.012942,-10.312978,-0.002831,-0.010425,0.348026,5.0,4.0,10.0,8.0,0.8


In [60]:
df = df.dropna()
len(df)

107698

In [62]:
#Dummy variables for class ID
data = pd.get_dummies(df, columns=["consensus_class_ID"], prefix="class")
for column in data.filter(like='class_'):
    data[column] = data[column].astype(int)
data.head()

Unnamed: 0.1,id,filename,consensus_bbox,pielou_index,basefile,area,bbox_percent_area,same_class_percent,num_neighbors,agl,...,homogeneity_difference,dissimilarity_difference,Unnamed: 0,cluster_id,num_annotations,consensus_guesses,correct_fraction,class_Crane,class_Duck,class_Goose
0,1,20211201_Atrisco_0459_01_01.png,"[634.05224609375, 260.4735412597656, 49.0, 80....",0.764205,20211201_Atrisco_0459,3920.06131,1.100016,100,1,30.0,...,-0.001915,-0.276598,1.0,0.0,9.0,7.0,0.777778,0,0,1
1,2,20211201_Atrisco_0459_01_01.png,"[555.4261474609375, 216.25, 53.0, 69.0]",0.764205,20211201_Atrisco_0459,3657.0,1.026198,100,1,30.0,...,-0.008362,0.469213,2.0,1.0,9.0,7.0,0.777778,0,0,1
2,3,20211201_Atrisco_0459_01_01.png,"[266.75, 120.83124542236328, 60.33087158203125...",0.764205,20211201_Atrisco_0459,5188.454956,1.455943,100,4,30.0,...,-0.003715,-0.649892,3.0,2.0,9.0,7.0,0.777778,0,0,1
3,4,20211201_Atrisco_0459_01_01.png,"[176.8125, 22.46035385131836, 52.0, 84.0]",0.764205,20211201_Atrisco_0459,4368.0,1.225713,100,2,30.0,...,-0.010862,0.759144,4.0,3.0,9.0,7.0,0.777778,0,0,1
4,5,20211201_Atrisco_0459_01_01.png,"[101.36946105957031, 170.06580352783203, 62.24...",0.721928,20211201_Atrisco_0459,5135.498171,1.441082,100,1,30.0,...,-0.010425,0.348026,5.0,4.0,10.0,8.0,0.8,0,0,1


In [70]:
data['n-k'] = data['num_annotations'] - data['consensus_guesses']

In [71]:
#Save data
data.to_csv('E:/imagefactors/data/filtered_crowd_IF.csv', index=False)

In [40]:
path = "E:/imagefactors/data/filtered_crowd_IF.csv"
with open(path) as f:
  data1 = pd.read_csv(f)

In [82]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Assuming 'data' is your DataFrame with predictor variables
predictors = data1[[
 'bbox_percent_area',
 #'same_class_percent',
 'num_neighbors',
 #'agl',
 #'gsd',
 #'distance_from_center',
 'density',
 #'bbox_contrast',
 #'bbox_dissimilarity',
 #'bbox_homogeneity',
 #'bbox_energy',
 #'donut_contrast',
 #'donut_dissimilarity',
 #'donut_homogeneity',
 #'donut_energy',
 #'contrast_difference',
 'energy_difference',
 #'homogeneity_difference',
 #'dissimilarity_difference',
 'class_Crane',
 'class_Duck',
 'class_Goose']]

# Calculate VIF
vif_data = pd.DataFrame()
vif_data["Variable"] = predictors.columns
vif_data["VIF"] = [variance_inflation_factor(predictors.values, i) for i in range(predictors.shape[1])]

# Display the VIF DataFrame
print(vif_data)

            Variable       VIF
0  bbox_percent_area  5.501058
1      num_neighbors  2.441503
2            density  2.410602
3  energy_difference  1.152246
4        class_Crane  3.645609
5         class_Duck  3.436611
6        class_Goose  1.272057


In [85]:
#Logistic Regression

import statsmodels.api as sm

# Define the independent variables
X = data[['bbox_percent_area', 'num_neighbors', 'density', 'energy_difference',
          'class_Goose', 'class_Crane']]

# Add a constant term to the independent variables
X = sm.add_constant(X)

# Define the response variable
y = data[['consensus_guesses', 'n-k']]

# Fit logistic regression model
model = sm.GLM(y, X, family=sm.families.Binomial()).fit()

# Display the model summary
print(model.summary())

                      Generalized Linear Model Regression Results                       
Dep. Variable:     ['consensus_guesses', 'n-k']   No. Observations:               107698
Model:                                      GLM   Df Residuals:                   107691
Model Family:                          Binomial   Df Model:                            6
Link Function:                            Logit   Scale:                          1.0000
Method:                                    IRLS   Log-Likelihood:                -95677.
Date:                          Wed, 06 Dec 2023   Deviance:                   1.1190e+05
Time:                                  08:58:50   Pearson chi2:                 1.20e+05
No. Iterations:                               6   Pseudo R-squ. (CS):             0.1738
Covariance Type:                      nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
------------------------

In [30]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Assuming 'data' is your DataFrame with predictor variables
predictors = data1[[
 #'bbox_percent_area',
 #'same_class_percent',
 'num_neighbors',
 #'agl',
 #'gsd',
 #'distance_from_center',
 'density',
 #'bbox_contrast',
 #'bbox_dissimilarity',
 #'bbox_homogeneity',
 #'bbox_energy',
 #'donut_contrast',
 #'donut_dissimilarity',
 #'donut_homogeneity',
 #'donut_energy',
 #'contrast_difference',
 'energy_difference',
 #'homogeneity_difference',
 #'dissimilarity_difference',
 'num_annotations',
 'class_Crane',
 'class_Duck',
 'class_Goose']]
 #'class_Seagull']]

# Calculate VIF
vif_data = pd.DataFrame()
vif_data["Variable"] = predictors.columns
vif_data["VIF"] = [variance_inflation_factor(predictors.values, i) for i in range(predictors.shape[1])]

# Display the VIF DataFrame
print(vif_data)

            Variable       VIF
0      num_neighbors  2.422707
1            density  2.519670
2  energy_difference  1.149683
3    num_annotations  4.205368
4        class_Crane  1.698305
5         class_Duck  2.816433
6        class_Goose  1.271071


In [12]:
#Multiple linear regression 
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

y = data['n-k']
X = data[['gsd', 'bbox_percent_area', 'same_class_percent', 'num_neighbors', 'distance_from_center', 'density', 'contrast_difference', 'energy_difference', 'homogeneity_difference', 'dissimilarity_difference',
        'class_Crane', 'class_Duck', 'class_Goose', 'class_Other Bird', 'class_Seagull']]

# Add a constant term to the independent variables (intercept)
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the regression summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    n-k   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Tue, 31 Oct 2023   Prob (F-statistic):                nan
Time:                        12:35:13   Log-Likelihood:                    nan
No. Observations:              136447   AIC:                               nan
Df Residuals:                  136432   BIC:                               nan
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [13]:
vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


                    Variable        VIF
0                      const   0.000000
1                        gsd   3.488659
2          bbox_percent_area   2.867210
3         same_class_percent   1.077093
4              num_neighbors   1.385959
5       distance_from_center   2.163836
6                    density   2.049233
7        contrast_difference  19.635931
8          energy_difference   4.236287
9     homogeneity_difference  12.530935
10  dissimilarity_difference  36.395213
11               class_Crane        inf
12                class_Duck        inf
13               class_Goose        inf
14          class_Other Bird        inf
15             class_Seagull        inf


In [14]:
correlation_matrix = X.corr()
correlated_pairs = [(var1, var2) for var1 in X.columns for var2 in X.columns if var1 != var2 and abs(correlation_matrix.loc[var1, var2]) > 0.7]
correlated_pairs

[('gsd', 'distance_from_center'),
 ('distance_from_center', 'gsd'),
 ('contrast_difference', 'dissimilarity_difference'),
 ('energy_difference', 'homogeneity_difference'),
 ('homogeneity_difference', 'energy_difference'),
 ('homogeneity_difference', 'dissimilarity_difference'),
 ('dissimilarity_difference', 'contrast_difference'),
 ('dissimilarity_difference', 'homogeneity_difference')]

In [16]:
#Multiple linear regression 
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

y = data['n-k']
X = data[['gsd', 'bbox_percent_area', 'same_class_percent', 'num_neighbors', 'density', 'contrast_difference', 'energy_difference',
        'class_Crane', 'class_Duck', 'class_Goose', 'class_Other Bird', 'class_Seagull']]

# Add a constant term to the independent variables (intercept)
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the regression summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    n-k   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Tue, 31 Oct 2023   Prob (F-statistic):                nan
Time:                        12:40:50   Log-Likelihood:                    nan
No. Observations:              136447   AIC:                               nan
Df Residuals:                  136435   BIC:                               nan
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                      nan    

In [21]:
#Multiple linear regression 
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

y = data['n-k']
X = data[['gsd', 'bbox_percent_area', 'same_class_percent', 'num_neighbors', 'distance_from_center', 'density', 'contrast_difference', 'energy_difference', 'homogeneity_difference', 'dissimilarity_difference',
        'class_Crane', 'class_Duck', 'class_Goose', 'class_Seagull']]

# Add a constant term to the independent variables (intercept)
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the regression summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    n-k   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Thu, 02 Nov 2023   Prob (F-statistic):                nan
Time:                        12:47:40   Log-Likelihood:                    nan
No. Observations:              136447   AIC:                               nan
Df Residuals:                  136432   BIC:                               nan
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

VISUALIZATIONS

In [None]:
# Create a pairplot with regression lines
sns.pairplot(df, x_vars=['gsd', 'bbox_percent_area', 'same_class_percent', 'num_neighbors', 'distance_from_center', 'density'], y_vars=['pielou_index'], kind='reg', height=4)
plt.show()

In [None]:
#Sample visualization of donut + bounding boxes just to make sure it all looks good

import os
import cv2
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

def visualize_bounding_boxes_with_donuts(image_path, csv_file):
    # Load the image
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB
    
    # Read the CSV file
    csv_data = pd.read_csv(csv_file)
    
    # Find the corresponding image filename
    image_filename = os.path.basename(image_path)
    
    # Filter annotations based on the image filename
    annotations = csv_data[csv_data['filename'] == image_filename]
    
    # Plot the image with bounding boxes and donut regions
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(image)
    ax.axis('off')
    
    # Iterate through annotations and draw bounding boxes and donut regions
    for _, row in annotations.iterrows():
        bbox = ast.literal_eval(row['consensus_bbox'])  # Parse bbox values from string to list
        
        # Draw bounding box
        rectangle = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], linewidth=1, edgecolor='r', facecolor='none')
        ax.add_patch(rectangle)
        
        # Draw donut region
        donut_left = max(0, bbox[0] - 20)  # Adjust the buffer size as needed
        donut_top = max(0, bbox[1] - 20)
        donut_right = min(image.shape[1], bbox[0] + bbox[2] + 20)
        donut_bottom = min(image.shape[0], bbox[1] + bbox[3] + 20)
        donut_rectangle = Rectangle((donut_left, donut_top), (donut_right - donut_left),
                                   (donut_bottom - donut_top), linewidth=1, edgecolor='g', facecolor='none')
        ax.add_patch(donut_rectangle)
    
    # Show the plot
    plt.show()

# Example usage
image_path = 'E:\\imagefactors\\data\\zooniverse\\20211201_Atrisco_0459_01_01.png'
csv_file = path

visualize_bounding_boxes_with_donuts(image_path, csv_file)


SCRATCH

In [None]:
#TEXTURE METRICS-- GCLM
# annotation + "donut" area (interior + exterior buffer)-- SINGLE IMAGE ONLY

def calculate_gclm_derivatives(image, bbox):
    # Convert bounding box coordinates to integers
    x, y, width, height = map(int, bbox)
    
    # Extract the region of interest (ROI) from the image using the bounding box
    roi = image[y:y+height, x:x+width]
    
    # Convert the ROI to grayscale
    roi_gray = cv2.cvtColor(roi, cv2.COLOR_RGB2GRAY)
    
    # Calculate GCLM features for the grayscale ROI
    distances = [1, 2]  # Define the distances for GCLM
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]  # Define the angles for GCLM
    gclm = graycomatrix(roi_gray, distances=distances, angles=angles, levels=256,
                        symmetric=True, normed=True)
    
    # Calculate GCLM derivatives (contrast, dissimilarity, homogeneity, energy)
    contrast = graycoprops(gclm, 'contrast').mean()
    dissimilarity = graycoprops(gclm, 'dissimilarity').mean()
    homogeneity = graycoprops(gclm, 'homogeneity').mean()
    energy = graycoprops(gclm, 'energy').mean()
    
    return contrast, dissimilarity, homogeneity, energy

def calculate_texture_metrics_for_csv(image_path, csv_file):
        # Load the image
        image = cv2.imread(image_path)
        
        # Read the CSV file
        csv_data = pd.read_csv(csv_file)
        
        # Find the corresponding image filename
        image_filename = os.path.basename(image_path)
        
        # Filter annotations based on the image filename
        annotations = csv_data[csv_data['filename'] == image_filename]
        
        # Initialize lists to store the texture metrics
        bbox_contrast_list = []
        bbox_dissimilarity_list = []
        bbox_homogeneity_list = []
        bbox_energy_list = []
        donut_contrast_list = []
        donut_dissimilarity_list = []
        donut_homogeneity_list = []
        donut_energy_list = []
        
        # Iterate through annotations and calculate texture metrics
        for _, row in annotations.iterrows():
            bbox = ast.literal_eval(row['consensus_bbox'])  # Parse bbox values from string to list
            
            # Calculate GCLM derivatives for bounding box and donut region
            bbox_contrast, bbox_dissimilarity, bbox_homogeneity, bbox_energy = calculate_gclm_derivatives(image, bbox)
            
            donut_left = max(0, bbox[0] - 20)  # Adjust the buffer size as needed
            donut_top = max(0, bbox[1] - 20)
            donut_right = min(image.shape[1], bbox[0] + bbox[2] + 20)
            donut_bottom = min(image.shape[0], bbox[1] + bbox[3] + 20)
            donut_bbox = [donut_left, donut_top, donut_right - donut_left, donut_bottom - donut_top]
            donut_contrast, donut_dissimilarity, donut_homogeneity, donut_energy = calculate_gclm_derivatives(image, donut_bbox)
            
            # Append the calculated texture metrics to the lists
            bbox_contrast_list.append(bbox_contrast)
            bbox_dissimilarity_list.append(bbox_dissimilarity)
            bbox_homogeneity_list.append(bbox_homogeneity)
            bbox_energy_list.append(bbox_energy)
            
            donut_contrast_list.append(donut_contrast)
            donut_dissimilarity_list.append(donut_dissimilarity)
            donut_homogeneity_list.append(donut_homogeneity)
            donut_energy_list.append(donut_energy)
        
        # Add texture metrics as new columns in the dataframe
        annotations['bbox_contrast'] = bbox_contrast_list
        annotations['bbox_dissimilarity'] = bbox_dissimilarity_list
        annotations['bbox_homogeneity'] = bbox_homogeneity_list
        annotations['bbox_energy'] = bbox_energy_list
        
        annotations['donut_contrast'] = donut_contrast_list
        annotations['donut_dissimilarity'] = donut_dissimilarity_list
        annotations['donut_homogeneity'] = donut_homogeneity_list
        annotations['donut_energy'] = donut_energy_list
        
        # Save the modified dataframe if needed
        annotations.to_csv('E:\\imagefactors\\data\\gclm.csv', index=False)
    
# Example usage
image_path = 'E:\\imagefactors\\data\\zooniverse\\20211212_Alameda_0285_07_08.png'
csv_file = path  # Replace with the actual path to your CSV file containing the annotations

calculate_texture_metrics_for_csv(image_path, csv_file)