Analysis of image factors on annotation consensus-- USFWS
Start date: 10/03/2023

LIBRARIES

In [1]:
import pandas as pd
from PIL import Image
import os
import ast
import numpy as np
import cv2
import pandas as pd
from skimage.feature import graycomatrix, graycoprops
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
import statsmodels.api as sm

LOAD DATA

In [None]:
#Analysis annotations
path = "E:\\imagefactors\\data\\expertconsensusLabels_agreementIndex_Superclass.csv"
with open(path) as f:
  df = pd.read_csv(f)

#path = "E:\\imagefactors\\data\\expertconsensusLabels_agreementIndex_Spp.csv"
#with open(path) as f:
#  df = pd.read_csv(f)

def eval_bbox_refined(row):
    if pd.notnull(row['bbox']):
        return ast.literal_eval(row['bbox'])
    else:
        return None
df['bbox'] = df.apply(eval_bbox_refined, axis=1)

In [None]:
#IF RESUMING FROM A SAVED POINT
#path = "E:/imagefactors/data/expert_imagefactors_superclass.csv"
#with open(path) as f:
#  df = pd.read_csv(f)
  
path = "E:/imagefactors/data/expert_imagefactors_SPP.csv"
with open(path) as f:
  df = pd.read_csv(f)

#Fixing how bounding boxes are read
def eval_bbox_refined(row):
    if pd.notnull(row['bbox']):
        return ast.literal_eval(row['bbox'])
    else:
        return None
df['bbox'] = df.apply(eval_bbox_refined, axis=1)
df.head()

DERIVE IMAGE/ANNOTATION FACTORS FOR ANALYSIS

In [None]:
#BBOX AREA

def calc_area(row):
    bbox = row['bbox']
    xmin, ymin, w, h = bbox
    return w * h

df['area'] = df.apply(calc_area, axis=1)

In [None]:
# % AREA BBOX
# Percent area of the bounding box of the total image area

# Define a function to calculate percentage area
def calculate_percentage_area(image_filename, bbox_area):
    image_path = os.path.join("E:\\imagefactors\\data\\usfws", image_filename)
    
    try:
        image = Image.open(image_path)
    except FileNotFoundError:
        # Handle the case where the image is not found
        print(f"Image not found: {image_path}")
        return None
    
    image_width, image_height = image.size
    image_area = image_width * image_height

    percentage_area = (bbox_area / image_area) * 100
    return percentage_area

# Implementation
df['bbox_percent_area'] = df.apply(lambda row: calculate_percentage_area(row['filename'], row['area']), axis=1)

In [None]:
# SAME CLASS %
# % of targets of the same class as the analysis target (in the same image)

# Define a function to calculate the percentage of same-class neighbors for a given row
def calculate_same_class_percentage(row, df):
    # Get the filename and class ID of the target bounding box
    filename = row['filename']
    class_id = row['consensus_class_ID']
    
    # Filter the DataFrame to include only rows with matching filenames
    matching_rows = df[df['filename'] == filename]
    
    # Calculate the total number of neighbors in the same image
    total_neighbors = len(matching_rows) - 1  # Subtract 1 to exclude the target bounding box
    
    if total_neighbors == 0:
        return 0  # Avoid division by zero
    
    # Calculate the number of same-class neighbors
    same_class_neighbors = len(matching_rows[matching_rows['consensus_class_ID'] == class_id]) - 1  # Subtract 1 to exclude the target bounding box
    
    # Calculate the percentage of same-class neighbors
    same_class_percentage = (same_class_neighbors / total_neighbors) * 100
    
    return same_class_percentage

# Calculate the same-class percentage for each row and add the results as a new column
df['same_class_percent'] = df.apply(lambda row: calculate_same_class_percentage(row, df), axis=1)

In [None]:
#NUMBER OF NEIGHBORS 
# Number of annotations within 2x maximum of bbox width or height (to account for positional differences)

# Define a function to calculate the number of neighbors for a given row
def count_neighbors(row, df):
    bbox = row['bbox']
    # Define the search radius as 2 times the maximum of width and height
    search_radius = 2 * max(bbox[2], bbox[3])
    
    # Calculate the center coordinates of the bounding box
    x_center = bbox[0] + bbox[2] / 2
    y_center = bbox[1] + bbox[3] / 2
    
    # Initialize a count for neighbors
    num_neighbors = 0
    
    # Iterate through rows with matching filenames
    matching_rows = df[df['filename'] == row['filename']]
    for _, neighbor_row in matching_rows.iterrows():
        if neighbor_row.name != row.name:
            # Extract 'bbox' values for the neighbor
            neighbor_bbox = neighbor_row['bbox']
            
            # Calculate the center coordinates of the potential neighbor
            neighbor_x_center = neighbor_bbox[0] + neighbor_bbox[2] / 2
            neighbor_y_center = neighbor_bbox[1] + neighbor_bbox[3] / 2
            
            # Calculate the Euclidean distance between centers
            distance = np.sqrt((x_center - neighbor_x_center)**2 + (y_center - neighbor_y_center)**2)
            
            # Check if the neighbor is within the search radius
            if distance <= search_radius:
                num_neighbors += 1
    
    return num_neighbors

# Calculate the number of neighbors for each row and add the results as a new column
df['num_neighbors'] = df.apply(lambda row: count_neighbors(row, df), axis=1)

In [None]:
#TOTAL NUMBER OF BIRDS PER IMAGE
df['density'] = df.groupby('filename')['bbox'].transform('count')

In [None]:
#OVERALL CLASS PREVALENCE IN THE DATASET-- SPP only

class_counts = df['consensus_class_ID'].value_counts()
class_prevalence = (class_counts / len(df)) * 100
df['rarity'] = df['consensus_class_ID'].map(class_prevalence)

In [None]:
#DISTANCE OF TARGET FROM IMAGE CENTER-- in meters

#Add in AGL/GSD info derived from derive_agl_gsd.ipynb
path1 = "E:\\imagefactors\\data\\benchmark_gsd.csv"
with open(path1) as f1:
  gsd_df = pd.read_csv(f1)

merged_df = pd.merge(df, gsd_df, on="filename", how="left")
#Remove Maxwell from analysis
merged_df = merged_df[merged_df["filename"] != "mxw_L13_20181215_1.JPG"]

# Function to calculate distance from center
def calculate_distance_from_center(row):
    image_path = os.path.join("E:\\imagefactors\\data\\usfws", row["filename"])
    
    try:
        image = Image.open(image_path)
    except FileNotFoundError:
        # Handle the case where the image is not found
        print(f"Image not found: {image_path}")
        return None  # You can return a special value, such as None, to indicate the image wasn't found
    
    image_width, image_height = image.size
    center_x_px = image_width/2 
    center_y_px = image_height/2
    gsd_m = row['gsd'] / 100

    row['center_x_m'] = center_x_px * gsd_m
    row['center_y_m'] = center_y_px * gsd_m
    
    # Get the coordinates of the bounding box (x, y, width, height)
    x, y, width, height = row['bbox']

    # Calculate the center point of the bounding box in pixels
    bbox_center_x_px = x + (width / 2)
    bbox_center_y_px = y + (height / 2)

    # Calculate the center point of the bounding box in meters
    bbox_center_x_m = bbox_center_x_px * gsd_m
    bbox_center_y_m = bbox_center_y_px * gsd_m

    # Calculate the distance from the center of the image in meters
    distance_m = ((row['center_x_m'] - bbox_center_x_m)**2 + (row['center_y_m'] - bbox_center_y_m)**2)**0.5

    return distance_m

# Apply the function to the merged dataframe
merged_df['distance_from_center'] = merged_df.apply(calculate_distance_from_center, axis=1)
df = merged_df

In [None]:
#TEXTURE METRICS-- GCLM
# annotation + "donut" area (interior + exterior buffer)

def calculate_gclm_derivatives(image, bbox):
    # Convert bounding box coordinates to integers
    x, y, width, height = map(int, bbox)
    
    # Extract the region of interest (ROI) from the image using the bounding box
    roi = image[y:y+height, x:x+width]
    
    # Check if the ROI is empty or None
    if roi is None or roi.size == 0:
        print("Warning: ROI is empty or None")
        return None, None, None, None
    
    # Convert the ROI to grayscale
    roi_gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    
    # Calculate GCLM features for the grayscale ROI
    distances = [1, 3, 5]  # Define the distances for GCLM
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4, np.pi/8, 3*np.pi/8, 5*np.pi/8, 7*np.pi/8]  # Define the angles for GCLM
    gclm = graycomatrix(roi_gray, distances=distances, angles=angles, levels=256,
                        symmetric=True, normed=True)
    
    # Calculate GCLM derivatives (contrast, dissimilarity, homogeneity, energy)
    contrast = graycoprops(gclm, 'contrast').mean()
    dissimilarity = graycoprops(gclm, 'dissimilarity').mean()
    homogeneity = graycoprops(gclm, 'homogeneity').mean()
    energy = graycoprops(gclm, 'energy').mean()
    
    return contrast, dissimilarity, homogeneity, energy

def calculate_texture_metrics_for_directory(image_dir, csv_file):
    # Initialize an empty dataframe to store the texture metrics
    texture_metrics_df = pd.DataFrame()
    
    # List all files in the specified directory
    image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))]
    
    for image_file in image_files:
        try:
            # Construct the full path to the image file
            image_path = os.path.join(image_dir, image_file)
            
            # Load the image
            image = cv2.imread(image_path)
            
            if image is None:
                print(f"Warning: Image '{image_path}' not found or cannot be loaded.")
                continue
            
            # Read the CSV file
            csv_data = pd.read_csv(csv_file)
            
            # Find the corresponding image filename
            image_filename = os.path.basename(image_path)
            
            # Filter annotations based on the image filename
            annotations = csv_data[csv_data['filename'] == image_filename]
            
            # Initialize lists to store the texture metrics
            bbox_contrast_list = []
            bbox_dissimilarity_list = []
            bbox_homogeneity_list = []
            bbox_energy_list = []
            donut_contrast_list = []
            donut_dissimilarity_list = []
            donut_homogeneity_list = []
            donut_energy_list = []
            
            # Iterate through annotations and calculate texture metrics
            for _, row in annotations.iterrows():
                bbox = ast.literal_eval(row['bbox'])  # Parse bbox values from string to list
                
                # Calculate GCLM derivatives for bounding box and donut region
                bbox_contrast, bbox_dissimilarity, bbox_homogeneity, bbox_energy = calculate_gclm_derivatives(image, bbox)
                
                donut_left = max(0, bbox[0] - 20)  # Adjust the buffer size as needed
                donut_top = max(0, bbox[1] - 20)
                donut_right = min(image.shape[1], bbox[0] + bbox[2] + 20)
                donut_bottom = min(image.shape[0], bbox[1] + bbox[3] + 20)
                donut_bbox = [donut_left, donut_top, donut_right - donut_left, donut_bottom - donut_top]
                donut_contrast, donut_dissimilarity, donut_homogeneity, donut_energy = calculate_gclm_derivatives(image, donut_bbox)
                
                # Append the calculated texture metrics to the lists
                bbox_contrast_list.append(bbox_contrast)
                bbox_dissimilarity_list.append(bbox_dissimilarity)
                bbox_homogeneity_list.append(bbox_homogeneity)
                bbox_energy_list.append(bbox_energy)
                
                donut_contrast_list.append(donut_contrast)
                donut_dissimilarity_list.append(donut_dissimilarity)
                donut_homogeneity_list.append(donut_homogeneity)
                donut_energy_list.append(donut_energy)
            
            # Add texture metrics as columns to a temporary dataframe
            temp_df = pd.DataFrame({
                'ID': annotations["Unnamed: 0"],
                'filename': [image_filename] * len(annotations),
                'bbox_contrast': bbox_contrast_list,
                'bbox_dissimilarity': bbox_dissimilarity_list,
                'bbox_homogeneity': bbox_homogeneity_list,
                'bbox_energy': bbox_energy_list,
                'donut_contrast': donut_contrast_list,
                'donut_dissimilarity': donut_dissimilarity_list,
                'donut_homogeneity': donut_homogeneity_list,
                'donut_energy': donut_energy_list
            })
            
            # Append the temporary dataframe to the main dataframe
            texture_metrics_df = pd.concat([texture_metrics_df, temp_df], ignore_index=True)
        except Exception as e:
            print(f"Error processing image '{image_path}': {e}")
    
    # Save the main dataframe with texture metrics to a CSV file
    texture_metrics_df.to_csv('E:\\imagefactors\\data\\gclm_usfws_135_8angles.csv', index=False)

# Example usage with a directory containing images
image_dir = 'E:\\imagefactors\\data\\usfws'
csv_file = path  # Replace with the actual path to your CSV file containing the annotations

calculate_texture_metrics_for_directory(image_dir, csv_file)

In [None]:
#Same as above, but using a PCA as the base instead of the greyscale image

# Modify the function to calculate texture metrics based on PCA
def calculate_pca_texture_metrics(image, bbox):
    # Convert bounding box coordinates to integers
    x, y, width, height = map(int, bbox)
    
    # Extract the region of interest (ROI) from the image using the bounding box
    roi = image[y:y+height, x:x+width]
    
    # Check if the ROI is empty or None
    if roi is None or roi.size == 0:
        print("Warning: ROI is empty or None")
        return None, None, None, None
    
    # Convert the ROI to grayscale
    roi_gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    
    # Perform PCA on the grayscale ROI
    pca = PCA(n_components=3)  # Choose the number of components you want to use
    roi_pca = pca.fit_transform(roi_gray)

    # Convert the PCA-transformed ROI to unsigned integer type
    roi_pca = roi_pca.astype(np.uint8)
    
    # Calculate GCLM features for the PCA-transformed ROI
    distances = [1, 2]  # Define the distances for GCLM
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]  # Define the angles for GCLM
    gclm = graycomatrix(roi_pca, distances=distances, angles=angles, levels=256,
                        symmetric=True, normed=True)
    
    # Calculate GCLM derivatives (contrast, dissimilarity, homogeneity, energy)
    contrast = graycoprops(gclm, 'contrast').mean()
    dissimilarity = graycoprops(gclm, 'dissimilarity').mean()
    homogeneity = graycoprops(gclm, 'homogeneity').mean()
    energy = graycoprops(gclm, 'energy').mean()
    
    return contrast, dissimilarity, homogeneity, energy

# Modify the function to use the new texture metrics calculation function
def calculate_texture_metrics_for_directory_pca(image_dir, csv_file):
    # Initialize an empty dataframe to store the texture metrics
    texture_metrics_df = pd.DataFrame()
    
    # List all files in the specified directory
    image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))]
    
    for image_file in image_files:
        try:
            # Construct the full path to the image file
            image_path = os.path.join(image_dir, image_file)
            
            # Load the image
            image = cv2.imread(image_path)
            
            if image is None:
                print(f"Warning: Image '{image_path}' not found or cannot be loaded.")
                continue
            
            # Read the CSV file
            csv_data = pd.read_csv(csv_file)
            
            # Find the corresponding image filename
            image_filename = os.path.basename(image_path)
            
            # Filter annotations based on the image filename
            annotations = csv_data[csv_data['filename'] == image_filename]
            
            # Initialize lists to store the texture metrics
            bbox_contrast_list = []
            bbox_dissimilarity_list = []
            bbox_homogeneity_list = []
            bbox_energy_list = []
            donut_contrast_list = []
            donut_dissimilarity_list = []
            donut_homogeneity_list = []
            donut_energy_list = []
            
            # Iterate through annotations and calculate texture metrics
            for _, row in annotations.iterrows():
                bbox = ast.literal_eval(row['bbox'])  # Parse bbox values from string to list
                
                # Calculate GCLM derivatives for bounding box and donut region
                bbox_contrast, bbox_dissimilarity, bbox_homogeneity, bbox_energy = calculate_pca_texture_metrics(image, bbox)
                
                donut_left = max(0, bbox[0] - 20)  # Adjust the buffer size as needed
                donut_top = max(0, bbox[1] - 20)
                donut_right = min(image.shape[1], bbox[0] + bbox[2] + 20)
                donut_bottom = min(image.shape[0], bbox[1] + bbox[3] + 20)
                donut_bbox = [donut_left, donut_top, donut_right - donut_left, donut_bottom - donut_top]
                donut_contrast, donut_dissimilarity, donut_homogeneity, donut_energy = calculate_pca_texture_metrics(image, donut_bbox)
                
                # Append the calculated texture metrics to the lists
                bbox_contrast_list.append(bbox_contrast)
                bbox_dissimilarity_list.append(bbox_dissimilarity)
                bbox_homogeneity_list.append(bbox_homogeneity)
                bbox_energy_list.append(bbox_energy)
                
                donut_contrast_list.append(donut_contrast)
                donut_dissimilarity_list.append(donut_dissimilarity)
                donut_homogeneity_list.append(donut_homogeneity)
                donut_energy_list.append(donut_energy)
            
            # Add texture metrics as columns to a temporary dataframe
            temp_df = pd.DataFrame({
                'ID': annotations["Unnamed: 0"],
                'filename': [image_filename] * len(annotations),
                'bbox_contrast': bbox_contrast_list,
                'bbox_dissimilarity': bbox_dissimilarity_list,
                'bbox_homogeneity': bbox_homogeneity_list,
                'bbox_energy': bbox_energy_list,
                'donut_contrast': donut_contrast_list,
                'donut_dissimilarity': donut_dissimilarity_list,
                'donut_homogeneity': donut_homogeneity_list,
                'donut_energy': donut_energy_list
            })
            
            # Append the temporary dataframe to the main dataframe
            texture_metrics_df = pd.concat([texture_metrics_df, temp_df], ignore_index=True)
        except Exception as e:
            print(f"Error processing image '{image_path}': {e}")
    
    # Save the main dataframe with texture metrics to a CSV file
    texture_metrics_df.to_csv('E:\\imagefactors\\data\\gclm_usfws_pca.csv', index=False)

# Example usage with a directory containing images
image_dir = 'E:\\imagefactors\\data\\usfws'
csv_file = path  # Replace with the actual path to your CSV file containing the annotations

calculate_texture_metrics_for_directory_pca(image_dir, csv_file)

In [None]:
#Same as above-- but calculate only on one band 

# Modify the function to calculate texture metrics based on a specific band
def calculate_band_texture_metrics(image, bbox, band):
    # Convert bounding box coordinates to integers
    x, y, width, height = map(int, bbox)
    
    # Extract the region of interest (ROI) from the image using the bounding box
    roi = image[y:y+height, x:x+width, band]
    
    # Check if the ROI is empty or None
    if roi is None or roi.size == 0:
        print("Warning: ROI is empty or None")
        return None, None, None, None
    
    # Calculate GCLM features for the grayscale ROI
    distances = [1, 2]  # Define the distances for GCLM
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]  # Define the angles for GCLM
    gclm = graycomatrix(roi, distances=distances, angles=angles, levels=256,
                        symmetric=True, normed=True)
    
    # Calculate GCLM derivatives (contrast, dissimilarity, homogeneity, energy)
    contrast = graycoprops(gclm, 'contrast').mean()
    dissimilarity = graycoprops(gclm, 'dissimilarity').mean()
    homogeneity = graycoprops(gclm, 'homogeneity').mean()
    energy = graycoprops(gclm, 'energy').mean()
    
    return contrast, dissimilarity, homogeneity, energy

# Modify the function to use the new texture metrics calculation function
def calculate_texture_metrics_for_directory_band(image_dir, csv_file, band):
    # Initialize an empty dataframe to store the texture metrics
    texture_metrics_df = pd.DataFrame()
    
    # List all files in the specified directory
    image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))]
    
    for image_file in image_files:
        try:
            # Construct the full path to the image file
            image_path = os.path.join(image_dir, image_file)
            
            # Load the image
            image = cv2.imread(image_path)
            
            if image is None:
                print(f"Warning: Image '{image_path}' not found or cannot be loaded.")
                continue
            
            # Read the CSV file
            csv_data = pd.read_csv(csv_file)
            
            # Find the corresponding image filename
            image_filename = os.path.basename(image_path)
            
            # Filter annotations based on the image filename
            annotations = csv_data[csv_data['filename'] == image_filename]
            
            # Initialize lists to store the texture metrics
            bbox_contrast_list = []
            bbox_dissimilarity_list = []
            bbox_homogeneity_list = []
            bbox_energy_list = []
            donut_contrast_list = []
            donut_dissimilarity_list = []
            donut_homogeneity_list = []
            donut_energy_list = []
            
            # Iterate through annotations and calculate texture metrics
            for _, row in annotations.iterrows():
                bbox = ast.literal_eval(row['bbox'])  # Parse bbox values from string to list
                
                # Calculate GCLM derivatives for bounding box and donut region
                bbox_contrast, bbox_dissimilarity, bbox_homogeneity, bbox_energy = calculate_band_texture_metrics(image, bbox, band)
                
                donut_left = max(0, bbox[0] - 20)  # Adjust the buffer size as needed
                donut_top = max(0, bbox[1] - 20)
                donut_right = min(image.shape[1], bbox[0] + bbox[2] + 20)
                donut_bottom = min(image.shape[0], bbox[1] + bbox[3] + 20)
                donut_bbox = [donut_left, donut_top, donut_right - donut_left, donut_bottom - donut_top]
                donut_contrast, donut_dissimilarity, donut_homogeneity, donut_energy = calculate_band_texture_metrics(image, donut_bbox, band)
                
                # Append the calculated texture metrics to the lists
                bbox_contrast_list.append(bbox_contrast)
                bbox_dissimilarity_list.append(bbox_dissimilarity)
                bbox_homogeneity_list.append(bbox_homogeneity)
                bbox_energy_list.append(bbox_energy)
                
                donut_contrast_list.append(donut_contrast)
                donut_dissimilarity_list.append(donut_dissimilarity)
                donut_homogeneity_list.append(donut_homogeneity)
                donut_energy_list.append(donut_energy)
            
            # Add texture metrics as columns to a temporary dataframe
            temp_df = pd.DataFrame({
                'ID': annotations["ID"],
                'filename': [image_filename] * len(annotations),
                'bbox_contrast': bbox_contrast_list,
                'bbox_dissimilarity': bbox_dissimilarity_list,
                'bbox_homogeneity': bbox_homogeneity_list,
                'bbox_energy': bbox_energy_list,
                'donut_contrast': donut_contrast_list,
                'donut_dissimilarity': donut_dissimilarity_list,
                'donut_homogeneity': donut_homogeneity_list,
                'donut_energy': donut_energy_list
            })
            
            # Append the temporary dataframe to the main dataframe
            texture_metrics_df = pd.concat([texture_metrics_df, temp_df], ignore_index=True)
        except Exception as e:
            print(f"Error processing image '{image_path}': {e}")
    
    # Save the main dataframe with texture metrics to a CSV file
    texture_metrics_df.to_csv('E:\\imagefactors\\data\\gclm_usfws_blue.csv', index=False)

# Example usage with a directory containing images and specifying the band (e.g., band=0 for blue, band=1 for green, band=2 for red)
image_dir = 'E:\\imagefactors\\data\\usfws'
csv_file = path  # Replace with the actual path to your CSV file containing the annotations
band = 0  # Specify the band (0 for blue, 1 for green, 2 for red)

calculate_texture_metrics_for_directory_band(image_dir, csv_file, band)

In [None]:
#Merge all the texture metric calc sheets together, then to the main dataframe

path1 = "E:\\imagefactors\\data\\gclm_usfws_8angles.csv"
with open(path1) as f1:
  gclm_angles = pd.read_csv(f1)

path2 = "E:\\imagefactors\\data\\gclm_usfws_135.csv"
with open(path2) as f2:
  gclm_135 = pd.read_csv(f2)

path3 = "E:\\imagefactors\\data\\gclm_usfws_135_8angles.csv"
with open(path3) as f3:
  gclm_8angles135 = pd.read_csv(f3)

path4 = "E:\\imagefactors\\data\\gclm_usfws_pca.csv"
with open(path4) as f4:
  gclm_pca = pd.read_csv(f4)

path5 = "E:\\imagefactors\\data\\gclm_usfws_blue.csv"
with open(path5) as f5:
  gclm_blue = pd.read_csv(f5)

path6 = "E:\\imagefactors\\data\\gclm_usfws_green.csv"
with open(path6) as f6:
  gclm_green = pd.read_csv(f6)

path7 = "E:\\imagefactors\\data\\gclm_usfws_red.csv"
with open(path7) as f7:
  gclm_red = pd.read_csv(f7)

path8 = "E:\\imagefactors\\data\\gclm_usfws_smallkernel.csv"
with open(path8) as f8:
  gclm_smallkernel = pd.read_csv(f8)

merged_df = pd.merge(gclm_smallkernel, gclm_red, on=["ID", "filename"], how="left")
merged_df = pd.merge(merged_df, gclm_green, on=["ID", "filename"], how="left")
merged_df = pd.merge(merged_df, gclm_blue, on=["ID", "filename"], how="left")
merged_df = pd.merge(merged_df, gclm_pca, on=["ID", "filename"], how="left")
merged_df = pd.merge(merged_df, gclm_angles, on=["ID", "filename"], how="left")
merged_df = pd.merge(merged_df, gclm_135, on=["ID", "filename"], how="left")
merged_df = pd.merge(merged_df, gclm_8angles135, on=["ID", "filename"], how="left")
merged_df.head()

In [None]:
#Examining the different texture values and looking at whether they derive different information
from scipy.stats import kruskal

# Kruskal-Wallis test
statistic, p_value = kruskal(merged_df['bbox_dissimilarity'], merged_df['donut_dissimilarity'])

# Print the results
print("Kruskal-Wallis Statistic:", statistic)
print("P-Value:", p_value)

# Check if the p-value is less than your chosen significance level (e.g., 0.05)
if p_value < 0.05:
    print("There are statistically significant differences between at least two groups.")
else:
    print("There is no significant difference between the groups.")

In [None]:
#Merge the basic GCLM to the main dataframe
df = pd.merge(df, gclm_smallkernel, on=["ID", "filename"], how="left")
df.head()

In [None]:
# Calculate the differences for each GCLM statistic
df['contrast_difference'] = df['donut_contrast'] - df['bbox_contrast']
df['energy_difference'] = df['donut_energy'] - df['bbox_energy']
df['homogeneity_difference'] = df['donut_homogeneity'] - df['bbox_homogeneity']
df['dissimilarity_difference'] = df['donut_dissimilarity'] - df['bbox_dissimilarity']


In [None]:
#Save 
df.to_csv('E:/imagefactors/data/expert_imagefactors_SPP.csv', index=False)

In [7]:
#Merge with new file that has the simplified dependent variable
import pandas as pd

path1 = "E:\\imagefactors\\data\\expert_imagefactors_SPP.csv"
with open(path1) as f1:
  img = pd.read_csv(f1)

path2 = "E:\\imagefactors\\data\\expertLabels_simple.csv"
with open(path2) as f2:
  simple = pd.read_csv(f2)

df = pd.merge(img, simple, on=["id"], how="left")

df.head()


Unnamed: 0,id,cluster_id_x,filename_x,consensus_class_ID_x,bbox,pielou_index,area,bbox_percent_area,same_class_percent,num_neighbors,...,energy_difference,homogeneity_difference,dissimilarity_difference,cluster_id_y,filename_y,consensus_class_ID_y,consensus_bbox,num_annotations,consensus_guesses,correct_fraction
0,1,0,BDA_12C_20181127_1.JPG,Canadian Goose,"[4445.5, 2719.5, 95.0, 80.5]",0.0,7647.5,0.038311,54.878049,1,...,-0.002017,-0.00306,-0.901467,0,BDA_12C_20181127_1.JPG,Canadian Goose,"[4445.5, 2719.5, 95.0, 80.5]",10,10,1.0
1,2,1,BDA_12C_20181127_1.JPG,Canadian Goose,"[4312.5, 2739.5, 98.0, 44.0]",0.468996,4312.0,0.021601,54.878049,2,...,-0.003161,-0.002116,-0.316365,1,BDA_12C_20181127_1.JPG,Canadian Goose,"[4312.5, 2739.5, 98.0, 44.0]",10,9,0.9
2,3,2,BDA_12C_20181127_1.JPG,Canadian Goose,"[3725.5, 1779.0, 73.5, 70.5]",0.468996,5181.75,0.025958,54.878049,1,...,-0.003372,-0.012692,1.908553,2,BDA_12C_20181127_1.JPG,Canadian Goose,"[3725.5, 1779.0, 73.5, 70.5]",10,9,0.9
3,4,3,BDA_12C_20181127_1.JPG,Canadian Goose,"[3628.0, 1882.0, 92.0, 38.0]",0.0,3496.0,0.017513,54.878049,3,...,-0.006176,-0.013843,0.038127,3,BDA_12C_20181127_1.JPG,Canadian Goose,"[3628.0, 1882.0, 92.0, 38.0]",9,9,1.0
4,5,4,BDA_12C_20181127_1.JPG,Canadian Goose,"[3679.0, 1929.0, 65.0, 82.0]",0.0,5330.0,0.026701,54.878049,3,...,-0.002882,-0.005665,1.620205,4,BDA_12C_20181127_1.JPG,Canadian Goose,"[3679.0, 1929.0, 65.0, 82.0]",9,9,1.0


In [11]:
df = df.drop(columns=['cluster_id_x', 'cluster_id_y', 'consensus_class_ID_y', 'consensus_bbox', 'filename_y'], axis=1)
df.head()

Unnamed: 0,id,filename_x,consensus_class_ID_x,bbox,pielou_index,area,bbox_percent_area,same_class_percent,num_neighbors,agl,...,donut_dissimilarity,donut_homogeneity,donut_energy,contrast_difference,energy_difference,homogeneity_difference,dissimilarity_difference,num_annotations,consensus_guesses,correct_fraction
0,1,BDA_12C_20181127_1.JPG,Canadian Goose,"[4445.5, 2719.5, 95.0, 80.5]",0.0,7647.5,0.038311,54.878049,1,41.27,...,20.902788,0.062954,0.010217,-133.499291,-0.002017,-0.00306,-0.901467,10,10,1.0
1,2,BDA_12C_20181127_1.JPG,Canadian Goose,"[4312.5, 2739.5, 98.0, 44.0]",0.468996,4312.0,0.021601,54.878049,2,41.27,...,19.51671,0.06736,0.011356,-58.319485,-0.003161,-0.002116,-0.316365,10,9,0.9
2,3,BDA_12C_20181127_1.JPG,Canadian Goose,"[3725.5, 1779.0, 73.5, 70.5]",0.468996,5181.75,0.025958,54.878049,1,41.27,...,22.822854,0.058255,0.010184,68.924911,-0.003372,-0.012692,1.908553,10,9,0.9
3,4,BDA_12C_20181127_1.JPG,Canadian Goose,"[3628.0, 1882.0, 92.0, 38.0]",0.0,3496.0,0.017513,54.878049,3,41.27,...,22.653116,0.080785,0.014425,-149.563152,-0.006176,-0.013843,0.038127,9,9,1.0
4,5,BDA_12C_20181127_1.JPG,Canadian Goose,"[3679.0, 1929.0, 65.0, 82.0]",0.0,5330.0,0.026701,54.878049,3,41.27,...,23.769565,0.064042,0.010392,147.23506,-0.002882,-0.005665,1.620205,9,9,1.0


In [12]:
df = df.rename(columns={'filename_x': 'filename', 'consensus_class_ID_x': 'consensus_class_ID'})
df.head()

Unnamed: 0,id,filename,consensus_class_ID,bbox,pielou_index,area,bbox_percent_area,same_class_percent,num_neighbors,agl,...,donut_dissimilarity,donut_homogeneity,donut_energy,contrast_difference,energy_difference,homogeneity_difference,dissimilarity_difference,num_annotations,consensus_guesses,correct_fraction
0,1,BDA_12C_20181127_1.JPG,Canadian Goose,"[4445.5, 2719.5, 95.0, 80.5]",0.0,7647.5,0.038311,54.878049,1,41.27,...,20.902788,0.062954,0.010217,-133.499291,-0.002017,-0.00306,-0.901467,10,10,1.0
1,2,BDA_12C_20181127_1.JPG,Canadian Goose,"[4312.5, 2739.5, 98.0, 44.0]",0.468996,4312.0,0.021601,54.878049,2,41.27,...,19.51671,0.06736,0.011356,-58.319485,-0.003161,-0.002116,-0.316365,10,9,0.9
2,3,BDA_12C_20181127_1.JPG,Canadian Goose,"[3725.5, 1779.0, 73.5, 70.5]",0.468996,5181.75,0.025958,54.878049,1,41.27,...,22.822854,0.058255,0.010184,68.924911,-0.003372,-0.012692,1.908553,10,9,0.9
3,4,BDA_12C_20181127_1.JPG,Canadian Goose,"[3628.0, 1882.0, 92.0, 38.0]",0.0,3496.0,0.017513,54.878049,3,41.27,...,22.653116,0.080785,0.014425,-149.563152,-0.006176,-0.013843,0.038127,9,9,1.0
4,5,BDA_12C_20181127_1.JPG,Canadian Goose,"[3679.0, 1929.0, 65.0, 82.0]",0.0,5330.0,0.026701,54.878049,3,41.27,...,23.769565,0.064042,0.010392,147.23506,-0.002882,-0.005665,1.620205,9,9,1.0


In [13]:
df['n-k'] = df['num_annotations'] - df['consensus_guesses']
df.head()

Unnamed: 0,id,filename,consensus_class_ID,bbox,pielou_index,area,bbox_percent_area,same_class_percent,num_neighbors,agl,...,donut_homogeneity,donut_energy,contrast_difference,energy_difference,homogeneity_difference,dissimilarity_difference,num_annotations,consensus_guesses,correct_fraction,n-k
0,1,BDA_12C_20181127_1.JPG,Canadian Goose,"[4445.5, 2719.5, 95.0, 80.5]",0.0,7647.5,0.038311,54.878049,1,41.27,...,0.062954,0.010217,-133.499291,-0.002017,-0.00306,-0.901467,10,10,1.0,0
1,2,BDA_12C_20181127_1.JPG,Canadian Goose,"[4312.5, 2739.5, 98.0, 44.0]",0.468996,4312.0,0.021601,54.878049,2,41.27,...,0.06736,0.011356,-58.319485,-0.003161,-0.002116,-0.316365,10,9,0.9,1
2,3,BDA_12C_20181127_1.JPG,Canadian Goose,"[3725.5, 1779.0, 73.5, 70.5]",0.468996,5181.75,0.025958,54.878049,1,41.27,...,0.058255,0.010184,68.924911,-0.003372,-0.012692,1.908553,10,9,0.9,1
3,4,BDA_12C_20181127_1.JPG,Canadian Goose,"[3628.0, 1882.0, 92.0, 38.0]",0.0,3496.0,0.017513,54.878049,3,41.27,...,0.080785,0.014425,-149.563152,-0.006176,-0.013843,0.038127,9,9,1.0,0
4,5,BDA_12C_20181127_1.JPG,Canadian Goose,"[3679.0, 1929.0, 65.0, 82.0]",0.0,5330.0,0.026701,54.878049,3,41.27,...,0.064042,0.010392,147.23506,-0.002882,-0.005665,1.620205,9,9,1.0,0


MODELING IMPACT OF IMAGE FACTORS ON LABELING DIFFICULTY

In [20]:
df['n-k'].value_counts()

n-k
0    844
1    505
2    421
3    232
4    145
5     44
6     14
Name: count, dtype: int64

In [17]:
#Dummy variables for class ID
df = pd.get_dummies(df, columns=["consensus_class_ID"], prefix="class")
df.head()

Unnamed: 0,id,filename,bbox,pielou_index,area,bbox_percent_area,same_class_percent,num_neighbors,agl,gsd,...,n-k,class_American Wigeon,class_Canadian Goose,class_Gadwall,class_Mallard,class_Northern Pintail,class_Northern Shoveler,class_Other,class_Sandhill Crane,class_Teal
0,1,BDA_12C_20181127_1.JPG,"[4445.5, 2719.5, 95.0, 80.5]",0.0,7647.5,0.038311,54.878049,1,41.27,0.932734,...,0,False,True,False,False,False,False,False,False,False
1,2,BDA_12C_20181127_1.JPG,"[4312.5, 2739.5, 98.0, 44.0]",0.468996,4312.0,0.021601,54.878049,2,41.27,0.932734,...,1,False,True,False,False,False,False,False,False,False
2,3,BDA_12C_20181127_1.JPG,"[3725.5, 1779.0, 73.5, 70.5]",0.468996,5181.75,0.025958,54.878049,1,41.27,0.932734,...,1,False,True,False,False,False,False,False,False,False
3,4,BDA_12C_20181127_1.JPG,"[3628.0, 1882.0, 92.0, 38.0]",0.0,3496.0,0.017513,54.878049,3,41.27,0.932734,...,0,False,True,False,False,False,False,False,False,False
4,5,BDA_12C_20181127_1.JPG,"[3679.0, 1929.0, 65.0, 82.0]",0.0,5330.0,0.026701,54.878049,3,41.27,0.932734,...,0,False,True,False,False,False,False,False,False,False


In [23]:
for column in data.filter(like='class_'):
    data[column] = data[column].astype(int)
data.head()

Unnamed: 0,id,filename,bbox,pielou_index,area,bbox_percent_area,same_class_percent,num_neighbors,agl,gsd,...,n-k,class_American Wigeon,class_Canadian Goose,class_Gadwall,class_Mallard,class_Northern Pintail,class_Northern Shoveler,class_Other,class_Sandhill Crane,class_Teal
0,1,BDA_12C_20181127_1.JPG,"[4445.5, 2719.5, 95.0, 80.5]",0.0,7647.5,0.038311,54,1,41.27,0.932734,...,0,0,1,0,0,0,0,0,0,0
1,2,BDA_12C_20181127_1.JPG,"[4312.5, 2739.5, 98.0, 44.0]",0.468996,4312.0,0.021601,54,2,41.27,0.932734,...,1,0,1,0,0,0,0,0,0,0
2,3,BDA_12C_20181127_1.JPG,"[3725.5, 1779.0, 73.5, 70.5]",0.468996,5181.75,0.025958,54,1,41.27,0.932734,...,1,0,1,0,0,0,0,0,0,0
3,4,BDA_12C_20181127_1.JPG,"[3628.0, 1882.0, 92.0, 38.0]",0.0,3496.0,0.017513,54,3,41.27,0.932734,...,0,0,1,0,0,0,0,0,0,0
4,5,BDA_12C_20181127_1.JPG,"[3679.0, 1929.0, 65.0, 82.0]",0.0,5330.0,0.026701,54,3,41.27,0.932734,...,0,0,1,0,0,0,0,0,0,0


In [20]:
data.to_csv('E:/imagefactors/data/expert_IF_SPP_update.csv', index=False)

In [21]:
path = "E:/imagefactors/data/expert_IF_SPP_update.csv"
with open(path) as f:
  data = pd.read_csv(f)

data.head()

Unnamed: 0,id,filename,bbox,pielou_index,area,bbox_percent_area,same_class_percent,num_neighbors,agl,gsd,...,correct_fraction,n-k,class_American Wigeon,class_Canadian Goose,class_Gadwall,class_Mallard,class_Northern Pintail,class_Northern Shoveler,class_Other,class_Sandhill Crane
0,1,BDA_12C_20181127_1.JPG,"[4445.5, 2719.5, 95.0, 80.5]",0.0,7647.5,0.038311,54,1,41.27,0.932734,...,1.0,0,0,1,0,0,0,0,0,0
1,2,BDA_12C_20181127_1.JPG,"[4312.5, 2739.5, 98.0, 44.0]",0.468996,4312.0,0.021601,54,2,41.27,0.932734,...,0.9,1,0,1,0,0,0,0,0,0
2,3,BDA_12C_20181127_1.JPG,"[3725.5, 1779.0, 73.5, 70.5]",0.468996,5181.75,0.025958,54,1,41.27,0.932734,...,0.9,1,0,1,0,0,0,0,0,0
3,4,BDA_12C_20181127_1.JPG,"[3628.0, 1882.0, 92.0, 38.0]",0.0,3496.0,0.017513,54,3,41.27,0.932734,...,1.0,0,0,1,0,0,0,0,0,0
4,5,BDA_12C_20181127_1.JPG,"[3679.0, 1929.0, 65.0, 82.0]",0.0,5330.0,0.026701,54,3,41.27,0.932734,...,1.0,0,0,1,0,0,0,0,0,0


In [22]:
#Logistic Regression

import statsmodels.api as sm

# Define the independent variables
X = data[['bbox_percent_area', 'gsd', 'num_neighbors', 'contrast_difference',
          'class_American Wigeon', 'class_Canadian Goose', 'class_Gadwall',
          'class_Mallard', 'class_Northern Pintail', 'class_Northern Shoveler', 'class_Other', 'class_Sandhill Crane']]

# Add a constant term to the independent variables
X = sm.add_constant(X)

# Define the response variable
y = data[['consensus_guesses', 'n-k']]

# Fit logistic regression model
model = sm.GLM(y, X, family=sm.families.Binomial()).fit()

# Display the model summary
print(model.summary())


                      Generalized Linear Model Regression Results                       
Dep. Variable:     ['consensus_guesses', 'n-k']   No. Observations:                 2205
Model:                                      GLM   Df Residuals:                     2192
Model Family:                          Binomial   Df Model:                           12
Link Function:                            Logit   Scale:                          1.0000
Method:                                    IRLS   Log-Likelihood:                -3035.7
Date:                          Fri, 01 Dec 2023   Deviance:                       3064.2
Time:                                  16:09:26   Pearson chi2:                 3.37e+03
No. Iterations:                               9   Pseudo R-squ. (CS):             0.4063
Covariance Type:                      nonrobust                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
------------------

In [16]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Assuming 'data' is your DataFrame with predictor variables
predictors = data[[
 'bbox_percent_area',
 #'same_class_percent',
 'num_neighbors',
 #'agl',
 'gsd',
 #'distance_from_center',
 #'density',
 #'rarity',
# 'bbox_contrast',
 #'bbox_dissimilarity',
 #'bbox_homogeneity',
 #'bbox_energy',
 #'donut_contrast',
 #'donut_dissimilarity',
 #'donut_homogeneity',
 #'donut_energy',
 #'contrast_difference',
 'energy_difference',
 #'homogeneity_difference',
 #'dissimilarity_difference',
 #'num_annotations',
 'class_American Wigeon',
 'class_Canadian Goose',
 'class_Gadwall',
 'class_Mallard',
 'class_Northern Pintail',
 'class_Northern Shoveler',
 #'class_Other',
 'class_Sandhill Crane']]
 #'class_Teal']]

# Calculate VIF
vif_data = pd.DataFrame()
vif_data["Variable"] = predictors.columns
vif_data["VIF"] = [variance_inflation_factor(predictors.values, i) for i in range(predictors.shape[1])]

# Display the VIF DataFrame
print(vif_data)

                   Variable        VIF
0         bbox_percent_area   4.305665
1             num_neighbors   2.433099
2                       gsd  16.209672
3         energy_difference   1.297712
4     class_American Wigeon   1.218786
5      class_Canadian Goose   2.612853
6             class_Gadwall   1.048773
7             class_Mallard  16.435828
8    class_Northern Pintail   3.341252
9   class_Northern Shoveler   1.022130
10     class_Sandhill Crane   3.008661


In [24]:
#Multiple linear regression 

y = data['n-k']
X = [['rarity', 'gsd', 'bbox_percent_area', 'same_class_percent', 'num_neighbors', 'distance_from_center', 'density', 'contrast_difference', 'energy_difference', 'homogeneity_difference', 'dissimilarity_difference',
        'class_American Wigeon', 'class_Canadian Goose', 'class_Gadwall', 'class_Mallard', 'class_Northern Pintail', 'class_Northern Shoveler', 'class_Other', 'class_Sandhill Crane', 'class_Teal']]

# Add a constant term to the independent variables (intercept)
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the regression summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    n-k   R-squared:                       0.296
Model:                            OLS   Adj. R-squared:                  0.290
Method:                 Least Squares   F-statistic:                     51.06
Date:                Tue, 31 Oct 2023   Prob (F-statistic):          7.84e-152
Time:                        11:59:10   Log-Likelihood:                -3482.1
No. Observations:                2205   AIC:                             7002.
Df Residuals:                    2186   BIC:                             7111.
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [25]:
# Calculate Variance Inflation Factors (VIF) to check for multicollinearity
vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)

                    Variable        VIF
0                      const   0.000000
1                     rarity        inf
2                        gsd   7.564173
3          bbox_percent_area   3.968870
4         same_class_percent   5.266126
5              num_neighbors   1.193138
6       distance_from_center   1.636949
7                    density   3.820140
8        contrast_difference  13.907166
9          energy_difference   9.113990
10    homogeneity_difference  14.967774
11  dissimilarity_difference  20.977424
12     class_American Wigeon        inf
13      class_Canadian Goose        inf
14             class_Gadwall        inf
15             class_Mallard        inf
16    class_Northern Pintail        inf
17   class_Northern Shoveler        inf
18               class_Other        inf
19      class_Sandhill Crane        inf
20                class_Teal        inf


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


In [26]:
correlation_matrix = X.corr()
correlated_pairs = [(var1, var2) for var1 in X.columns for var2 in X.columns if var1 != var2 and abs(correlation_matrix.loc[var1, var2]) > 0.7]
correlated_pairs

[('rarity', 'same_class_percent'),
 ('rarity', 'class_Mallard'),
 ('gsd', 'density'),
 ('same_class_percent', 'rarity'),
 ('same_class_percent', 'class_Mallard'),
 ('density', 'gsd'),
 ('contrast_difference', 'dissimilarity_difference'),
 ('energy_difference', 'homogeneity_difference'),
 ('homogeneity_difference', 'energy_difference'),
 ('dissimilarity_difference', 'contrast_difference'),
 ('class_Mallard', 'rarity'),
 ('class_Mallard', 'same_class_percent')]

In [27]:
y = data['n-k']
X = data[['gsd', 'bbox_percent_area', 'num_neighbors', 'distance_from_center', 'contrast_difference', 'energy_difference',
        'class_American Wigeon', 'class_Canadian Goose', 'class_Gadwall', 'class_Mallard', 'class_Northern Pintail', 'class_Northern Shoveler', 'class_Other', 'class_Sandhill Crane', 'class_Teal']]

# Add a constant term to the independent variables (intercept)
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the regression summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    n-k   R-squared:                       0.225
Model:                            OLS   Adj. R-squared:                  0.220
Method:                 Least Squares   F-statistic:                     45.31
Date:                Tue, 31 Oct 2023   Prob (F-statistic):          3.48e-110
Time:                        12:14:51   Log-Likelihood:                -3588.7
No. Observations:                2205   AIC:                             7207.
Df Residuals:                    2190   BIC:                             7293.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [28]:
vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)

                   Variable       VIF
0                     const  0.000000
1                       gsd  3.321961
2         bbox_percent_area  3.686422
3             num_neighbors  1.072452
4      distance_from_center  1.383715
5       contrast_difference  1.343970
6         energy_difference  1.478396
7     class_American Wigeon       inf
8      class_Canadian Goose       inf
9             class_Gadwall       inf
10            class_Mallard       inf
11   class_Northern Pintail       inf
12  class_Northern Shoveler       inf
13              class_Other       inf
14     class_Sandhill Crane       inf
15               class_Teal       inf


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


In [30]:
y = data['n-k']
X = data[['gsd', 'bbox_percent_area', 'num_neighbors', 'distance_from_center', 'contrast_difference', 'energy_difference']]

# Add a constant term to the independent variables (intercept)
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the regression summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    n-k   R-squared:                       0.115
Model:                            OLS   Adj. R-squared:                  0.112
Method:                 Least Squares   F-statistic:                     47.52
Date:                Tue, 31 Oct 2023   Prob (F-statistic):           4.97e-55
Time:                        12:52:45   Log-Likelihood:                -3734.6
No. Observations:                2205   AIC:                             7483.
Df Residuals:                    2198   BIC:                             7523.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    3.0249 

In [29]:
correlation_matrix = X.corr()
correlated_pairs = [(var1, var2) for var1 in X.columns for var2 in X.columns if var1 != var2 and abs(correlation_matrix.loc[var1, var2]) > 0.7]
correlated_pairs

[]

In [33]:
y = data['n-k']
X = data[['gsd', 'bbox_percent_area', 'num_neighbors', 'distance_from_center', 'contrast_difference', 'energy_difference',
          'class_American Wigeon', 'class_Canadian Goose', 'class_Gadwall', 'class_Mallard', 'class_Northern Pintail', 'class_Northern Shoveler', 'class_Other', 'class_Sandhill Crane']]

# Add a constant term to the independent variables (intercept)
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the regression summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    n-k   R-squared:                       0.225
Model:                            OLS   Adj. R-squared:                  0.220
Method:                 Least Squares   F-statistic:                     45.31
Date:                Thu, 02 Nov 2023   Prob (F-statistic):          3.48e-110
Time:                        12:41:00   Log-Likelihood:                -3588.7
No. Observations:                2205   AIC:                             7207.
Df Residuals:                    2190   BIC:                             7293.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [None]:
from scipy import stats
import statsmodels.api as sm

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Get the predicted values from the model
predicted_values = model.predict(X)

# Calculate the residuals
residuals = y - predicted_values

# Perform the Shapiro-Wilk test
shapiro_test_statistic, shapiro_p_value = stats.shapiro(residuals)

# Check the p-value
if shapiro_p_value < 0.05:
    print("The residuals are not normally distributed (p < 0.05). Consider nonparametric methods.")
else:
    print("The residuals appear to be normally distributed (p >= 0.05). OLS may be appropriate.")

In [15]:
#LOGISTIC REGRESSION

import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Define the dependent and independent variables
y = df[['consensus_guesses', 'n-k']]
X = df[['consensus_class_ID', 'gsd', 'bbox_percent_area', 'same_class_percent',
          'num_neighbors', 'distance_from_center', 'energy_difference', 'contrast_difference', 
          'homogeneity_difference', 'dissimilarity_difference']]

# Add a constant term to the independent variables (intercept)
X = sm.add_constant(X)

# Fit a logistic regression model
model = sm.Logit(y, X)
result = model.fit()

# Print the summary
print(result.summary())

# Calculate Variance Inflation Factors (VIF) to check for multicollinearity
vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

VISUALIZATIONS

In [None]:
#Sample visualization of "donuts" + bounding boxes

def visualize_bounding_boxes_with_donuts(image_path, csv_file):
    # Load the image
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB
    
    # Read the CSV file
    csv_data = pd.read_csv(csv_file)
    
    # Find the corresponding image filename
    image_filename = os.path.basename(image_path)
    
    # Filter annotations based on the image filename
    annotations = csv_data[csv_data['filename'] == image_filename]
    
    # Plot the image with bounding boxes and donut regions
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(image)
    ax.axis('off')
    
    # Iterate through annotations and draw bounding boxes and donut regions
    for _, row in annotations.iterrows():
        bbox = ast.literal_eval(row['bbox'])  # Parse bbox values from string to list
        #bbox = row['bbox']
        # Draw bounding box
        rectangle = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], linewidth=1, edgecolor='r', facecolor='none')
        ax.add_patch(rectangle)
        
        # Draw donut region
        donut_left = max(0, bbox[0] - 20)  # Adjust the buffer size as needed
        donut_top = max(0, bbox[1] - 20)
        donut_right = min(image.shape[1], bbox[0] + bbox[2] + 20)
        donut_bottom = min(image.shape[0], bbox[1] + bbox[3] + 20)
        donut_rectangle = Rectangle((donut_left, donut_top), (donut_right - donut_left),
                                   (donut_bottom - donut_top), linewidth=1, edgecolor='g', facecolor='none')
        ax.add_patch(donut_rectangle)
    
    # Show the plot
    plt.show()

# Example usage
image_path = 'E:\\imagefactors\\data\\usfws\\BDA_18A4_20181107_4.JPG'
csv_file = path

visualize_bounding_boxes_with_donuts(image_path, csv_file)

In [None]:
# Create pairplot with regression lines
sns.pairplot(df, x_vars=['gsd', 'bbox_percent_area', 'same_class_percent', 'num_neighbors', 'distance_from_center', 'density'], y_vars=['pielou_index'], kind='reg', height=4)
plt.show()