# CLAY Embeddings Feature detection

The goal of this notebook is to show how to use the CLAY embeddings to detect features. We will use embeddings for Bali, together with labels marking where aquaculture is present. We'll seed the search with a few example locations of aquaculture, and we'll see how the embeddings can be used to detect other locations of aquaculture. We can iterate, refining the positive and negative examples, to improve the detection.

## Imports

In [None]:
import sys

sys.path.append("../")

In [None]:
import warnings
from pathlib import Path

import pyarrow.parquet as pq
from shapely.wkb import loads
from shapely import wkt


from src.datamodule import ClayDataModule, ClayDataset
from src.model_clay import CLAYModule

warnings.filterwarnings("ignore")

In [None]:
# Read the embeddings GeoParquet file into a pandas DataFrame.
data_dir = Path("../datadisk")
embeddings_file = data_dir / "aquaculture_processed.gpq"
embeddings = pq.read_table(embeddings_file).to_pandas()

# Decode the WKB geometry column straight into shapely geometries; an
# intermediate WKT string (WKB -> WKT -> geometry) is an unnecessary round trip.
embeddings['geometry'] = embeddings['geometry'].apply(loads)

# Compute each centroid once and expose its coordinates as x / y columns.
centroids = embeddings['geometry'].apply(lambda geom: geom.centroid)
embeddings['x'] = centroids.apply(lambda point: point.x)
embeddings['y'] = centroids.apply(lambda point: point.y)

# Quick sanity checks: row count, label balance, embedding length, first rows.
print(len(embeddings))
# Print the number of occurrences of each unique value in the 'aquaculture' column
print(embeddings['aquaculture'].value_counts())
print(len(embeddings.iloc[0]['embeddings']))
embeddings.head()



In [None]:
import geopandas as gpd
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import random


# Work on a copy of the loaded embeddings table.
gdf = embeddings.copy()

# --- Experiment configuration -------------------------------------------
n_initial_pos = 2  # positives in the initial training set
n_initial_neg = 0  # negatives in the initial training set
n_add_pos_per_iter = 3  # positives moved from test to train each iteration
n_add_neg_per_iter = 3  # negatives moved from test to train each iteration
max_iterations = 100
max_drop_percentage = 0.1  # upper bound on the share of dimensions dropped per iteration
early_stopping_rounds = 10  # iterations without improvement before stopping

# --- Partition rows by label --------------------------------------------
aquaculture_flag = gdf['aquaculture']
pos_samples = gdf[aquaculture_flag == 1]
neg_samples = gdf[aquaculture_flag == 0]

# --- Draw the initial training rows; the remainder is the test pool ------
pos_train_indices = pos_samples.sample(n=n_initial_pos, random_state=42).index
neg_train_indices = neg_samples.sample(n=n_initial_neg, random_state=42).index
pos_test_indices = pos_samples.drop(pos_train_indices).index
neg_test_indices = neg_samples.drop(neg_train_indices).index

# Flat index arrays used to select rows for fitting and scoring
train_indices = np.concatenate((pos_train_indices, neg_train_indices))
test_indices = np.concatenate((pos_test_indices, neg_test_indices))

# --- Per-iteration history (filled by the loop, used for plotting) -------
accuracies, dimensions, iterations = [], [], []

# --- Classifier and loop state -------------------------------------------
rf = RandomForestClassifier(n_estimators=100, random_state=42)

iteration = 0
best_accuracy = 0
no_improvement_count = 0
# Loop until the iteration budget is used up, accuracy stops improving, or
# the test pool can no longer supply a full batch of new training samples.
while iteration < max_iterations:
    # First fit: only used to obtain feature importances on the current dimensions
    train_samples = gdf.loc[train_indices]
    X_train = np.array(train_samples['embeddings'].tolist())
    y_train = train_samples['aquaculture'].values
    rf.fit(X_train, y_train)

    # Candidate dimensions: importance no greater than the minimum importance
    # scaled by (1 + max_drop_percentage)
    importances = rf.feature_importances_
    drop_threshold = np.min(importances) * (1 + max_drop_percentage)
    drop_mask = importances <= drop_threshold

    # Cap on how many dimensions may be removed in this iteration
    num_features_to_drop = int(max_drop_percentage * len(importances))

    least_important_indices = np.where(drop_mask)[0]
    if len(least_important_indices) > num_features_to_drop:
        # Too many candidates: drop a random subset of the allowed size
        print(f"More than {max_drop_percentage * 100}% of features have importance below {drop_threshold}.")
        least_important_dims = random.sample(list(least_important_indices), num_features_to_drop)
    else:
        # Few enough candidates: drop all of them
        least_important_dims = least_important_indices

    print(f"Dropping {len(least_important_dims)} dimensions randomly selected among those with importances below {drop_threshold}")

    # Remove the selected dimensions from every embedding in the dataset
    gdf['embeddings'] = gdf['embeddings'].apply(lambda x: np.delete(x, least_important_dims))

    # Second fit: retrain on the reduced embeddings
    train_samples = gdf.loc[train_indices]
    X_train = np.array(train_samples['embeddings'].tolist())
    y_train = train_samples['aquaculture'].values
    rf.fit(X_train, y_train)

    # Score the retrained classifier on the current test pool
    test_samples = gdf.loc[test_indices]
    X_test = np.array(test_samples['embeddings'].tolist())
    y_test = test_samples['aquaculture'].values
    accuracy = accuracy_score(y_test, rf.predict(X_test))

    # Record this iteration's outcome
    n_dims = len(gdf.iloc[0]['embeddings'])
    accuracies.append(accuracy)
    dimensions.append(n_dims)
    iterations.append(iteration)

    # Report this iteration's outcome
    for line in (
        f"Iteration: {iteration}",
        f"Accuracy: {accuracy}",
        f"Number of dimensions: {n_dims}",
        f"Number of positive training samples: {len(pos_train_indices)}",
        f"Number of negative training samples: {len(neg_train_indices)}",
        f"Number of positive test samples: {len(pos_test_indices)}",
        f"Number of negative test samples: {len(neg_test_indices)}",
    ):
        print(line)
    print()

    # Early stopping bookkeeping
    improved = accuracy > best_accuracy
    if improved:
        best_accuracy = accuracy
        no_improvement_count = 0
    else:
        no_improvement_count += 1
    if not improved and no_improvement_count >= early_stopping_rounds:
        print(f"No improvement in accuracy for {early_stopping_rounds} iterations. Early stopping.")
        break

    # If the test pool cannot supply a full batch, move everything and stop
    can_fill_batch = (
        len(pos_test_indices) >= n_add_pos_per_iter
        and len(neg_test_indices) >= n_add_neg_per_iter
    )
    if not can_fill_batch:
        pos_train_indices = np.concatenate((pos_train_indices, pos_test_indices))
        neg_train_indices = np.concatenate((neg_train_indices, neg_test_indices))
        pos_test_indices = []
        neg_test_indices = []
        print("All test samples moved to the training set. Stopping iterations.")
        break

    # Otherwise move the next batch from the head of each test pool into training
    pos_train_indices = np.concatenate((pos_train_indices, pos_test_indices[:n_add_pos_per_iter]))
    neg_train_indices = np.concatenate((neg_train_indices, neg_test_indices[:n_add_neg_per_iter]))
    pos_test_indices = pos_test_indices[n_add_pos_per_iter:]
    neg_test_indices = neg_test_indices[n_add_neg_per_iter:]

    # Rebuild the flat index arrays for the next iteration
    train_indices = np.concatenate((pos_train_indices, neg_train_indices))
    test_indices = np.concatenate((pos_test_indices, neg_test_indices))

    iteration += 1

# Tick labels pair each iteration number with the embedding length at that step
xtick_labels = [f"{step} [{width}]" for step, width in zip(iterations, dimensions)]

# Accuracy as a function of the iteration number
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(iterations, accuracies, marker='o')
ax.set_xlabel('Iteration [Dimensions]')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs. Iteration')
ax.grid(True)
ax.set_xticks(iterations)
ax.set_xticklabels(xtick_labels, rotation=45)
fig.tight_layout()
plt.show()

# Summary of the last recorded iteration
final_dimensions = len(gdf.iloc[0]['embeddings'])
print(f"Final number of dimensions: {final_dimensions}")
print(f"Final accuracy: {accuracies[-1]}")

In [None]:
import geopandas as gpd
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import numpy as np
import random


import folium

def plot_satellite_image(bbox):
    """
    Build a satellite-imagery map with a bounding box drawn on it.

    Parameters:
    - bbox (tuple): (min_lat, min_lon, max_lat, max_lon)

    Returns:
    - folium.Map centred on the box, with the box overlaid as a rectangle.
    """
    south, west, north, east = bbox[0], bbox[1], bbox[2], bbox[3]

    # The map view is centred on the midpoint of the box
    midpoint = [(south + north) / 2, (west + east) / 2]
    satellite_map = folium.Map(
        location=midpoint,
        zoom_start=13,
        tiles='Esri.WorldImagery',
        attr='Custom Attribution',
    )

    # Translucent blue rectangle marking the box itself
    outline = folium.Rectangle(
        bounds=[(south, west), (north, east)],
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.2,
    )
    outline.add_to(satellite_map)

    return satellite_map



def calculate_precision(all_embeddings,
                        positive_labels, negative_labels,
                        drop_percentage):
    """
    Score how well a positive-class centroid retrieves aquaculture rows.

    Procedure:
    1. Randomly draw `positive_labels` positive and `negative_labels`
       negative rows as the training set.
    2. Fit a random forest on them and, when `drop_percentage` > 0, remove
       that fraction of embedding dimensions with the lowest importance.
    3. Rank all rows by cosine similarity to the mean embedding of the
       positive training rows. The 25 most similar rows that are not
       positive training rows are treated as predicted positives and scored
       against their true labels.

    Parameters:
    - all_embeddings (DataFrame): must have an 'aquaculture' column
      (1 = positive, 0 = negative) and an 'embeddings' column of
      equal-length vectors. The caller's frame is not modified.
    - positive_labels (int): number of positive training rows to sample.
    - negative_labels (int): number of negative training rows to sample.
    - drop_percentage (float): fraction of dimensions to drop (0 disables).

    Returns:
    - dict with 'precision', 'recall', 'accuracy', 'f1' (strings formatted
      to two decimals), 'num_positives', 'num_negatives', 'num_test' and
      'num_dimensions'.
    """
    top_k = 25

    # reset_index returns a new frame, so labels equal positions and the
    # caller's DataFrame stays untouched (repeated calls do not compound).
    all_embeddings = all_embeddings.reset_index(drop=True)
    pos_samples = all_embeddings[all_embeddings['aquaculture'] == 1]
    neg_samples = all_embeddings[all_embeddings['aquaculture'] == 0]

    # Sample the training rows; everything else counts as the test set
    pos_train_indices = pos_samples.sample(n=positive_labels).index
    neg_train_indices = neg_samples.sample(n=negative_labels).index
    pos_test_indices = pos_samples.drop(pos_train_indices).index
    neg_test_indices = neg_samples.drop(neg_train_indices).index

    train_indices = np.concatenate((pos_train_indices, neg_train_indices)).astype(int)
    np.random.shuffle(train_indices)

    # Dense (rows x dimensions) feature matrix and label vector
    features = np.array(all_embeddings['embeddings'].tolist())
    labels = all_embeddings['aquaculture'].values

    rf = RandomForestClassifier()
    rf.fit(features[train_indices], labels[train_indices])

    if drop_percentage > 0:
        importances = rf.feature_importances_
        num_features_to_drop = int(drop_percentage * len(importances))
        # Ascending argsort: the first entries are the least important dims.
        # A stable sort keeps ties in their original order.
        least_important_dims = np.argsort(importances, kind='stable')[:num_features_to_drop]
        features = np.delete(features, least_important_dims, axis=1)

    # Centroid of the positive training rows in the (reduced) embedding space
    pos_train_positions = np.asarray(pos_train_indices, dtype=int)
    centroid = features[pos_train_positions].mean(axis=0)

    # Rank every row from most to least similar to the centroid
    # (argsort is ascending, hence the negation)
    cosine_similarities = cosine_similarity([centroid], features).flatten()
    ranked_indices = np.argsort(-cosine_similarities, kind='stable')

    is_pos_train = np.zeros(len(features), dtype=bool)
    is_pos_train[pos_train_positions] = True

    # How many positive training rows appear among the overall top 25
    confirmed_count = int(is_pos_train[ranked_indices[:top_k]].sum())
    print(f"Confirmed indices in top 25 closest: {confirmed_count}")

    # Evaluate the closest rows that were not used as positive training rows
    closest_indices = ranked_indices[~is_pos_train[ranked_indices]][:top_k]
    closest_labels = labels[closest_indices]
    predicted_labels = np.ones_like(closest_labels)  # every retrieved row is a predicted positive

    # scikit-learn metric signature is (y_true, y_pred)
    closest_precision = precision_score(closest_labels, predicted_labels, zero_division=0)
    closest_recall = recall_score(closest_labels, predicted_labels, zero_division=0)
    closest_accuracy = accuracy_score(closest_labels, predicted_labels)
    closest_f1 = 2 * (closest_precision * closest_recall) / (closest_precision + closest_recall) if (closest_precision + closest_recall) > 0 else 0

    print(f"Closest 25 Precision: {closest_precision}")
    print(f"Closest 25 Recall: {closest_recall}")
    print(f"Closest 25 Accuracy: {closest_accuracy}")
    print(f"Closest 25 F1 Score: {closest_f1}")

    # Summary of the metrics and the sizes used for this run
    results = {
        'precision': format(closest_precision, '.2f'),
        'recall': format(closest_recall, '.2f'),
        'accuracy': format(closest_accuracy, '.2f'),
        'f1': format(closest_f1, '.2f'),
        'num_positives': len(pos_train_indices),
        'num_negatives': len(neg_train_indices),
        'num_test': len(pos_test_indices) + len(neg_test_indices),
        'num_dimensions': features.shape[1],
    }
    return results



# One evaluation run: 36 training labels per class, drop_percentage of 0.5
positive_labels, negative_labels = 36, 36
drop_percentage = 0.5
gdf = embeddings.copy().reset_index(drop=True)

calculate_precision(
    gdf,
    positive_labels=positive_labels,
    negative_labels=negative_labels,
    drop_percentage=drop_percentage,
)

In [None]:

n_initial_pos = 5  # Initial number of positive training samples
n_initial_neg = 2  # Initial number of negative training samples
n_add = 10  # Step between grid points: positives grow by this, negatives by twice this
drop_percentage = 0.1  # Step between successive drop fractions
max_iterations = 500  # NOTE: not used by the grid below
early_stopping_rounds = 50  # NOTE: not used by the grid below

# Keep the step in its own name: reusing `drop_percentage` as the inner loop
# variable would overwrite the step after the first pass.
drop_step = drop_percentage

# Bookkeeping for the exhaustive grid traversal (no early stopping is applied)
iteration = 0
best_precision = 0
precisions = []
dimensions = []
num_labels = []

# Traverse the grid: number of labelled samples x fraction of dimensions dropped
for add_labels in np.arange(1, 200, n_add):
    positive_labels = n_initial_pos + add_labels
    negative_labels = n_initial_neg + add_labels * 2

    for drop_fraction in np.arange(0, .7, drop_step):
        iteration += 1

        # Pass a fresh copy on every call so each grid point starts from the
        # full-width embeddings, even if calculate_precision modifies its input.
        results = calculate_precision(embeddings.copy(),
                                      positive_labels, negative_labels,
                                      drop_fraction)

        print(f"Iteration: {iteration} ", end="")
        for key, value in results.items():
            print(f"{key}: {value} ", end="")
        print("")

        # 'precision' comes back as a formatted string; scatter needs numbers
        # for the colormap.
        precisions.append(float(results['precision']))
        dimensions.append(results['num_dimensions'])
        num_labels.append(positive_labels + negative_labels)


# Scatter of grid points, coloured by the precision reached at each one
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(dimensions, num_labels, c=precisions, cmap='viridis', vmin=0, vmax=1)
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_label('Precision')
ax.set_xlabel('num_dimensions')
ax.set_ylabel('num_labels')
ax.set_title('Precision by Iterations')
ax.grid(True)


plt.tight_layout()
plt.show()

In [None]:
import random

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity


def plot_satellite_image(bbox):
    """
    Build a satellite-imagery map with a bounding box drawn on it.

    Parameters:
    - bbox (tuple): (min_lat, min_lon, max_lat, max_lon)

    Returns:
    - folium.Map centred on the box, with the box overlaid as a rectangle.
    """
    south, west, north, east = bbox[0], bbox[1], bbox[2], bbox[3]

    # The map view is centred on the midpoint of the box
    midpoint = [(south + north) / 2, (west + east) / 2]
    satellite_map = folium.Map(
        location=midpoint,
        zoom_start=13,
        tiles='Esri.WorldImagery',
        attr='Custom Attribution',
    )

    # Translucent blue rectangle marking the box itself
    outline = folium.Rectangle(
        bounds=[(south, west), (north, east)],
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.2,
    )
    outline.add_to(satellite_map)

    return satellite_map

def calculate_precision(all_embeddings, positive_labels, negative_labels, drop_percentage):
    """
    Score centroid-based retrieval of aquaculture rows and map the result.

    Procedure:
    1. Randomly draw `positive_labels` positive and `negative_labels`
       negative rows as the training set.
    2. Fit a random forest on them and, when `drop_percentage` > 0, remove
       that fraction of embedding dimensions with the lowest importance.
    3. Rank all rows by cosine similarity to the mean embedding of the
       positive training rows. The 25 most similar rows that are not
       positive training rows are treated as predicted positives and scored
       against their true labels.
    4. Display a folium map with the training rows and the retrieved rows
       as circle markers placed at each row's 'y' / 'x' values.

    Parameters:
    - all_embeddings (DataFrame): must have 'aquaculture' (1 = positive,
      0 = negative), 'embeddings' (equal-length vectors), 'x' and 'y'
      columns. The caller's frame is not modified.
    - positive_labels (int): number of positive training rows to sample.
    - negative_labels (int): number of negative training rows to sample.
    - drop_percentage (float): fraction of dimensions to drop (0 disables).

    Returns:
    - dict with 'precision', 'recall', 'accuracy', 'f1' (strings formatted
      to two decimals), 'num_positives', 'num_negatives', 'num_test' and
      'num_dimensions'.
    """
    top_k = 25

    # reset_index returns a new frame, so labels equal positions and the
    # caller's DataFrame stays untouched (repeated calls do not compound).
    all_embeddings = all_embeddings.reset_index(drop=True)
    pos_samples = all_embeddings[all_embeddings['aquaculture'] == 1]
    neg_samples = all_embeddings[all_embeddings['aquaculture'] == 0]

    # Sample the training rows; everything else counts as the test set
    pos_train_indices = pos_samples.sample(n=positive_labels).index
    neg_train_indices = neg_samples.sample(n=negative_labels).index
    pos_test_indices = pos_samples.drop(pos_train_indices).index
    neg_test_indices = neg_samples.drop(neg_train_indices).index

    train_indices = np.concatenate((pos_train_indices, neg_train_indices)).astype(int)
    np.random.shuffle(train_indices)

    # Dense (rows x dimensions) feature matrix and label vector
    features = np.array(all_embeddings['embeddings'].tolist())
    labels = all_embeddings['aquaculture'].values

    rf = RandomForestClassifier()
    rf.fit(features[train_indices], labels[train_indices])

    if drop_percentage > 0:
        importances = rf.feature_importances_
        num_features_to_drop = int(drop_percentage * len(importances))
        # Ascending argsort: the first entries are the least important dims.
        # A stable sort keeps ties in their original order.
        least_important_dims = np.argsort(importances, kind='stable')[:num_features_to_drop]
        features = np.delete(features, least_important_dims, axis=1)

    # Centroid of the positive training rows in the (reduced) embedding space
    pos_train_positions = np.asarray(pos_train_indices, dtype=int)
    centroid = features[pos_train_positions].mean(axis=0)

    # Rank every row from most to least similar to the centroid
    # (argsort is ascending, hence the negation)
    cosine_similarities = cosine_similarity([centroid], features).flatten()
    ranked_indices = np.argsort(-cosine_similarities, kind='stable')

    # Keep the closest rows that were not used as positive training rows
    is_pos_train = np.zeros(len(features), dtype=bool)
    is_pos_train[pos_train_positions] = True
    closest_indices = ranked_indices[~is_pos_train[ranked_indices]][:top_k]

    # Every retrieved row counts as a predicted positive
    closest_true_labels = labels[closest_indices]
    closest_pred_labels = np.ones_like(closest_true_labels)

    closest_precision = precision_score(closest_true_labels, closest_pred_labels, zero_division=0)
    closest_recall = recall_score(closest_true_labels, closest_pred_labels, zero_division=0)
    closest_accuracy = accuracy_score(closest_true_labels, closest_pred_labels)
    closest_f1 = f1_score(closest_true_labels, closest_pred_labels, zero_division=0)

    # Map centred on the mean 'y' / 'x' of all rows
    m = folium.Map(location=[all_embeddings['y'].mean(),
                             all_embeddings['x'].mean()],
                   zoom_start=9)

    def add_marker(row_idx, color, popup):
        """Add one filled circle marker for the row at position `row_idx`."""
        row = all_embeddings.iloc[row_idx]
        folium.CircleMarker(location=[row['y'], row['x']],
                            radius=5, color=color, fill=True, fill_color=color,
                            fill_opacity=0.7, popup=popup).add_to(m)

    # Training rows: green = positive, red = negative
    for idx in pos_train_indices:
        add_marker(idx, 'green', f"Train Positive: {idx}")
    for idx in neg_train_indices:
        add_marker(idx, 'red', f"Train Negative: {idx}")

    # Retrieved rows: blue = truly positive, orange = false positive
    for idx, true_label in zip(closest_indices, closest_true_labels):
        if true_label == 1:
            add_marker(idx, 'blue', f"Closest 25 True Positive: {idx}")
        else:
            add_marker(idx, 'orange', f"Closest 25 False Positive: {idx}")

    # Display the map
    display(m)

    # Summary of the metrics and the sizes used for this run
    results = {
        'precision': format(closest_precision, '.2f'),
        'recall': format(closest_recall, '.2f'),
        'accuracy': format(closest_accuracy, '.2f'),
        'f1': format(closest_f1, '.2f'),
        'num_positives': len(pos_train_indices),
        'num_negatives': len(neg_train_indices),
        'num_test': len(pos_test_indices) + len(neg_test_indices),
        'num_dimensions': features.shape[1],
    }
    return results

# One evaluation run: 36 training labels per class, drop_percentage of 0.5
positive_labels, negative_labels = 36, 36
drop_percentage = 0.5
gdf = embeddings.copy().reset_index(drop=True)
print(gdf.columns)

results = calculate_precision(
    gdf,
    positive_labels=positive_labels,
    negative_labels=negative_labels,
    drop_percentage=drop_percentage,
)
print(results)

In [None]:
import random

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity


def plot_satellite_image(bbox):
    """
    Build a satellite-imagery map with a bounding box drawn on it.

    Parameters:
    - bbox (tuple): (min_lat, min_lon, max_lat, max_lon)

    Returns:
    - folium.Map centred on the box, with the box overlaid as a rectangle.
    """
    south, west, north, east = bbox[0], bbox[1], bbox[2], bbox[3]

    # The map view is centred on the midpoint of the box
    midpoint = [(south + north) / 2, (west + east) / 2]
    satellite_map = folium.Map(
        location=midpoint,
        zoom_start=13,
        tiles='Esri.WorldImagery',
        attr='Custom Attribution',
    )

    # Translucent blue rectangle marking the box itself
    outline = folium.Rectangle(
        bounds=[(south, west), (north, east)],
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.2,
    )
    outline.add_to(satellite_map)

    return satellite_map

def calculate_precision(all_embeddings, positive_labels, negative_labels, drop_percentage):
    """
    Score centroid-based retrieval of aquaculture rows and map the geometries.

    Procedure:
    1. Randomly draw `positive_labels` positive and `negative_labels`
       negative rows as the training set.
    2. Fit a random forest on them and, when `drop_percentage` > 0, remove
       that fraction of embedding dimensions with the lowest importance.
    3. Rank all rows by cosine similarity to the mean embedding of the
       positive training rows. The 25 most similar rows that are not
       positive training rows are treated as predicted positives and scored
       against their true labels.
    4. Display a folium map with each training row's and retrieved row's
       'geometry' added as a GeoJson layer.

    Parameters:
    - all_embeddings (DataFrame): must have 'aquaculture' (1 = positive,
      0 = negative), 'embeddings' (equal-length vectors), 'geometry', 'x'
      and 'y' columns. The caller's frame is not modified.
    - positive_labels (int): number of positive training rows to sample.
    - negative_labels (int): number of negative training rows to sample.
    - drop_percentage (float): fraction of dimensions to drop (0 disables).

    Returns:
    - dict with 'precision', 'recall', 'accuracy', 'f1' (strings formatted
      to two decimals), 'num_positives', 'num_negatives', 'num_test' and
      'num_dimensions'.
    """
    top_k = 25

    # reset_index returns a new frame, so labels equal positions and the
    # caller's DataFrame stays untouched (repeated calls do not compound).
    all_embeddings = all_embeddings.reset_index(drop=True)
    pos_samples = all_embeddings[all_embeddings['aquaculture'] == 1]
    neg_samples = all_embeddings[all_embeddings['aquaculture'] == 0]

    # Sample the training rows; everything else counts as the test set
    pos_train_indices = pos_samples.sample(n=positive_labels).index
    neg_train_indices = neg_samples.sample(n=negative_labels).index
    pos_test_indices = pos_samples.drop(pos_train_indices).index
    neg_test_indices = neg_samples.drop(neg_train_indices).index

    train_indices = np.concatenate((pos_train_indices, neg_train_indices)).astype(int)
    np.random.shuffle(train_indices)

    # Dense (rows x dimensions) feature matrix and label vector
    features = np.array(all_embeddings['embeddings'].tolist())
    labels = all_embeddings['aquaculture'].values

    rf = RandomForestClassifier()
    rf.fit(features[train_indices], labels[train_indices])

    if drop_percentage > 0:
        importances = rf.feature_importances_
        num_features_to_drop = int(drop_percentage * len(importances))
        # Ascending argsort: the first entries are the least important dims.
        # A stable sort keeps ties in their original order.
        least_important_dims = np.argsort(importances, kind='stable')[:num_features_to_drop]
        features = np.delete(features, least_important_dims, axis=1)

    # Centroid of the positive training rows in the (reduced) embedding space
    pos_train_positions = np.asarray(pos_train_indices, dtype=int)
    centroid = features[pos_train_positions].mean(axis=0)

    # Rank every row from most to least similar to the centroid
    # (argsort is ascending, hence the negation)
    cosine_similarities = cosine_similarity([centroid], features).flatten()
    ranked_indices = np.argsort(-cosine_similarities, kind='stable')

    # Keep the closest rows that were not used as positive training rows
    is_pos_train = np.zeros(len(features), dtype=bool)
    is_pos_train[pos_train_positions] = True
    closest_indices = ranked_indices[~is_pos_train[ranked_indices]][:top_k]

    # Every retrieved row counts as a predicted positive
    closest_true_labels = labels[closest_indices]
    closest_pred_labels = np.ones_like(closest_true_labels)

    closest_precision = precision_score(closest_true_labels, closest_pred_labels, zero_division=0)
    closest_recall = recall_score(closest_true_labels, closest_pred_labels, zero_division=0)
    closest_accuracy = accuracy_score(closest_true_labels, closest_pred_labels)
    closest_f1 = f1_score(closest_true_labels, closest_pred_labels, zero_division=0)

    # Map centred on the mean 'y' / 'x' of all rows
    m = folium.Map(location=[all_embeddings['y'].mean(),
                             all_embeddings['x'].mean()],
                   zoom_start=9)

    # Geometries of the positive training rows (default styling)
    for idx in pos_train_indices:
        folium.GeoJson(all_embeddings.iloc[idx]['geometry']).add_to(m)

    # Geometries of the negative training rows (default styling)
    for idx in neg_train_indices:
        folium.GeoJson(all_embeddings.iloc[idx]['geometry']).add_to(m)

    # Geometries of the retrieved rows: blue = truly positive, orange = false positive
    for idx, true_label in zip(closest_indices, closest_true_labels):
        geometry = all_embeddings.iloc[idx]['geometry']
        if true_label == 1:
            color = 'blue'
            popup = f"Closest 25 True Positive: {idx}"
        else:
            color = 'orange'
            popup = f"Closest 25 False Positive: {idx}"
        # Bind `color` as a default argument: a plain closure would look the
        # name up when the style function is finally called and see the value
        # from the last loop iteration.
        layer = folium.GeoJson(
            geometry,
            style_function=lambda x, color=color: {'fillColor': color},
        )
        layer.add_child(folium.Popup(popup))
        layer.add_to(m)

    # Display the map
    display(m)

    # Summary of the metrics and the sizes used for this run
    results = {
        'precision': format(closest_precision, '.2f'),
        'recall': format(closest_recall, '.2f'),
        'accuracy': format(closest_accuracy, '.2f'),
        'f1': format(closest_f1, '.2f'),
        'num_positives': len(pos_train_indices),
        'num_negatives': len(neg_train_indices),
        'num_test': len(pos_test_indices) + len(neg_test_indices),
        'num_dimensions': features.shape[1],
    }
    return results

# One evaluation run: 36 training labels per class, drop_percentage of 0.5
positive_labels, negative_labels = 36, 36
drop_percentage = 0.5
gdf = embeddings.copy().reset_index(drop=True)
print(gdf.columns)

results = calculate_precision(
    gdf,
    positive_labels=positive_labels,
    negative_labels=negative_labels,
    drop_percentage=drop_percentage,
)
print(results)

In [None]:
import random

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity


def plot_satellite_image(bbox):
    """
    Build a satellite-imagery map with a bounding box drawn on it.

    Parameters:
    - bbox (tuple): (min_lat, min_lon, max_lat, max_lon)

    Returns:
    - folium.Map centred on the box, with the box overlaid as a rectangle.
    """
    south, west, north, east = bbox[0], bbox[1], bbox[2], bbox[3]

    # The map view is centred on the midpoint of the box
    midpoint = [(south + north) / 2, (west + east) / 2]
    satellite_map = folium.Map(
        location=midpoint,
        zoom_start=13,
        tiles='Esri.WorldImagery',
        attr='Custom Attribution',
    )

    # Translucent blue rectangle marking the box itself
    outline = folium.Rectangle(
        bounds=[(south, west), (north, east)],
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.2,
    )
    outline.add_to(satellite_map)

    return satellite_map

def calculate_precision(all_embeddings, positive_labels, negative_labels,
                        drop_percentage, closest_similar):
    """
    Fit a random forest on a small labelled sample and score it on the rows
    whose embeddings are most similar to the positive-class centroid.

    Parameters:
    - all_embeddings (DataFrame): needs the columns 'embeddings',
      'aquaculture', 'geometry', 'x' and 'y'. It is modified in place: the
      index is reset and, when drop_percentage > 0, the 'embeddings' column
      is overwritten with the pruned vectors.
    - positive_labels (int): number of rows with aquaculture == 1 sampled
      for training (capped at the number available).
    - negative_labels (int): number of rows with aquaculture == 0 sampled
      for training (capped at the number available).
    - drop_percentage (float): fraction of the least important embedding
      dimensions to discard before refitting; 0 disables pruning.
    - closest_similar (int): how many of the most similar rows to evaluate.

    Returns:
    - dict with 'precision', 'recall', 'accuracy' and 'f1' formatted as
      two-decimal strings, plus the integer counts 'num_positives',
      'num_negatives', 'num_test' and 'num_dimensions'.
    """
    all_embeddings.reset_index(drop=True, inplace=True)
    pos_samples = all_embeddings[all_embeddings['aquaculture'] == 1]
    neg_samples = all_embeddings[all_embeddings['aquaculture'] == 0]

    # Sample the training rows of each class; everything else is the test set
    pos_train_indices = pos_samples.sample(n=min(positive_labels, len(pos_samples))).index
    neg_train_indices = neg_samples.sample(n=min(negative_labels, len(neg_samples))).index
    pos_test_indices = pos_samples.drop(pos_train_indices).index
    neg_test_indices = neg_samples.drop(neg_train_indices).index

    train_indices = np.concatenate((pos_train_indices, neg_train_indices))
    test_indices = np.concatenate((pos_test_indices, neg_test_indices))

    rf = RandomForestClassifier()

    train_samples = all_embeddings.iloc[train_indices]
    X_train = np.array(train_samples['embeddings'].tolist())
    y_train = train_samples['aquaculture'].values
    rf.fit(X_train, y_train)

    if drop_percentage > 0:
        importances = rf.feature_importances_

        # Keep only the most important dimensions, then refit on them
        num_features_to_keep = int(len(importances) * (1 - drop_percentage))
        top_important_indices = np.argsort(importances)[::-1][:num_features_to_keep]
        all_embeddings['embeddings'] = all_embeddings['embeddings'].apply(lambda x: x[top_important_indices])

        train_samples = all_embeddings.iloc[train_indices]
        X_train = np.array(train_samples['embeddings'].tolist())
        y_train = train_samples['aquaculture'].values
        rf.fit(X_train, y_train)

    # Centroid of the positive training embeddings
    centroid = np.mean(np.array(all_embeddings.iloc[pos_train_indices]['embeddings'].tolist()), axis=0)

    # Cosine similarity between the centroid and every embedding
    cosine_similarities = cosine_similarity([centroid], np.array(all_embeddings['embeddings'].tolist())).flatten()

    # argsort is ascending, so reverse it to rank from most to least similar;
    # then skip the positive training rows and keep the top closest_similar.
    ranked_indices = np.argsort(cosine_similarities)[::-1]
    pos_train_set = set(pos_train_indices)
    closest_indices = [i for i in ranked_indices if i not in pos_train_set][:closest_similar]

    # True and predicted labels for the closest rows
    closest_true_labels = all_embeddings.iloc[closest_indices]['aquaculture'].values
    closest_pred_labels = rf.predict(np.array(all_embeddings.iloc[closest_indices]['embeddings'].tolist()))

    # Metrics over the closest_similar closest embeddings
    closest_precision = precision_score(closest_true_labels, closest_pred_labels)
    closest_recall = recall_score(closest_true_labels, closest_pred_labels)
    closest_accuracy = accuracy_score(closest_true_labels, closest_pred_labels)
    closest_f1 = f1_score(closest_true_labels, closest_pred_labels)

    # Create a folium map to visualize the locations
    m = folium.Map(location=[all_embeddings['y'].mean(),
                             all_embeddings['x'].mean()],
                   zoom_start=9, tiles="Esri.WorldImagery")

    # Training rows: green = positive, red = negative
    for idx in pos_train_indices:
        geometry = all_embeddings.iloc[idx]['geometry']
        folium.GeoJson(geometry, style_function=lambda x: {'color': 'green'}).add_to(m)

    for idx in neg_train_indices:
        geometry = all_embeddings.iloc[idx]['geometry']
        folium.GeoJson(geometry, style_function=lambda x: {'color': 'red'}).add_to(m)

    # Closest rows, coloured by ground truth: blue = aquaculture, orange = not
    for idx, true_label in zip(closest_indices, closest_true_labels):
        geometry = all_embeddings.iloc[idx]['geometry']
        color = 'blue' if true_label == 1 else 'orange'
        # Bind color as a default argument: the style function may be invoked
        # after the loop has moved on, and a plain closure would then see only
        # the last colour assigned.
        folium.GeoJson(geometry, style_function=lambda x, color=color: {'color': color}).add_to(m)

    # Display the map
    display(m)

    # Metrics are reported as two-decimal strings alongside the set sizes
    results = {
        'precision': format(closest_precision, '.2f'),
        'recall': format(closest_recall, '.2f'),
        'accuracy': format(closest_accuracy, '.2f'),
        'f1': format(closest_f1, '.2f'),
        'num_positives': len(pos_train_indices),
        'num_negatives': len(neg_train_indices),
        'num_test': len(test_indices),
        'num_dimensions': len(all_embeddings.iloc[0]['embeddings']),
    }
    return results

# Run configuration: labelled rows per class, fraction of embedding
# dimensions to prune, and how many nearest rows to evaluate.
positive_labels, negative_labels = 10, 20
drop_percentage = 0.0
closest_similar = 25

# Work on a re-indexed copy so the loaded embeddings table stays untouched.
gdf = embeddings.copy()
gdf = gdf.reset_index(drop=True)
print(gdf.columns)

results = calculate_precision(
    gdf,
    positive_labels,
    negative_labels,
    drop_percentage,
    closest_similar,
)
print(results)



In [None]:
import random

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity


def plot_satellite_image(bbox):
    """
    Render a bounding box on top of satellite imagery.

    Parameters:
    - bbox (tuple): A tuple of (min_lat, min_lon, max_lat, max_lon)

    Returns:
    - A folium map centered on the bbox midpoint with the bbox overlaid as a
      translucent blue rectangle.
    """
    south_west = (bbox[0], bbox[1])
    north_east = (bbox[2], bbox[3])

    # Midpoint of the two corners is used as the map center
    center = [
        (south_west[0] + north_east[0]) / 2,
        (south_west[1] + north_east[1]) / 2,
    ]

    imagery_map = folium.Map(
        location=center,
        zoom_start=13,
        tiles='Esri.WorldImagery',
        attr='Custom Attribution',
    )

    # Overlay the requested area
    folium.Rectangle(
        bounds=[south_west, north_east],
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.2,
    ).add_to(imagery_map)

    return imagery_map

def find_most_common_closest(all_embeddings, pos_train_indices, closest_similar,
                             n_most_common=25):
    """
    Find the rows that most often appear among the nearest neighbours of the
    given training rows, using cosine similarity on the 'embeddings' column.

    Parameters:
    - all_embeddings (DataFrame): table with an 'embeddings' column.
    - pos_train_indices: positional indices of the training rows.
    - closest_similar (int): number of nearest neighbours taken per training row.
    - n_most_common (int): maximum number of indices returned (default 25).

    Returns:
    - Array of positional indices ordered by ascending neighbour count (most
      common last). Rows that were never a neighbour are not returned, so the
      result can be shorter than n_most_common.
    """
    # Nothing to rank; also avoids the `[-0:]` slices below selecting everything
    if len(pos_train_indices) == 0 or closest_similar <= 0 or n_most_common <= 0:
        return np.array([], dtype=int)

    pos_train_embeddings = np.array(all_embeddings.iloc[pos_train_indices]['embeddings'].tolist())

    # Cosine similarity between each training embedding and all embeddings
    cosine_similarities = cosine_similarity(pos_train_embeddings, np.array(all_embeddings['embeddings'].tolist()))

    # argsort is ascending, so the last closest_similar columns are the nearest rows
    closest_indices_per_pos = np.argsort(cosine_similarities, axis=1)[:, -closest_similar:]

    # Count how often each row shows up across all neighbour lists
    closest_indices_counts = np.bincount(closest_indices_per_pos.flatten())

    # Take the most frequent rows, then drop any that only made the cut
    # because fewer than n_most_common distinct neighbours exist.
    most_common_closest_indices = np.argsort(closest_indices_counts)[-n_most_common:]
    was_neighbour = closest_indices_counts[most_common_closest_indices] > 0

    return most_common_closest_indices[was_neighbour]

def calculate_precision(all_embeddings, positive_labels, negative_labels,
                        drop_percentage, closest_similar):
    """
    Fit a random forest on a small labelled sample and score it on the rows
    that most often appear among the nearest neighbours of the positive
    training rows.

    Parameters:
    - all_embeddings (DataFrame): needs the columns 'embeddings',
      'aquaculture', 'geometry', 'x' and 'y'. It is modified in place: the
      index is reset and, when drop_percentage > 0, the 'embeddings' column
      is overwritten with the pruned vectors.
    - positive_labels (int): number of rows with aquaculture == 1 sampled
      for training (capped at the number available).
    - negative_labels (int): number of rows with aquaculture == 0 sampled
      for training (capped at the number available).
    - drop_percentage (float): fraction of the least important embedding
      dimensions to discard before refitting; 0 disables pruning. When
      pruning is active a feature-importance bar chart is shown.
    - closest_similar (int): neighbours per positive training row, forwarded
      to find_most_common_closest.

    Returns:
    - dict with 'precision', 'recall', 'accuracy' and 'f1' formatted as
      two-decimal strings, plus the integer counts 'num_positives',
      'num_negatives', 'num_test' and 'num_dimensions'.
    """
    all_embeddings.reset_index(drop=True, inplace=True)
    pos_samples = all_embeddings[all_embeddings['aquaculture'] == 1]
    neg_samples = all_embeddings[all_embeddings['aquaculture'] == 0]

    # Sample the training rows of each class; everything else is the test set
    pos_train_indices = pos_samples.sample(n=min(positive_labels, len(pos_samples))).index
    neg_train_indices = neg_samples.sample(n=min(negative_labels, len(neg_samples))).index
    pos_test_indices = pos_samples.drop(pos_train_indices).index
    neg_test_indices = neg_samples.drop(neg_train_indices).index

    train_indices = np.concatenate((pos_train_indices, neg_train_indices))
    test_indices = np.concatenate((pos_test_indices, neg_test_indices))

    rf = RandomForestClassifier()

    train_samples = all_embeddings.iloc[train_indices]
    X_train = np.array(train_samples['embeddings'].tolist())
    y_train = train_samples['aquaculture'].values
    rf.fit(X_train, y_train)

    if drop_percentage > 0:
        importances = rf.feature_importances_

        # Rank dimensions once, most important first, and keep the top share
        num_features_to_keep = int(len(importances) * (1 - drop_percentage))
        sorted_indices = np.argsort(importances)[::-1]
        sorted_importances = importances[sorted_indices]
        top_important_indices = sorted_indices[:num_features_to_keep]

        # Plot feature importances
        plt.figure(figsize=(20, 6))
        plt.title("Feature Importances")
        plt.bar(range(X_train.shape[1]), sorted_importances, align='center')

        # Draw cutoff line between kept (left) and dropped (right) dimensions
        plt.axvline(x=num_features_to_keep-0.5, color='r', linestyle='--', label='Cutoff for feature selection')
        plt.xticks(range(X_train.shape[1]), sorted_indices, rotation=90)
        plt.xlim([-1, X_train.shape[1]])
        plt.xlabel("Feature Index")
        plt.ylabel("Importance")
        plt.legend()
        plt.tight_layout()
        plt.show()

        # Update all_embeddings to keep only the top important features
        all_embeddings['embeddings'] = all_embeddings['embeddings'].apply(lambda x: x[top_important_indices])

        # Refit on the pruned embeddings
        train_samples = all_embeddings.iloc[train_indices]
        X_train = np.array(train_samples['embeddings'].tolist())
        y_train = train_samples['aquaculture'].values
        rf.fit(X_train, y_train)

    # Rows most often found among the neighbours of the positive training set.
    # NOTE(review): the positive training rows are not excluded here, so they
    # may be part of the evaluated set - confirm this is intended.
    most_common_closest_indices = find_most_common_closest(all_embeddings, pos_train_indices, closest_similar)

    # True and predicted labels for those rows
    closest_true_labels = all_embeddings.iloc[most_common_closest_indices]['aquaculture'].values
    closest_pred_labels = rf.predict(np.array(all_embeddings.iloc[most_common_closest_indices]['embeddings'].tolist()))

    # Metrics over the most common closest embeddings
    closest_precision = precision_score(closest_true_labels, closest_pred_labels)
    closest_recall = recall_score(closest_true_labels, closest_pred_labels)
    closest_accuracy = accuracy_score(closest_true_labels, closest_pred_labels)
    closest_f1 = f1_score(closest_true_labels, closest_pred_labels)

    # Create a folium map to visualize the locations
    m = folium.Map(location=[all_embeddings['y'].mean(),
                             all_embeddings['x'].mean()],
                   zoom_start=9, tiles="Esri.WorldImagery")

    # Training rows: green = positive, red = negative
    for idx in pos_train_indices:
        geometry = all_embeddings.iloc[idx]['geometry']
        folium.GeoJson(geometry, style_function=lambda x: {'color': 'green'}).add_to(m)

    for idx in neg_train_indices:
        geometry = all_embeddings.iloc[idx]['geometry']
        folium.GeoJson(geometry, style_function=lambda x: {'color': 'red'}).add_to(m)

    # Evaluated rows, coloured by ground truth: blue = aquaculture, orange = not
    for idx, true_label in zip(most_common_closest_indices, closest_true_labels):
        geometry = all_embeddings.iloc[idx]['geometry']
        color = 'blue' if true_label == 1 else 'orange'
        # Bind color as a default argument: the style function may be invoked
        # after the loop has moved on, and a plain closure would then see only
        # the last colour assigned.
        folium.GeoJson(geometry, style_function=lambda x, color=color: {'color': color}).add_to(m)

    # Display the map
    display(m)

    # Metrics are reported as two-decimal strings alongside the set sizes
    results = {
        'precision': format(closest_precision, '.2f'),
        'recall': format(closest_recall, '.2f'),
        'accuracy': format(closest_accuracy, '.2f'),
        'f1': format(closest_f1, '.2f'),
        'num_positives': len(pos_train_indices),
        'num_negatives': len(neg_train_indices),
        'num_test': len(test_indices),
        'num_dimensions': len(all_embeddings.iloc[0]['embeddings']),
    }
    return results

# Run configuration: labelled rows per class, fraction of embedding
# dimensions to prune, and neighbours taken per positive training row.
positive_labels, negative_labels = 5, 20
drop_percentage = 0.5
closest_similar = 10

# Work on a re-indexed copy so the loaded embeddings table stays untouched.
gdf = embeddings.copy()
gdf = gdf.reset_index(drop=True)

results = calculate_precision(
    gdf,
    positive_labels,
    negative_labels,
    drop_percentage,
    closest_similar,
)
print(results)

In [None]:
import random
import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity



def find_most_common_closest(all_embeddings, train_indices, closest_similar,
                             n_most_common=25):
    """
    Find the rows that most often appear among the nearest neighbours of the
    given training rows, using cosine similarity on the 'embeddings' column.

    For every training row, the number of its neighbours sharing the same
    'aquaculture' value is printed as a diagnostic.

    Parameters:
    - all_embeddings (DataFrame): table with 'embeddings' and 'aquaculture'
      columns.
    - train_indices: positional indices of the training rows.
    - closest_similar (int): number of nearest neighbours taken per training row.
    - n_most_common (int): maximum number of indices returned (default 25).

    Returns:
    - Array of positional indices ordered by ascending neighbour count (most
      common last). Rows that were never a neighbour are not returned, so the
      result can be shorter than n_most_common.
    """
    # Nothing to rank; also avoids the `[-0:]` slices below selecting everything
    if len(train_indices) == 0 or closest_similar <= 0 or n_most_common <= 0:
        return np.array([], dtype=int)

    train_embeddings = np.array(all_embeddings.iloc[train_indices]['embeddings'].tolist())
    # Cosine similarity between each training embedding and all embeddings
    cosine_similarities = cosine_similarity(train_embeddings, np.array(all_embeddings['embeddings'].tolist()))
    # argsort is ascending, so the last closest_similar columns are the nearest rows
    closest_indices_per_pos = np.argsort(cosine_similarities, axis=1)[:, -closest_similar:]

    # Diagnostic: how many neighbours share the training row's label
    for train_idx, closest_idxs in zip(train_indices, closest_indices_per_pos):
        train_aquaculture = all_embeddings.iloc[train_idx]['aquaculture']
        closest_aquacultures = all_embeddings.iloc[closest_idxs]['aquaculture'].values
        same_aquaculture_count = np.sum(closest_aquacultures == train_aquaculture)
        print(same_aquaculture_count)

    # Count how often each row shows up across all neighbour lists
    closest_indices_counts = np.bincount(closest_indices_per_pos.flatten())

    # Take the most frequent rows, then drop any that only made the cut
    # because fewer than n_most_common distinct neighbours exist.
    most_common_closest_indices = np.argsort(closest_indices_counts)[-n_most_common:]
    was_neighbour = closest_indices_counts[most_common_closest_indices] > 0

    return most_common_closest_indices[was_neighbour]

def calculate_precision(all_embeddings,
                        positive_labels, negative_labels, drop_percentage,
                        closest_similar=10,
                        do_map=False,
                        plot_features=True):
    """
    Score similarity-based retrieval of aquaculture rows.

    The rows most often found among the nearest neighbours of the positive
    training rows are collected, those that are also frequent neighbours of
    the negative training rows are removed, and every remaining row is
    treated as a predicted positive (the predicted labels are all ones). The
    random forest is fitted only to rank embedding dimensions for optional
    pruning.

    Parameters:
    - all_embeddings (DataFrame): needs the columns 'embeddings',
      'aquaculture', 'geometry', 'x' and 'y'. It is modified in place: the
      index is reset and, when drop_percentage > 0, the 'embeddings' column
      is overwritten with the pruned vectors.
    - positive_labels (int): number of rows with aquaculture == 1 sampled
      for training (capped at the number available).
    - negative_labels (int): number of rows with aquaculture == 0 sampled
      for training (capped at the number available).
    - drop_percentage (float): fraction of the least important embedding
      dimensions to discard; 0 disables pruning.
    - closest_similar (int): neighbours per training row, forwarded to
      find_most_common_closest (default 10).
    - do_map (bool): display a folium map of training and retrieved rows.
    - plot_features (bool): show the feature-importance chart when pruning.

    Returns:
    - dict with 'precision', 'recall', 'accuracy' and 'f1' formatted as
      two-decimal strings, plus the integer counts 'num_positives',
      'num_negatives', 'num_test' and 'num_dimensions'.
    """
    all_embeddings.reset_index(drop=True, inplace=True)
    pos_samples = all_embeddings[all_embeddings['aquaculture'] == 1]
    neg_samples = all_embeddings[all_embeddings['aquaculture'] == 0]

    # Sample the training rows of each class; everything else is the test set
    pos_train_indices = pos_samples.sample(n=min(positive_labels, len(pos_samples))).index
    neg_train_indices = neg_samples.sample(n=min(negative_labels, len(neg_samples))).index
    pos_test_indices = pos_samples.drop(pos_train_indices).index
    neg_test_indices = neg_samples.drop(neg_train_indices).index

    train_indices = np.concatenate((pos_train_indices, neg_train_indices))
    test_indices = np.concatenate((pos_test_indices, neg_test_indices))

    rf = RandomForestClassifier()

    train_samples = all_embeddings.iloc[train_indices]
    X_train = np.array(train_samples['embeddings'].tolist())
    y_train = train_samples['aquaculture'].values
    rf.fit(X_train, y_train)

    if drop_percentage > 0:
        importances = rf.feature_importances_
        num_features_to_keep = int(len(importances) * (1 - drop_percentage))
        sorted_indices = np.argsort(importances)[::-1]
        sorted_importances = importances[sorted_indices]
        # Select by position in the ranking: the kept dimensions are the
        # first entries of sorted_indices, not the importance values.
        top_important_indices = sorted_indices[:num_features_to_keep]

        if plot_features:
            # Plot feature importances on a log-scaled y axis
            plt.figure(figsize=(20, 6))
            plt.title("Feature Importances")
            plt.bar(range(len(sorted_importances)), sorted_importances, align='center')
            plt.yscale('log')

            # Set x-tick labels to the sorted feature indices
            plt.xticks(range(len(importances)), sorted_indices, rotation=90)
            plt.axvline(x=num_features_to_keep-0.5, color='r', linestyle='--', label='Cutoff for feature selection')

            # The log scale applies to the importance axis
            plt.xlabel("Feature Index")
            plt.ylabel("Importance [Log]")
            plt.legend()
            plt.tight_layout()
            plt.show()

        all_embeddings['embeddings'] = all_embeddings['embeddings'].apply(lambda x: x[top_important_indices])

        # Refit on the pruned embeddings
        train_samples = all_embeddings.iloc[train_indices]
        X_train = np.array(train_samples['embeddings'].tolist())
        y_train = train_samples['aquaculture'].values
        rf.fit(X_train, y_train)

    # Rows most often found near the positive training set ...
    most_common_closest_indices = find_most_common_closest(all_embeddings, pos_train_indices, closest_similar)
    # ... and near the negative training set
    most_common_closest_indices_neg = find_most_common_closest(all_embeddings, neg_train_indices, closest_similar)
    # Drop candidates that are also frequent neighbours of the negatives
    negative_neighbours = set(most_common_closest_indices_neg)
    most_common_closest_indices = [i for i in most_common_closest_indices if i not in negative_neighbours]

    # Ground truth for the retrieved rows; every one counts as predicted positive
    closest_true_labels = all_embeddings.iloc[most_common_closest_indices]['aquaculture'].values
    closest_pred_labels = np.ones(len(most_common_closest_indices))

    # Count how many of the retrieved rows are actually positive
    print("Number of positive samples in the most common closest indices")
    print(int((closest_true_labels == 1).sum()))

    # Metrics over the retrieved rows
    precision = precision_score(closest_true_labels, closest_pred_labels)
    recall = recall_score(closest_true_labels, closest_pred_labels)
    accuracy = accuracy_score(closest_true_labels, closest_pred_labels)
    f1 = f1_score(closest_true_labels, closest_pred_labels)

    if do_map:
        m = folium.Map(location=[all_embeddings['y'].mean(),
                                 all_embeddings['x'].mean()],
                       zoom_start=9, tiles="Esri.WorldImagery")

        # Training rows: blue = positive, red = negative
        for idx in pos_train_indices:
            geometry = all_embeddings.iloc[idx]['geometry']
            folium.GeoJson(geometry, style_function=lambda x: {'color': 'blue'}).add_to(m)

        for idx in neg_train_indices:
            geometry = all_embeddings.iloc[idx]['geometry']
            folium.GeoJson(geometry, style_function=lambda x: {'color': 'red'}).add_to(m)

        # Retrieved rows: yellow = true positive, orange = false positive
        for idx, true_label in zip(most_common_closest_indices, closest_true_labels):
            print(idx, true_label)
            geometry = all_embeddings.iloc[idx]['geometry']
            if true_label == 1:
                color = 'yellow'
            elif true_label == 0:
                color = 'orange'
            else:
                # Any other label value stops the marker loop
                break
            # Bind color as a default argument: the style function may be
            # invoked after the loop has moved on, and a plain closure would
            # then see only the last colour assigned.
            folium.GeoJson(geometry, style_function=lambda x, color=color: {'color': color}).add_to(m)

        display(m)

    # Metrics are reported as two-decimal strings alongside the set sizes
    results = {
        'precision': format(precision, '.2f'),
        'recall': format(recall, '.2f'),
        'accuracy': format(accuracy, '.2f'),
        'f1': format(f1, '.2f'),
        'num_positives': len(pos_train_indices),
        'num_negatives': len(neg_train_indices),
        'num_test': len(test_indices),
        'num_dimensions': len(all_embeddings.iloc[0]['embeddings']),
    }
    return results

# Run configuration: labelled rows per class, fraction of embedding
# dimensions to prune, and neighbours taken per training row.
positive_labels, negative_labels = 5, 3
drop_percentage = 0
closest_similar = 10

# Work on a re-indexed copy so the loaded embeddings table stays untouched.
gdf = embeddings.copy()
gdf = gdf.reset_index(drop=True)
calculate_precision(
    gdf,
    positive_labels,
    negative_labels,
    drop_percentage,
    closest_similar,
    do_map=True,
    plot_features=True,
)



In [None]:
def run_experiment(embeddings, drop_percentages, positive_counts):
    """
    Evaluate calculate_precision over a grid of settings.

    Parameters:
    - embeddings (DataFrame): source table; a copy is passed to every run.
    - drop_percentages: drop_percentage values to try (outer loop).
    - positive_counts: numbers of positive training rows to try (inner loop);
      each run uses twice as many negative rows.

    Returns:
    - DataFrame with one row per (drop_percentage, positive_count) pair,
      holding the metrics returned by calculate_precision plus both settings.
    """
    grid = [
        (drop_percentage, positive_count)
        for drop_percentage in drop_percentages
        for positive_count in positive_counts
    ]

    collected = []
    for drop_percentage, positive_count in grid:
        outcome = calculate_precision(
            embeddings.copy(), positive_count, 2 * positive_count, drop_percentage
        )
        # Tag the metrics with the settings that produced them
        outcome['drop_percentage'] = drop_percentage
        outcome['positive_count'] = positive_count
        collected.append(outcome)
        print(f"Drop Percentage: {drop_percentage}, Positive Count: {positive_count}")
        print(outcome)

    return pd.DataFrame(collected)

# Parameter grid: drop 0%..70% of the dimensions in 10% steps, and train on
# 1, 11, ..., 91 positive rows.
drop_percentages = [i/100 for i in range(0, 71, 10)]
positive_counts = list(range(1, 101, 10))

results_df = run_experiment(embeddings, drop_percentages, positive_counts)

# One scatter series per drop percentage: precision vs. number of positives
plt.figure(figsize=(10, 6))
for drop_percentage in drop_percentages:
    subset = results_df[results_df['drop_percentage'] == drop_percentage]
    # The 'precision' values may be formatted strings (e.g. '0.80'); convert
    # them so the y axis is numeric instead of a categorical axis ordered by
    # first appearance.
    precision_values = [float(value) for value in subset['precision']]
    plt.scatter(subset['positive_count'],
                precision_values,
                label=f"Drop Percentage: {drop_percentage}")
plt.xlabel("Number of Positive Cases")
plt.ylabel("Precision")
plt.legend(title="Drop Percentage")
plt.show()