In [1]:
#This notebook takes an existing training manifest and removes randomly chosen rows so that the two classes are balanced 50/50
import pandas as pd
import boto3
import random

# Name of the S3 bucket that holds the manifests.
data_bucket_name = "summer2024-sagemaker-data-bucket"

# Object keys (inside the bucket) of the training and validation manifests.
training_list_path = "train/train_lst.lst"
validation_list_path = "validation/validation_lst.lst"

In [2]:
KEYS = "ajarriet_accessKeys.csv" # change to your path
# KEYS = '/Users/sophiapchung/Desktop/Bioacoustics/spchung_accessKeys.csv'

# Seed for the down-sampling below, so that Restart & Run All drops the same rows every time.
RANDOM_SEED = 42

# NOTE(review): credentials are read from a local CSV file and passed explicitly.
# Never commit that file; consider relying on boto3's default credential chain instead.
keyInfo = pd.read_csv(KEYS)

# Create a boto3 resource with your credentials
s3 = boto3.resource(
    's3',
    region_name='us-west-2',
    aws_access_key_id=keyInfo["Access key ID"][0],
    aws_secret_access_key=keyInfo["Secret access key"][0]
)
data_bucket = s3.Bucket(data_bucket_name)

#This downloads the original training manifest
data_bucket.download_file(training_list_path, "training_lst.lst")

# The manifest is tab-separated with no header row: index, label, image path.
trainDF = pd.read_csv("training_lst.lst", sep = "\t", names = ["index", "Classification", "File"])

# The counts below use the column sum as the number of "yes" rows, which is only
# valid when every label is exactly 0 ("no") or 1 ("yes"). Fail loudly otherwise
# instead of producing a silently mis-balanced manifest.
if not trainDF["Classification"].isin([0, 1]).all():
    raise ValueError("Classification column must contain only 0/1 labels")

TotalYes = int(trainDF["Classification"].sum())
TotalNo = len(trainDF.index)-TotalYes

print(trainDF)

print(TotalNo)
print(TotalYes)
print(len(trainDF.index))
print(TotalNo - TotalYes)

#This samples random rows from the dataframe that belong to the larger class
#It samples the difference in the amount of no and yes responses
#The rows it samples are dropped from the manifest (note all the images files are still in the s3 bucket)
#Either class may be the larger one; if they are already balanced nothing is dropped
majorityLabel = 0 if TotalNo >= TotalYes else 1
surplus = abs(TotalNo - TotalYes)

# A dedicated, seeded generator keeps the selection reproducible without
# touching the global `random` state used elsewhere.
rng = random.Random(RANDOM_SEED)
toDrop = rng.sample(trainDF[trainDF["Classification"] == majorityLabel].index.tolist(), surplus)
stratifiedDF = trainDF.drop(toDrop)
#stratifiedDF["File"] = stratifiedDF["File"].apply(lambda x: x[6:])
print(stratifiedDF)

       index  Classification  \
0          1               0   
1          2               1   
2          4               1   
3          5               0   
4          6               1   
...      ...             ...   
14438  17983               1   
14439  17984               1   
14440  17985               1   
14441  17987               0   
14442  17988               0   

                                                    File  
0           images/6805.230201090825_processed.wav_1.png  
1           images/6805.230201090825_processed.wav_2.png  
2           images/6805.230201090825_processed.wav_4.png  
3           images/6805.230201090825_processed.wav_5.png  
4           images/6805.230201090825_processed.wav_6.png  
...                                                  ...  
14438  images/671658014.181007153417_processed.wav_17...  
14439  images/671658014.181007153417_processed.wav_17...  
14440  images/671658014.181007153417_processed.wav_17...  
14441  images/671658014.1

In [3]:
# Save the balanced manifest as a tab-separated file without a header row or
# the pandas row index, i.e. the same three-column layout it was read with.
stratifiedDF.to_csv(
    "stratified_train.csv",
    sep="\t",
    index=False,
    header=False,
)

In [6]:
# WARNING: this uploads to the same key the original training manifest was
# downloaded from, so the unbalanced original in the bucket is overwritten.
# Keep a copy elsewhere first if it may be needed again.
data_bucket.upload_file("stratified_train.csv", training_list_path)

In [5]:
# Number of rows left in the balanced manifest.
stratifiedDF.shape[0]

10204

In [8]:
# Fraction of "yes" (label 1) rows after balancing.
stratifiedDF.loc[:, "Classification"].mean()

0.5

In [9]:
# Fraction of "yes" (label 1) rows in the manifest as it was downloaded.
trainDF.loc[:, "Classification"].mean()

0.35325070968635325

In [10]:
data_bucket.download_file(validation_list_path, "validation_lst.lst")

# Number of leading characters removed from every "File" entry.
# NOTE(review): 33 == len("LongTermPreprocessedImageStorage/"), so this presumably
# strips that storage prefix from each path - confirm against the raw manifest.
FILE_PREFIX_LENGTH = 33

# The validation manifest gets its own variable so the training frame `trainDF`
# is not overwritten; re-running the training cells afterwards stays correct.
valDF = pd.read_csv("validation_lst.lst", sep = "\t", names = ["index", "Classification", "File"])
valDF["File"] = valDF["File"].str[FILE_PREFIX_LENGTH:]

print(valDF)
# Despite the local file name, no rows are dropped here: only the paths are rewritten.
valDF.to_csv("stratified_val.csv", sep = "\t", header = False, index = False)
data_bucket.upload_file("stratified_val.csv", "LongTermPreprocessedImageStorage/validation/validation_lst.lst")

      index  Classification                                              File
0         6               1      images/6805.230201090825_processed.wav_6.png
1         7               0      images/6805.230201090825_processed.wav_7.png
2        10               0     images/6805.230201090825_processed.wav_10.png
3        11               0     images/6805.230201090825_processed.wav_11.png
4        13               0     images/6805.230201090825_processed.wav_13.png
...     ...             ...                                               ...
2215  10931               1  images/6805.230207120827_processed.wav_10931.png
2216  10937               0  images/6805.230207120827_processed.wav_10937.png
2217  10942               1  images/6805.230207120827_processed.wav_10942.png
2218  10945               0  images/6805.230207120827_processed.wav_10945.png
2219  10956               0  images/6805.230207120827_processed.wav_10956.png

[2220 rows x 3 columns]
