In [1]:
import json
import pandas as pd

In [2]:
with open('inaturalist_train_label.json', 'r') as file:
    train_labels = json.load(file)

In [3]:
with open('inaturalist_test_label.json', 'r') as file:
    test_labels = json.load(file)

In [4]:
with open('inaturalist_val_label.json', 'r') as file:
    val_labels = json.load(file)

In [5]:
train_labels_dataframe = pd.DataFrame(train_labels['data'])
test_labels_dataframe = pd.DataFrame(test_labels['data'])
val_labels_dataframe = pd.DataFrame(val_labels['data'])

In [6]:
combined_dataframe = pd.concat([train_labels_dataframe, test_labels_dataframe, val_labels_dataframe])

In [7]:
combined_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 167527 entries, 0 to 33128
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   wav      167527 non-null  object
 1   labels   167527 non-null  int64 
 2   caption  167527 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.1+ MB


In [8]:
combined_dataframe['labels'].value_counts()

labels
986    1200
912    1200
960    1200
983    1200
554    1200
       ... 
96        1
396       1
190       1
58        1
69        1
Name: count, Length: 1244, dtype: int64

In [9]:
combined_dataframe = combined_dataframe.groupby('labels').filter(lambda g: len(g) >= 50)

In [10]:
combined_dataframe['labels'].value_counts()

labels
1042    1200
498     1200
849     1200
986     1200
824     1200
        ... 
522       50
109       50
838       50
953       50
623       50
Name: count, Length: 544, dtype: int64

In [11]:
print(len(combined_dataframe))

159165


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
def stratified_split(df, label_col='labels'):
    # Group by label and pick 10 samples per class for the test set
    temp_df = df.groupby(label_col, group_keys=False).apply(lambda x: x.sample(n=10, random_state=42))

    # Remove test samples from the full dataset to form the training set
    train_df = df.drop(index=temp_df.index)

    '''train_df, temp_df = train_test_split(
        df,
        test_size=0.2,
        stratify=df[label_col],
        random_state=42
    )'''

    # Step 2: Split the 20% temp into 10% val and 10% test (i.e., 50% of temp each)
    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.5,
        stratify=temp_df[label_col],
        random_state=42
    )

    return train_df, val_df, test_df

# Usage
train_df, val_df, test_df = stratified_split(combined_dataframe, label_col='labels')


  temp_df = df.groupby(label_col, group_keys=False).apply(lambda x: x.sample(n=10, random_state=42))


In [14]:
final_train_data = {}
final_val_data = {}
final_test_data = {}
final_train_data['data'] = train_df.to_dict('records')
final_val_data['data'] = val_df.to_dict('records')
final_test_data['data'] = test_df.to_dict('records')

In [15]:
print("train size:", len(train_df))
print("val size:", len(val_df))
print("test size:", len(test_df))

train size: 146988
val size: 2720
test size: 2720


In [16]:
with open("inaturalist_val_label_v1-3.json", "w") as f:
    json.dump(final_val_data, f)

In [17]:
with open("inaturalist_train_label_v1-3.json", "w") as f:
    json.dump(final_train_data, f)

In [18]:
with open("inaturalist_test_label_v1-3.json", "w") as f:
    json.dump(final_test_data, f)