In [1]:
import os
import sys
import pandas as pd
from sklearn.model_selection import StratifiedKFold

features_path = "/data/eihw-gpu6/gebhaale/BIRDS/ast-embeddings/features.csv"
features_df = pd.read_csv(features_path)
features_df = features_df.rename(columns={'file': 'filename'})
features_df['filename'] = features_df['filename'].apply(lambda x: x.lstrip('/'))
features_df['species'] = features_df['filename'].apply(lambda x: x.split('/')[0])
features_df

Unnamed: 0,filename,features,species
0,Accipiter_gentilis/308130.mp3,0000000000.npy,Accipiter_gentilis
1,Accipiter_gentilis/102849.mp3,0000000001.npy,Accipiter_gentilis
2,Accipiter_gentilis/308382.mp3,0000000002.npy,Accipiter_gentilis
3,Accipiter_gentilis/105802.mp3,0000000003.npy,Accipiter_gentilis
4,Accipiter_gentilis/308687.mp3,0000000004.npy,Accipiter_gentilis
...,...,...,...
104164,Vanellus_vanellus/77360.mp3,0000104164.npy,Vanellus_vanellus
104165,Vanellus_vanellus/707046.mp3,0000104165.npy,Vanellus_vanellus
104166,Vanellus_vanellus/91644.mp3,0000104166.npy,Vanellus_vanellus
104167,Vanellus_vanellus/707047.mp3,0000104167.npy,Vanellus_vanellus


In [52]:
X = features_df.drop(columns=['species']).values
y = features_df['species'].values

X, y

(array([['Accipiter_gentilis/308130.mp3', '0000000000.npy'],
        ['Accipiter_gentilis/102849.mp3', '0000000001.npy'],
        ['Accipiter_gentilis/308382.mp3', '0000000002.npy'],
        ...,
        ['Vanellus_vanellus/91644.mp3', '0000104166.npy'],
        ['Vanellus_vanellus/707047.mp3', '0000104167.npy'],
        ['Vanellus_vanellus/707048.mp3', '0000104168.npy']], dtype=object),
 array(['Accipiter_gentilis', 'Accipiter_gentilis', 'Accipiter_gentilis',
        ..., 'Vanellus_vanellus', 'Vanellus_vanellus', 'Vanellus_vanellus'],
       dtype=object))

In [53]:
num_folds = 5
seed = 42
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

destination_folder = f"/nas/staff/data_work/AG/BIRDS/{num_folds}-fold/"
os.makedirs(destination_folder, exist_ok=True)

In [54]:
for fold_idx, (train_val_indices, test_indices) in enumerate(skf.split(X, y)):
    print(f"Fold {fold_idx + 1}")
    fold_dir = os.path.join(destination_folder, str(fold_idx))
    os.makedirs(fold_dir, exist_ok=True)

    # Split the train_val_indices further into train and validation sets
    skf_train_val = StratifiedKFold(n_splits=4, shuffle=True, random_state=seed)
    train_indices, val_indices = next(skf_train_val.split(X[train_val_indices], y[train_val_indices]))

    # Get the data and labels for each set in this fold
    X_train, y_train = X[train_indices], y[train_indices]
    X_val, y_val = X[val_indices], y[val_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    train_df = features_df.iloc[train_indices]
    dev_df = features_df.iloc[val_indices]
    test_df = features_df.iloc[test_indices]

    print("Len Train: ", len(train_df)/len(features_df))
    print("Len Val: ", len(dev_df)/len(features_df))
    print("Len Test: ", len(test_df)/len(features_df))
    train_df.to_csv(os.path.join(fold_dir, 'train.csv'), index=None)
    dev_df.to_csv(os.path.join(fold_dir, 'devel.csv'), index=None)
    test_df.to_csv(os.path.join(fold_dir, 'test.csv'), index=None)


Fold 1
Len Train:  0.5999961600860141
Len Val:  0.20000191995699296
Len Test:  0.20000191995699296
Fold 2
Len Train:  0.5999961600860141
Len Val:  0.20000191995699296
Len Test:  0.20000191995699296
Fold 3
Len Train:  0.5999961600860141
Len Val:  0.20000191995699296
Len Test:  0.20000191995699296
Fold 4
Len Train:  0.5999961600860141
Len Val:  0.20000191995699296
Len Test:  0.20000191995699296
Fold 5
Len Train:  0.6000057598709789
Len Val:  0.20000191995699296
Len Test:  0.19999232017202814


In [55]:
print(train_df['species'].value_counts())

Parus_major               3529
Phylloscopus_collybita    2627
Fringilla_coelebs         2565
Loxia_curvirostra         2544
Erithacus_rubecula        2105
                          ... 
Milvus_migrans             165
Columba_oenas              134
Ciconia_ciconia             80
Pernis_apivorus             76
Milvus_milvus               73
Name: species, Length: 81, dtype: int64


In [56]:
print(dev_df['species'].value_counts())

Parus_major               1192
Fringilla_coelebs          870
Phylloscopus_collybita     864
Loxia_curvirostra          837
Erithacus_rubecula         703
                          ... 
Milvus_migrans              46
Columba_oenas               39
Ciconia_ciconia             26
Milvus_milvus               25
Pernis_apivorus             21
Name: species, Length: 81, dtype: int64


In [57]:
print(test_df['species'].value_counts())

Parus_major               944
Phylloscopus_collybita    699
Fringilla_coelebs         687
Loxia_curvirostra         676
Turdus_merula             604
                         ... 
Milvus_migrans             43
Columba_oenas              34
Ciconia_ciconia            22
Pernis_apivorus            19
Milvus_milvus              19
Name: species, Length: 95, dtype: int64


### Partition the species in an exclusive way for train, dev, test for 0-shot

In [68]:
destination_path = "/nas/staff/data_work/AG/BIRDS/zsl-5fold/"
# os.makedirs(destination_path, exist_ok=True)

X = features_df.drop(columns=["species"]).values
y = features_df["species"].values
groups = features_df["species"].values

num_folds = 10
gkf = GroupKFold(n_splits=num_folds)
gkf_train_val = GroupKFold(n_splits=8)

In [70]:
for i, (train_val_indices, test_indices) in enumerate(gkf.split(X, y, groups)):
    if i > 4:
        break
    print(f"\nFold {i}:")
    store_path = os.path.join(destination_path, str(i))
    os.makedirs(store_path, exist_ok=True)

    # groups_to_drop = features_df.iloc[test_indices]['species'].unique()
    # train_indices, val_indices = next(gkf_train_val.split(X[train_val_indices], y[train_val_indices], groups[train_val_indices]))
    train_val_df = features_df.iloc[train_val_indices]
    # dev = features_df.iloc[val_indices]
    test_df = features_df.iloc[test_indices]
    groups_train_val = features_df["species"].iloc[train_val_indices]
    
    train_indices, val_indices = next(gkf_train_val.split(train_val_df.values, train_val_df['species'].values, train_val_df['species'].values))
    train_df = train_val_df.iloc[train_indices]
    val_df = train_val_df.iloc[val_indices]

    print("Labels train: ", len(train_df['species'].unique()))
    print("Labels val: ", len(val_df['species'].unique()))
    print("Labels test: ", len(test_df['species'].unique()))
    train_df.to_csv(os.path.join(store_path, 'train.csv'), index=None)
    val_df.to_csv(os.path.join(store_path, 'devel.csv'), index=None)
    test_df.to_csv(os.path.join(store_path, 'test.csv'), index=None)



Fold 0:
Labels train:  76
Labels val:  11
Labels test:  8

Fold 1:
Labels train:  76
Labels val:  10
Labels test:  9

Fold 2:
Labels train:  76
Labels val:  10
Labels test:  9

Fold 3:
Labels train:  76
Labels val:  10
Labels test:  9

Fold 4:
Labels train:  76
Labels val:  9
Labels test:  10
