In [2]:
import librosa
import numpy as np
import pandas as pd

In [3]:
# Read the metadata file
df = pd.read_csv("./Data2/metadata.csv") # You may need to adjust the path to find the Data folder
path = './Data2/'

# Remove entries with duration 0 seconds, these files are likely to be too short
indexZero = df[ df['seconds'] == 0 ].index
df.drop(indexZero , inplace=True)

# Group the df by class and seconds. 
g = df.groupby('class')['seconds'].apply(lambda x: np.unique(x))
classes = g.index
uni_durations = g.values

# Split the df into sub dataframes based on class and duration
sub_dfs = []
for cls, dur in zip(classes, uni_durations):
        for d in dur:
            sub_df = df.loc[(df["class"] == cls) & (df["seconds"] == d)]
            sub_dfs.append(sub_df)

# Extract audio features for each file in each sub_df
grped_dfs = []
for sub_df in sub_dfs:
        grp_df = sub_df
        audios, srs = [], []
        for fn, cls in zip(grp_df["filename"], grp_df['class']):
                path = path  # You may need to adjust the path to find the Data folder
                fp = f"{path}{cls}/{fn}"

                try:
                        audio, sr = librosa.load(fp, res_type="kaiser_best")
                        audios.append(audio)
                        srs.append(sr)
                except(ValueError):
                        audios.append(np.nan)
                        srs.append(np.nan)

        grp_df['audio'] = audios
        grp_df.drop('Unnamed: 0', axis=1, inplace=True)
        grped_dfs.append(grp_df)


find_dup_dfs = []
for grped_df in grped_dfs:

    find_dup_df = grped_df
    # Reset index because we will want to be able to reference the original index 
    find_dup_df = find_dup_df.reset_index()
    find_dup_df.rename(columns = {'index':'original_index'}, inplace = True)

    # Algorithm to find duplicates
    a = find_dup_df['audio']
    n =  len(a)
    l, r = 0, 1

    while(r < n):
        current_row = a.iloc[l]
        subseq_rows = a.iloc[r:]
        dup= []
        for i in subseq_rows:
            try:
                if np.sum(current_row - i) == 0:
                    dup.append("Y")
                else:
                    dup.append("N")
            except(ValueError):
                dup.append("N")
        
        n_nas = n - len(dup)
        l_nas = [np.nan] * n_nas
        dup = l_nas+dup
        find_dup_df[f"dup_{l}"] = dup
        l += 1
        r += 1

    #Create a copy and reset index to reference duplicates later
    find_dup_copy = find_dup_df.reset_index()
    find_dup_copy.rename(columns = {'index':'dup_index'}, inplace = True)

    #Filter each df to show only "Y" duplicates
    filtered_dup_df = find_dup_df[find_dup_df.isin(['Y']).any(axis=1)]
    filtered_dup_df = filtered_dup_df.reset_index()
    filtered_dup_df.rename(columns = {'index':'dup_index'}, inplace = True)
    
    dup_cols = []
    for i in range(l): # Reuse pointer from before
        dup_col_name = f"dup_{i}"
        dup_cols.append(dup_col_name)

    dups = []
    for dup_col in dup_cols:
        s = filtered_dup_df.loc[(filtered_dup_df[dup_col] == "Y")]
        s = s.loc[:, ['dup_index', 'filename', 'seconds', 'class', dup_col]]
        if len(s) > 0:
            dup_pair = (s['dup_index'].values[0], s[dup_col].name)
            dups.append(dup_pair)
            # print(s['dup_index'].values[0], s[dup_col].name)

    find_dup_copy = find_dup_copy.loc[:, ['dup_index', 'original_index', 'filename', 'seconds', 'class']]
    filtered_dup_df = filtered_dup_df.loc[:, ['dup_index', 'original_index', 'filename', 'seconds', 'class']]
    
    if len(filtered_dup_df) > 0:
        ps = []
        for d in dups:  

            di = d[0]
            di_2 = d[1].partition('_')[2]

            p = filtered_dup_df.loc[(filtered_dup_df['dup_index'] == di)]
            p2 = find_dup_copy.loc[(find_dup_copy['dup_index'] == int(di_2))]
            p['dup_filename'] = p2.filename.values[0]
            ps.append(p.values)
        
        new_rows = []
        for p in ps:
            new_row = list(p[0])
            new_rows.append(new_row)

        new_df = pd.DataFrame(new_rows, columns=['dup_index', 'original_index', 'filename', 'seconds', 'class', 'duplicate_filename'])
        find_dup_dfs.append(new_df)

final_dups = pd.concat(find_dup_dfs) 
final_dups = final_dups.reset_index()
final_dups.drop("index", axis=1, inplace=True)     
final_dups.to_csv(f'{path}duplicates.csv')              
            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grp_df['audio'] = audios
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grp_df.drop('Unnamed: 0', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grp_df['audio'] = audios
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/ind

In [4]:
pd.read_csv('./Data2/duplicates.csv')

Unnamed: 0.1,Unnamed: 0,dup_index,original_index,filename,seconds,class,duplicate_filename
0,0,1,170,X03498.mp3,114.0,Caligavis chrysops Yellow-faced honeyeater,X03492.mp3
1,1,1,278,X03688.mp3,73.0,Colluricincla harmonica Grey shrikethrush,X01256.mp3
2,2,1,227,X01238.mp3,75.0,Colluricincla harmonica Grey shrikethrush,X01233.mp3
3,3,1,249,X01260.mp3,78.0,Colluricincla harmonica Grey shrikethrush,X01231.mp3
4,4,1,234,X01245.mp3,93.0,Colluricincla harmonica Grey shrikethrush,X01234.mp3
5,5,1,284,X03694.mp3,127.0,Colluricincla harmonica Grey shrikethrush,X01240.mp3
6,6,1,266,X02667.mp3,185.0,Colluricincla harmonica Grey shrikethrush,X02139.mp3
7,7,1,255,X02302.mp3,292.0,Colluricincla harmonica Grey shrikethrush,X01236.mp3
8,8,1,305,X01453.mp3,53.0,Corvus coronoides Australian raven,X01445.mp3
9,9,1,338,X03894.mp3,178.0,Corvus coronoides Australian raven,X01446.mp3
