In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv("train_index.csv")
df_test = pd.read_csv("test_index.csv")
df = pd.concat([df_train, df_test], ignore_index=True)
df = df[['filename', 'speaker', 'gender']]
df['id'] = df.filename.apply(lambda x: x.split('-')[-5])
df_filtered = df[df.groupby("speaker")["id"].transform("nunique") == 2]

In [None]:
from sklearn.model_selection import train_test_split

def preprocess_data(n_speakers=110, random_state=83):
    """
    Choose speakers and prepare train-val split
    
    Args:
        df: DataFrame to preprocess.
        
    Returns:
        Preprocessed DataFrame.
    """
    df_train = pd.read_csv("train_index.csv")
    df_test = pd.read_csv("test_index.csv")
    df = pd.concat([df_train, df_test], ignore_index=True)
    df = df[['filename', 'speaker', 'gender']]
    # df['filename'] = df['filename'].str.replace('VOiCES_devkit', 'spectrograms_dataset', regex=False)
    # df['filename'] = df['filename'].str.replace('.wav', '.pt', regex=False)
    spk_df = df[['speaker', 'gender']].drop_duplicates()
    # przydziel etykiety
    allowed_idx = (
        spk_df.groupby('gender', group_keys=False)
                .apply(lambda x: x.sample(frac=n_speakers/spk_df.shape[0], random_state=random_state), include_groups=False)
                ['speaker']
    )
    df  = df[df['speaker'].isin(allowed_idx)]
    print(f"Number of speakers in the dataset: {df['speaker'].nunique()}")
    print(f"Number of segments in the dataset: {df.shape[0]}")
    spk_df = df[['speaker', 'gender']].drop_duplicates()
    allowed_idx = (
        spk_df.groupby('gender', group_keys=False)
                .apply(lambda x: x.sample(frac=0.10, random_state=random_state), include_groups=False)
                ['speaker']
    )
    allowed_df  = df[df['speaker'].isin(allowed_idx)]
    non_allowed_df = df[~df['speaker'].isin(allowed_idx)]
    allowed_df['label'] = 1
    non_allowed_df['label'] = 0
    # df = pd.concat([allowed_df, non_allowed_df], ignore_index=True)
    # train-test split
    allowed_df_train, allowed_df_test = train_test_split(
        allowed_df,                      # Twój pełny DataFrame
        test_size=0.2,           # 20 % w test / 80 % w train (zmień wedle potrzeb)
        random_state=42,         # powtarzalność losowania
        stratify=allowed_df['gender']  # << klucz stratyfikacji
    )
    allowed_df_val, allowed_df_test = train_test_split(
        allowed_df_test,                      # Twój pełny DataFrame
        test_size=0.5,           # 20 % w test / 80 % w train (zmień wedle potrzeb)
        random_state=42,         # powtarzalność losowania
        stratify=allowed_df_test['gender']  # << klucz stratyfikacji
    )
    non_allowed_df_train, non_allowed_df_test = train_test_split(
        non_allowed_df,                      # Twój pełny DataFrame
        test_size=0.2,           # 20 % w test / 80 % w train (zmień wedle potrzeb)
        random_state=42,         # powtarzalność losowania
        stratify=non_allowed_df['gender']  # << klucz stratyfikacji
    )
    non_allowed_df_val, non_allowed_df_test = train_test_split(
        non_allowed_df_test,                      # Twój pełny DataFrame
        test_size=0.5,           # 20 % w test / 80 % w train (zmień wedle potrzeb)
        random_state=42,         # powtarzalność losowania
        stratify=non_allowed_df_test['gender']  # << klucz stratyfikacji
    )
    train_df = pd.concat([allowed_df_train, non_allowed_df_train], ignore_index=True)
    val_df = pd.concat([allowed_df_val, non_allowed_df_val], ignore_index=True)
    test_df = pd.concat([allowed_df_test, non_allowed_df_test], ignore_index=True)
    # sortowanie
    train_df.to_csv("data/train_df1.csv", index=False)
    val_df.to_csv("data/val_df1.csv", index=False)
    test_df.to_csv("data/test_df1.csv", index=False)

In [3]:
def preprocess_data_non_leakage(n_speakers=66, random_state=83):
    """
    Choose speakers and prepare train-val split
    
    Args:
        df: DataFrame to preprocess.
        
    Returns:
        Preprocessed DataFrame.
    """
    df_train = pd.read_csv("train_index.csv")
    df_test = pd.read_csv("test_index.csv")
    df = pd.concat([df_train, df_test], ignore_index=True)
    df = df[['filename', 'speaker', 'gender']]
    df['id'] = df.filename.apply(lambda x: x.split('-')[-5])
    df = df[df.groupby("speaker")["id"].transform("nunique") == 2]

    spk_df = df[['speaker', 'gender']].drop_duplicates()
    # przydziel etykiety
    allowed_idx = (
        spk_df.groupby('gender', group_keys=False)
                .apply(lambda x: x.sample(frac=n_speakers/spk_df.shape[0], random_state=random_state), include_groups=False)
                ['speaker']
    )
    df  = df[df['speaker'].isin(allowed_idx)]
    print(f"Number of speakers in the dataset: {df['speaker'].nunique()}")
    print(f"Number of segments in the dataset: {df.shape[0]}")
    spk_df = df[['speaker', 'gender']].drop_duplicates()
    allowed_idx = (
        spk_df.groupby('gender', group_keys=False)
                .apply(lambda x: x.sample(frac=0.10, random_state=random_state), include_groups=False)
                ['speaker']
    )
    allowed_df  = df[df['speaker'].isin(allowed_idx)]
    non_allowed_df = df[~df['speaker'].isin(allowed_idx)]
    allowed_df['label'] = 1
    non_allowed_df['label'] = 0
    # df = pd.concat([allowed_df, non_allowed_df], ignore_index=True)
    # train-test split po dwóch rodzajach nagrania dla każdego speakera
    # allowed speakers
    allowed_df["grp"] = (
        allowed_df.groupby("speaker")["id"]          # grupujemy po speaker
        .transform(lambda x: pd.factorize(x)[0])  # 0 dla 1. id, 1 dla 2. id
    )
    allowed_df_train = allowed_df[allowed_df["grp"] == 0].drop(columns="grp").copy()
    allowed_df_test = allowed_df[allowed_df["grp"] == 1].drop(columns="grp").copy()
    # non-allowed speakers
    non_allowed_df["grp"] = (
        non_allowed_df.groupby("speaker")["id"]          # grupujemy po speaker
        .transform(lambda x: pd.factorize(x)[0])  # 0 dla 1. id, 1 dla 2. id
    )
    non_allowed_df_train = non_allowed_df[non_allowed_df["grp"] == 0].drop(columns="grp").copy()
    non_allowed_df_test = non_allowed_df[non_allowed_df["grp"] == 1].drop(columns="grp").copy()

    #val-test split for allowed speakers
    allowed_df_test_shuffled = (
        allowed_df_test
        .groupby("speaker", group_keys=False)
        .apply(lambda g: g.sample(frac=1, random_state=83))
    )
    allowed_df_test_shuffled["part"] = (
        allowed_df_test_shuffled.groupby("speaker").cumcount() % 2
    )
    allowed_df_val = allowed_df_test_shuffled[allowed_df_test_shuffled["part"] == 0].drop(columns="part").copy()
    allowed_df_test = allowed_df_test_shuffled[allowed_df_test_shuffled["part"] == 1].drop(columns="part").copy()

    #val-test split for non-allowed speakers
    non_allowed_df_test_shuffled = (
        non_allowed_df_test
        .groupby("speaker", group_keys=False)
        .apply(lambda g: g.sample(frac=1, random_state=83))
    )
    non_allowed_df_test_shuffled["part"] = (
        non_allowed_df_test_shuffled.groupby("speaker").cumcount() % 2
    )
    non_allowed_df_val = non_allowed_df_test_shuffled[non_allowed_df_test_shuffled["part"] == 0].drop(columns="part").copy()
    non_allowed_df_test = non_allowed_df_test_shuffled[non_allowed_df_test_shuffled["part"] == 1].drop(columns="part").copy()


    train_df = pd.concat([allowed_df_train, non_allowed_df_train], ignore_index=True)
    val_df = pd.concat([allowed_df_val, non_allowed_df_val], ignore_index=True)
    test_df = pd.concat([allowed_df_test, non_allowed_df_test], ignore_index=True)
    # sortowanie
    train_df.to_csv("data/train_df.csv", index=False)
    val_df.to_csv("data/val_df.csv", index=False)
    test_df.to_csv("data/test_df.csv", index=False)
    print(f"Number of speakers in the train set: {train_df['speaker'].nunique()}")
    print(f"Number of speakers in the val set: {val_df['speaker'].nunique()}")
    print(f"Number of speakers in the test set: {test_df['speaker'].nunique()}")

In [4]:
preprocess_data_non_leakage()

Number of speakers in the dataset: 66
Number of segments in the dataset: 4224
Number of speakers in the train set: 66
Number of speakers in the val set: 66
Number of speakers in the test set: 66


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  allowed_df['label'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_allowed_df['label'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  allowed_df["grp"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the

In [5]:
import pandas as pd

def get_spectrogram_df(phase):
    df = pd.read_csv(f'data/{phase}_df.csv')
    df['filename'] = df['filename'].str.replace('speech', 'spectrograms_dataset', regex=False)
    df['filename'] = df['filename'].str.replace('.wav', '.pt', regex=False)
    df.to_csv(f'data/{phase}_spectogram_df.csv', index=False)

In [6]:
get_spectrogram_df('train')
get_spectrogram_df('val')
get_spectrogram_df('test')

In [None]:
df_train = pd.read_csv(f'data/train_spectogram_df.csv')
df_val = pd.read_csv(f'data/val_spectogram_df.csv')
df_test = pd.read_csv(f'data/test_spectogram_df.csv')

In [None]:
df_train.speaker.nunique(), df_val.speaker.nunique(), df_test.speaker.nunique()

In [None]:
len(set(df_train.speaker.unique()) & set(df_test.speaker.unique()))

In [None]:
# znajdz różnicę symetryczną dwóch list
set(df_train.speaker.unique()) ^ set(df_test.speaker.unique())

In [None]:
# znajdz część wspólna dwóch list
def find_common_elements(list1, list2):
    """
    Find common elements between two lists.
    
    Args:
        list1: First list of elements.
        list2: Second list of elements.
        
    Returns:
        List of common elements.
    """
    return list(set(list1) & set(list2))

In [None]:
df_train = pd.read_csv("data/train_df.csv")
df_train.groupby("speaker")["id"].nunique().values

In [None]:
df_train = pd.read_csv("data/train_df.csv")
df_val = pd.read_csv("data/val_df.csv")
df_test = pd.read_csv("data/test_df.csv")
df_test.speaker.unique().shape

In [None]:
df_train

In [None]:
df_val

In [None]:
df_test

In [None]:
df_train = pd.read_csv("data/train_df1.csv")
df_val = pd.read_csv("data/val_df1.csv")
df_test = pd.read_csv("data/test_df1.csv")
# df = pd.concat([df_train, df_test], ignore_index=True)
# df = df[['filename', 'speaker', 'gender']]
df['filename'] = df['filename'].str.replace('VOiCES_devkit', 'spectrograms_dataset', regex=False)
df['filename'] = df['filename'].str.replace('.wav', '.pt', regex=False)

In [None]:
df_train.label.value_counts()

In [None]:
df_test.drop_duplicates(subset=['filename']).shape

In [None]:
df.groupby("speaker")["id"].nunique().values

In [None]:
df[df.speaker == 8152]

In [None]:
[f for f in df.filename if os.path.exists(f) ]

In [None]:
df['id'] = df['filename'].apply(lambda x: x[-6:-3])

In [None]:
df[df.id == '020']

In [None]:
df['id'] = df.filename.apply(lambda x: x.split('-')[-5])

In [None]:
# mając dataframe df chce sprawdzić czy każdy speaker jest przypisany do dokładnie jednego id
df

In [None]:
df.id.value_counts()

In [None]:
speaker_id_counts = df.groupby("speaker")["id"].nunique()
speaker_id_counts

In [None]:
speaker_id_counts.values

In [None]:
df["grp"] = (
    df.groupby("speaker")["id"]          # grupujemy po speaker
      .transform(lambda x: pd.factorize(x)[0])  # 0 dla 1. id, 1 dla 2. id
)


In [None]:
df1 = df[df["grp"] == 0].drop(columns="grp").copy()
df2 = df[df["grp"] == 1].drop(columns="grp").copy()


In [None]:
df2.groupby("speaker")["id"].nunique().values