In [104]:
import pandas as pd
import os

In [105]:
# Root directory of the diverse_voices data. Every path in this notebook is
# built as `storage_path + '<relative path>'`, so the value must end with '/'.
# Set the DIVERSE_VOICES_DIR environment variable to run on another machine;
# when it is unset or empty the original cluster location is used.
storage_path = (
    os.environ.get('DIVERSE_VOICES_DIR') or '/work/bbc6523/diverse_voices/'
).rstrip('/') + '/'

In [106]:
# Read the English-accents table from the data directory under storage_path.
df = pd.read_csv(f'{storage_path}data/english_accents.csv')

In [107]:
# Restrict the table to the listed columns, in this order.
df = df.loc[:, [
    'age',
    'age_onset',
    'birthplace',
    'filename',
    'native_language',
    'sex',
    'speakerid',
    'country',
    'file_missing?',
    'file_found',
    'city',
    'city_nums',
    'state',
]]

In [108]:
import pandas as pd
import plotly.express as px

# Count rows for every (native_language, sex) combination.
grouped = (
    df.groupby(['native_language', 'sex'])
    .size()
    .rename('count')
    .reset_index()
)

# Grouped bar chart: one cluster per accent, one bar per sex.
fig = px.bar(
    grouped, x='native_language', y='count', color='sex',
    barmode='group', title='Number of Samples by Accent and Gender',
)
fig.show()

In [109]:
# Greedy female/male pairing within each accent.
# Pairs are formed in rounds of increasing absolute age gap (0, 1, ..., 5):
# in each round every still-unpaired female takes the first still-unpaired
# male whose age differs by exactly that gap. For the "english" accent the
# male must also come from the same country. Each speaker is used at most once.
matched_rows = []

for accent, speakers in df.groupby('native_language'):
    men = speakers[speakers['sex'] == 'male'].copy()
    women = speakers[speakers['sex'] == 'female'].copy()

    taken_men = set()
    taken_women = set()
    same_country_required = accent == "english"

    for gap in range(6):
        for woman_idx, woman in women.iterrows():
            if woman_idx in taken_women:
                continue

            # Unused males whose age gap to this female equals the current round.
            eligible = ((men['age'] - woman['age']).abs() == gap) & ~men.index.isin(taken_men)
            if same_country_required:
                eligible = eligible & (men['country'] == woman['country'])

            candidates = men[eligible]
            if candidates.empty:
                continue

            man_idx = candidates.index[0]  # first candidate in table order
            matched_rows.extend([woman, men.loc[man_idx]])
            taken_women.add(woman_idx)
            taken_men.add(man_idx)

# Rows alternate female, male for every matched pair.
balanced_df = pd.DataFrame(matched_rows).reset_index(drop=True)

In [110]:
# Per-(accent, sex) row counts of the balanced table.
grouped = (
    balanced_df.groupby(['native_language', 'sex'])
    .size()
    .to_frame('count')
    .reset_index()
)

# Same grouped bar chart as before, now on the matched pairs.
fig = px.bar(
    grouped,
    x='native_language', y='count',
    color='sex', barmode='group',
    title='Number of Samples by Accent and Gender',
)
fig.show()

In [111]:
# Expand the bare file stem into a full .mp3 path under english_accent_audio/.
# Not idempotent: re-running this cell prefixes the path a second time.
balanced_df['filename'] = (
    storage_path + 'english_accent_audio/'
    + balanced_df['filename']
    + '.mp3'
)

In [112]:
# Rename the source-specific columns to gender / accent / audio_file.
balanced_df = balanced_df.rename(
    columns={
        'sex': 'gender',
        'native_language': 'accent',
        'filename': 'audio_file',
    }
)

In [113]:
# Tag every row with the dataset it came from.
balanced_df = balanced_df.assign(source='english_accents')

In [114]:
# Show the column labels currently present on balanced_df.
balanced_df.columns

Index(['age', 'age_onset', 'birthplace', 'audio_file', 'accent', 'gender',
       'speakerid', 'country', 'file_missing?', 'file_found', 'city',
       'city_nums', 'state', 'source'],
      dtype='object')

In [115]:
# Independent copy of the balanced table holding only the listed columns.
english_accents = balanced_df.loc[:, [
    'age',
    'age_onset',
    'birthplace',
    'audio_file',
    'accent',
    'gender',
    'speakerid',
    'country',
    'city',
    'city_nums',
    'state',
    'source',
]].copy()

In [184]:
# Keep only the rows whose accent is 'english'.
english_accents = english_accents.loc[english_accents['accent'].eq('english')]

In [185]:
# Display the filtered english_accents frame.
english_accents

Unnamed: 0,age,age_onset,birthplace,audio_file,accent,gender,speakerid,country,city,city_nums,state,source
192,35.0,0.0,"davenport, iowa, usa",/work/bbc6523/diverse_voices/english_accent_au...,english,female,62,usa,davenport,3,iowa,english_accents
193,35.0,0.0,"oakland, california, usa",/work/bbc6523/diverse_voices/english_accent_au...,english,male,535,usa,oakland,3,california,english_accents
194,23.0,0.0,"miami, florida, usa",/work/bbc6523/diverse_voices/english_accent_au...,english,female,63,usa,miami,3,florida,english_accents
195,23.0,0.0,"west palm beach, florida, usa",/work/bbc6523/diverse_voices/english_accent_au...,english,male,497,usa,west palm beach,3,florida,english_accents
196,18.0,0.0,"toronto, ontario, canada",/work/bbc6523/diverse_voices/english_accent_au...,english,female,64,canada,toronto,3,ontario,english_accents
...,...,...,...,...,...,...,...,...,...,...,...,...
657,43.0,0.0,"syracuse, new york, usa",/work/bbc6523/diverse_voices/english_accent_au...,english,male,1555,usa,syracuse,3,new york,english_accents
658,38.0,0.0,"san leandro, california, usa",/work/bbc6523/diverse_voices/english_accent_au...,english,female,2170,usa,san leandro,3,california,english_accents
659,43.0,0.0,"castro valley, california, usa",/work/bbc6523/diverse_voices/english_accent_au...,english,male,109,usa,castro valley,3,california,english_accents
660,48.0,0.0,"carthage, texas, usa",/work/bbc6523/diverse_voices/english_accent_au...,english,female,154,usa,carthage,3,texas,english_accents


## British dialects

In [201]:
import librosa

def add_audio_duration(row):
    """Return the duration, in seconds, of the audio file referenced by a row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose 'audio_file'
            entry is the path of the audio file to measure.

    Returns:
        float: number of decoded samples divided by the sample rate.
    """
    # sr=None keeps the file's native sample rate. The default (22050 Hz)
    # would resample every clip, which is wasted work when only the length
    # is needed.
    audio, sr = librosa.load(row['audio_file'], sr=None)
    return len(audio) / sr

In [202]:
# Load the British-dialects table from the data directory.
british_dialects = pd.read_csv(storage_path + '/data/' + 'british_dialects.csv')

In [200]:


def _load_dialect_split(accent, gender):
    """Load one accent/gender CSV and attach labels plus full audio paths.

    Args:
        accent: Accent name used in the CSV and audio directory names.
        gender: 'male' or 'female'; used in the file names and stored in the
            'gender' column.

    Returns:
        DataFrame with columns line_id, audio_file (full .wav path), text,
        gender and accent.
    """
    split_name = f'british_dialects_{accent}_{gender}'
    split = pd.read_csv(storage_path + '/data/' + split_name + '.csv')
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the frame returned by read_csv.
    split = split[['line_id', 'audio_file', 'text']].copy()
    split['gender'] = gender
    split['accent'] = accent
    split['audio_file'] = (
        storage_path + 'british_dialects_audio/' + split_name + '/'
        + split['audio_file'] + '.wav'
    )
    return split


MIN_DURATION_S = 4        # clips must be longer than this to be considered
MAX_DURATION_GAP_S = 1.0  # paired clips must differ by less than this

matched_rows = []
total_rows = 0
for accent in ['scottish', 'midlands', 'welsh', 'southern', 'northern']:
    df_male = _load_dialect_split(accent, 'male')
    df_female = _load_dialect_split(accent, 'female')
    total_rows += len(df_male) + len(df_female)

    # A pair needs the same line_id in both genders. Dropping unpaired lines
    # first avoids decoding audio that can never be matched (decoding is by
    # far the slowest step of this cell).
    shared_line_ids = set(df_male['line_id']) & set(df_female['line_id'])
    df_male = df_male[df_male['line_id'].isin(shared_line_ids)].copy()
    df_female = df_female[df_female['line_id'].isin(shared_line_ids)].copy()

    # add_audio_duration is defined in an earlier cell. A list comprehension
    # is used instead of DataFrame.apply so an empty frame is handled cleanly.
    df_female['audio_duration'] = [add_audio_duration(row) for _, row in df_female.iterrows()]
    df_male['audio_duration'] = [add_audio_duration(row) for _, row in df_male.iterrows()]

    df_male = df_male[df_male['audio_duration'] > MIN_DURATION_S]
    df_female = df_female[df_female['audio_duration'] > MIN_DURATION_S]

    # Each female row is visited exactly once, so only the males need
    # bookkeeping to guarantee that no clip is used twice.
    used_male_idx = set()

    for f_idx, f_row in df_female.iterrows():
        condition = (
            (df_male['line_id'] == f_row['line_id']) &
            (df_male['text'] == f_row['text']) &
            (~df_male.index.isin(used_male_idx)) &
            (abs(df_male['audio_duration'] - f_row['audio_duration']) < MAX_DURATION_GAP_S)
        )

        possible_matches = df_male[condition]

        if not possible_matches.empty:
            m_idx = possible_matches.index[0]  # take the first match
            matched_rows.append(f_row)
            matched_rows.append(df_male.loc[m_idx])
            used_male_idx.add(m_idx)

# Final matched, balanced DataFrame: built once, after all accents are processed.
# Rows alternate female, male for every matched pair.
balanced_df = pd.DataFrame(matched_rows).reset_index(drop=True)



KeyboardInterrupt: 

In [151]:
# Row count of balanced_df.
len(balanced_df)

8070

In [152]:
# Number of rows per (accent, gender) pair in the balanced table.
grouped = (
    balanced_df.groupby(['accent', 'gender'])
    .size()
    .rename('count')
    .reset_index()
)

# Grouped bar chart: accents on the x axis, one bar per gender.
fig = px.bar(
    grouped, x='accent', y='count', color='gender',
    barmode='group', title='Number of Samples by Accent and Gender',
)
fig.show()

In [203]:
# Tag every row with the dataset it came from.
british_dialects = british_dialects.assign(source='british_dialects')

In [204]:
# Display the british_dialects frame.
british_dialects

Unnamed: 0,line_id,audio_file,text,audio_duration,gender,accent,source
0,EN0003,/work/bbc6523/diverse_voices/british_dialects_...,These take the shape of a long round arch with...,8.192000,male,scottish,british_dialects
1,EN0049,/work/bbc6523/diverse_voices/british_dialects_...,The powers of the appointed chair are limited ...,7.082667,male,scottish,british_dialects
2,EN0017,/work/bbc6523/diverse_voices/british_dialects_...,The width of the coloured band increases as th...,6.058667,male,scottish,british_dialects
3,EN0015,/work/bbc6523/diverse_voices/british_dialects_...,Many complicated ideas about the rainbow have ...,4.864000,male,scottish,british_dialects
4,EN0031,/work/bbc6523/diverse_voices/british_dialects_...,Nuclear fusion on a large scale in an explosio...,8.192000,male,scottish,british_dialects
...,...,...,...,...,...,...,...
495,EN0012,/work/bbc6523/diverse_voices/british_dialects_...,Others have tried to explain the phenomenon ph...,5.888000,female,welsh,british_dialects
496,EN0031,/work/bbc6523/diverse_voices/british_dialects_...,Nuclear fusion on a large scale in an explosio...,10.752000,female,welsh,british_dialects
497,EN0008,/work/bbc6523/diverse_voices/british_dialects_...,Some have accepted it as a miracle without phy...,6.314667,female,welsh,british_dialects
498,EN1321,/work/bbc6523/diverse_voices/british_dialects_...,Pillow lavas are lavas that contain characteri...,10.837333,female,welsh,british_dialects


## Stereoset

In [226]:
# Read the gender stereoset table. This rebinds `df`, which earlier held the
# English-accents table.
df = pd.read_csv(f'{storage_path}data/gender_stereoset.csv')

In [227]:
import pandas as pd

# Step 1: keep only contexts that appear with exactly two distinct gender labels
gender_counts = df.groupby('context')['gender'].nunique()
valid_contexts = gender_counts[gender_counts == 2].index
df = df[df['context'].isin(valid_contexts)]


# Step 2: subsample exactly one male and one female row per context
def balanced_sample(group):
    """Pick one male and one female row from a single-context group.

    Args:
        group: DataFrame holding all rows of one context; must have a
            'gender' column.

    Returns:
        DataFrame with the sampled male row followed by the sampled female
        row, or an empty DataFrame when either gender is missing. Sampling
        uses a fixed random_state so the selection is reproducible.
    """
    males = group[group['gender'] == 'male']
    females = group[group['gender'] == 'female']
    if males.empty or females.empty:
        return pd.DataFrame()  # skip: not enough rows per gender
    return pd.concat([
        males.sample(n=1, random_state=42),
        females.sample(n=1, random_state=42),
    ])


# Iterate over the groups explicitly instead of using groupby(...).apply(...).
# Iteration always yields full sub-frames, so the 'context' column is kept.
# GroupBy.apply operating on the grouping column is deprecated in recent
# pandas releases and would eventually drop that column from the result.
sampled_pairs = [balanced_sample(group) for _, group in df.groupby('context')]
sampled_pairs = [pair for pair in sampled_pairs if not pair.empty]

stereoset_sampled = (
    pd.concat(sampled_pairs).reset_index(drop=True)
    if sampled_pairs
    else df.iloc[0:0].reset_index(drop=True)  # nothing to sample: empty frame, same columns
)






In [229]:
def generate_file_name(row):
    """Build the stereoset .wav path for a row.

    The row's 'id' is rendered as text and left-padded with zeros to at least
    five characters, then joined with the row's 'speaker' as
    `<storage_path>stereoset_audio/<padded id>_<speaker>.wav`.
    """
    padded_id = str(row['id']).rjust(5, '0')
    return storage_path + 'stereoset_audio/' + padded_id + '_' + row['speaker'] + '.wav'

# Derive the audio path for each row, then measure each clip's duration in
# seconds (the duration helper reads the 'audio_file' column set just above).
stereoset_sampled['audio_file'] = stereoset_sampled.apply(generate_file_name, axis=1)
stereoset_sampled['audio_duration'] = stereoset_sampled.apply(add_audio_duration, axis=1)


In [231]:
# Keep clips longer than 3 seconds. .copy() makes the result an independent
# frame, so later column assignments on it do not raise
# SettingWithCopyWarning.
stereoset_sampled = stereoset_sampled[stereoset_sampled['audio_duration'] > 3].copy()

In [230]:
# How many clips are longer than 3.5 seconds.
int((stereoset_sampled['audio_duration'] > 3.5).sum())

309

In [232]:
# Tag every row with the dataset it came from. assign() returns a new frame,
# which avoids the SettingWithCopyWarning raised when a column is set in place
# on a frame obtained by boolean filtering.
stereoset_sampled = stereoset_sampled.assign(source='stereoset_speech_gender')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [233]:
# Rename 'context' to 'text'.
stereoset_sampled = stereoset_sampled.rename({'context': 'text'}, axis='columns')

In [234]:
# Stack the three sources into one table. ignore_index=True gives the result a
# fresh 0..n-1 index; otherwise each source keeps its own row labels and the
# combined index contains duplicates, which makes label-based lookups ambiguous.
full_df = pd.concat(
    [english_accents, british_dialects, stereoset_sampled],
    ignore_index=True,
)

In [236]:
# Write the combined table to the data directory, without the index column.
full_df.to_csv(
    path_or_buf=storage_path + 'data/full_diverse_voices.csv',
    index=False,
)

In [235]:
# Rows contributed by each source dataset.
full_df['source'].value_counts()

source
british_dialects           500
english_accents            470
stereoset_speech_gender    463
Name: count, dtype: int64

In [174]:
# Number of distinct values in the 'id' column (a missing value counts as one).
stereoset_sampled['id'].nunique(dropna=False)

2736

In [181]:
# Look up the audio path stored under row label 2735.
# NOTE(review): this is a label-based lookup, not positional. The duration
# filter in an earlier cell removes rows without renumbering the index, so
# label 2735 may no longer exist on a fresh Restart & Run All. Confirm the
# intent; `.iloc[...]` may be what is wanted.
stereoset_sampled['audio_file'][2735]

'/work/bbc6523/diverse_voices/stereoset_audio/01911_en-US-SaraNeural.wav'