In [2]:
import pandas as pd
from prompts import *

import plotly.express as px
import glob


# Notebook-wide display setting: passing None as max_colwidth makes pandas
# show cell contents without truncating them.
pd.set_option('display.max_colwidth', None)

import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# so warnings raised by later cells will not be shown.
warnings.filterwarnings("ignore")

In [3]:
def parse_adjectives(row, col, adjectivelist):
    """Return the candidate terms that occur in one response cell.

    Matching is a case-insensitive substring test of each candidate against
    the text stored in ``row[col]``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by column name.
        col: Name of the column that holds the response text.
        adjectivelist: Iterable of candidate terms; duplicates are ignored.

    Returns:
        list: The matched terms, spelled exactly as given in
        ``adjectivelist`` and ordered by their first appearance there.
        An empty list when the cell does not contain a string.
    """
    text = row[col]
    # Empty CSV cells are read back as NaN (a float), which has no .lower();
    # a non-string cell cannot contain any term, so report no matches.
    if not isinstance(text, str):
        return []

    # Lower-case the response once instead of once per candidate.
    text = text.lower()

    # dict.fromkeys de-duplicates while keeping the caller's order, so the
    # result no longer depends on set iteration order.
    return [adj for adj in dict.fromkeys(adjectivelist) if adj.lower() in text]

# Binary setup

In [None]:
all_files_binary = {}
for file in glob.glob('/work/bbc6523/diverse_voices/british_dialects_*.csv'):
    # Single-accent files are named british_dialects_<accent>_<gender>.csv,
    # i.e. exactly four "_"-separated parts; anything else is skipped.
    if len(file.split('/')[-1].split('_')) != 4:
        continue

    df = pd.read_csv(file)
    print(file)

    # A model counts as present when at least one column name mentions it.
    column_names = list(df.columns)
    qwen_there = any('Qwen' in name for name in column_names)
    meralion_there = any('MERaLiON' in name for name in column_names)
    phi_there = any('Phi' in name for name in column_names)

    # Under each file path, list the models that have no columns yet.
    for present, label in (
        (qwen_there, 'Qwen'),
        (meralion_there, 'MERaLiON'),
        (phi_there, 'Phi'),
    ):
        if not present:
            print(label)


/work/bbc6523/diverse_voices/british_dialects_scottish_male.csv
/work/bbc6523/diverse_voices/british_dialects_midlands_male.csv
/work/bbc6523/diverse_voices/british_dialects_midlands_female.csv
/work/bbc6523/diverse_voices/british_dialects_welsh_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_female.csv
Qwen
MERaLiON
Phi
/work/bbc6523/diverse_voices/british_dialects_southern_male.csv
Qwen
MERaLiON
Phi
/work/bbc6523/diverse_voices/british_dialects_scottish_female.csv
/work/bbc6523/diverse_voices/british_dialects_northern_male.csv
Qwen
MERaLiON
Phi
/work/bbc6523/diverse_voices/british_dialects_northern_female.csv
Qwen
MERaLiON
Phi
/work/bbc6523/diverse_voices/british_dialects_welsh_female.csv


## Profession Comparison

In [57]:
# Flat, whitespace-stripped list of every profession; used by the plotting
# cells as the category order of the x axis.
profession_list = [prof.strip() for sublist in PROFESSIONS_BINARY['english'] for prof in sublist]

all_dfs = []
for file in glob.glob('/work/bbc6523/diverse_voices/british_dialects_*.csv'):
    # british_dialects_<accent>_<gender>.csv -> exactly four "_"-separated
    # parts; file names with more parts are skipped.
    name_parts = file.split('/')[-1].split('_')
    if len(name_parts) == 4:
        print(file)
        # Read each CSV once, not once per prompt index.
        df = pd.read_csv(file)
        gender = name_parts[-1][:-4]  # drop the ".csv" suffix
        accent = name_parts[2]

        # 22 prompt indices are probed; presumably this equals
        # len(PROFESSIONS_BINARY['english']) -- TODO confirm.
        for i in range(22):
            col_name = 'model_response_profession_binary_' + str(i) + '_Phi-4-multimodal-instruct'
            # Files the model has not been run on yet lack the column.
            if col_name in df.columns:

                # Per row: which options of prompt i appear in the response.
                df['profession'] = df.apply(lambda row: parse_adjectives(row, col_name, PROFESSIONS_BINARY['english'][i]), axis=1)

                # One row per matched profession, with its share of all
                # matches for this prompt and file.
                df_exploded = df.explode('profession')['profession'].value_counts().reset_index()
                df_exploded['percentage'] = df_exploded['count'] / df_exploded['count'].sum()
                df_exploded['gender'] = gender
                df_exploded['accent'] = accent
                all_dfs.append(df_exploded)


full_df = pd.concat(all_dfs)


/work/bbc6523/diverse_voices/british_dialects_scottish_male.csv
/work/bbc6523/diverse_voices/british_dialects_midlands_male.csv
/work/bbc6523/diverse_voices/british_dialects_midlands_female.csv
/work/bbc6523/diverse_voices/british_dialects_welsh_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_male.csv
/work/bbc6523/diverse_voices/british_dialects_scottish_female.csv
/work/bbc6523/diverse_voices/british_dialects_northern_male.csv
/work/bbc6523/diverse_voices/british_dialects_northern_female.csv
/work/bbc6523/diverse_voices/british_dialects_welsh_female.csv


In [64]:
# One grouped bar chart per gender: profession share, coloured by accent.
for gender in full_df.gender.unique():
    # .assign returns a new frame, so nothing is written into a filtered
    # slice of full_df. The ordered Categorical fixes the x-axis order to
    # profession_list; labels not in that list become NaN.
    subset = full_df[full_df['gender'] == gender].assign(
        profession=lambda frame: pd.Categorical(frame['profession'], categories=profession_list, ordered=True)
    )

    fig = px.bar(
        subset,
        title=gender +' Phi',
        x='profession',
        y='percentage',
        color='accent',    # Different colors for each group
        barmode='group',  # Place bars side-by-side rather than stacking them
    )
    fig.show()


In [65]:
# One grouped bar chart per accent: profession share, coloured by gender.
for accent in full_df.accent.unique():
    # .assign returns a new frame, so nothing is written into a filtered
    # slice of full_df. The ordered Categorical fixes the x-axis order to
    # profession_list; labels not in that list become NaN.
    subset = full_df[full_df['accent'] == accent].assign(
        profession=lambda frame: pd.Categorical(frame['profession'], categories=profession_list, ordered=True)
    )

    fig = px.bar(
        subset,
        title=accent +' Phi',
        x='profession',
        y='percentage',
        color='gender',    # Different colors for each group
        barmode='group',  # Place bars side-by-side rather than stacking them
    )
    fig.show()


In [67]:
# Flat, whitespace-stripped list of every profession; used by the plotting
# cells as the category order of the x axis.
profession_list = [prof.strip() for sublist in PROFESSIONS_BINARY['english'] for prof in sublist]

all_dfs = []
for file in glob.glob('/work/bbc6523/diverse_voices/british_dialects_*.csv'):
    # british_dialects_<accent>_<gender>.csv -> exactly four "_"-separated
    # parts; file names with more parts are skipped.
    name_parts = file.split('/')[-1].split('_')
    if len(name_parts) == 4:
        print(file)
        # Read each CSV once, not once per prompt index.
        df = pd.read_csv(file)
        gender = name_parts[-1][:-4]  # drop the ".csv" suffix
        accent = name_parts[2]

        # 22 prompt indices are probed; presumably this equals
        # len(PROFESSIONS_BINARY['english']) -- TODO confirm.
        for i in range(22):
            col_name = 'model_response_profession_binary_' + str(i) + '_MERaLiON-AudioLLM-Whisper-SEA-LION'
            # Files the model has not been run on yet lack the column.
            if col_name in df.columns:

                # Per row: which options of prompt i appear in the response.
                df['profession'] = df.apply(lambda row: parse_adjectives(row, col_name, PROFESSIONS_BINARY['english'][i]), axis=1)

                # One row per matched profession, with its share of all
                # matches for this prompt and file.
                df_exploded = df.explode('profession')['profession'].value_counts().reset_index()
                df_exploded['percentage'] = df_exploded['count'] / df_exploded['count'].sum()
                df_exploded['gender'] = gender
                df_exploded['accent'] = accent
                all_dfs.append(df_exploded)


full_df = pd.concat(all_dfs)


/work/bbc6523/diverse_voices/british_dialects_scottish_male.csv
/work/bbc6523/diverse_voices/british_dialects_midlands_male.csv
/work/bbc6523/diverse_voices/british_dialects_midlands_female.csv
/work/bbc6523/diverse_voices/british_dialects_welsh_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_male.csv
/work/bbc6523/diverse_voices/british_dialects_scottish_female.csv
/work/bbc6523/diverse_voices/british_dialects_northern_male.csv
/work/bbc6523/diverse_voices/british_dialects_northern_female.csv
/work/bbc6523/diverse_voices/british_dialects_welsh_female.csv


In [68]:
# One grouped bar chart per gender: profession share, coloured by accent.
# full_df here holds the MERaLiON results built in the preceding cell, so
# the title names that model (it previously said "Phi").
for gender in full_df.gender.unique():
    # .assign returns a new frame, so nothing is written into a filtered
    # slice of full_df. The ordered Categorical fixes the x-axis order to
    # profession_list; labels not in that list become NaN.
    subset = full_df[full_df['gender'] == gender].assign(
        profession=lambda frame: pd.Categorical(frame['profession'], categories=profession_list, ordered=True)
    )

    fig = px.bar(
        subset,
        title=gender +' MERaLiON',
        x='profession',
        y='percentage',
        color='accent',    # Different colors for each group
        barmode='group',  # Place bars side-by-side rather than stacking them
    )
    fig.show()


In [69]:
# One grouped bar chart per accent: profession share, coloured by gender.
# full_df here holds the MERaLiON results built in the preceding cell, so
# the title names that model (it previously said "Phi").
for accent in full_df.accent.unique():
    # .assign returns a new frame, so nothing is written into a filtered
    # slice of full_df. The ordered Categorical fixes the x-axis order to
    # profession_list; labels not in that list become NaN.
    subset = full_df[full_df['accent'] == accent].assign(
        profession=lambda frame: pd.Categorical(frame['profession'], categories=profession_list, ordered=True)
    )

    fig = px.bar(
        subset,
        title=accent +' MERaLiON',
        x='profession',
        y='percentage',
        color='gender',    # Different colors for each group
        barmode='group',  # Place bars side-by-side rather than stacking them
    )
    fig.show()


In [73]:
# Flat, whitespace-stripped list of every profession; used by the plotting
# cells as the category order of the x axis.
profession_list = [prof.strip() for sublist in PROFESSIONS_BINARY['english'] for prof in sublist]

all_dfs = []
for file in glob.glob('/work/bbc6523/diverse_voices/british_dialects_*.csv'):
    # british_dialects_<accent>_<gender>.csv -> exactly four "_"-separated
    # parts; file names with more parts are skipped.
    name_parts = file.split('/')[-1].split('_')
    if len(name_parts) == 4:
        print(file)
        # Read each CSV once, not once per prompt index.
        df = pd.read_csv(file)
        gender = name_parts[-1][:-4]  # drop the ".csv" suffix
        accent = name_parts[2]

        # 22 prompt indices are probed; presumably this equals
        # len(PROFESSIONS_BINARY['english']) -- TODO confirm.
        for i in range(22):
            col_name = 'model_response_profession_binary_' + str(i) + '_Qwen2-Audio-7B-Instruct'
            # Files the model has not been run on yet lack the column.
            if col_name in df.columns:

                # Per row: which options of prompt i appear in the response.
                df['profession'] = df.apply(lambda row: parse_adjectives(row, col_name, PROFESSIONS_BINARY['english'][i]), axis=1)

                # One row per matched profession, with its share of all
                # matches for this prompt and file.
                df_exploded = df.explode('profession')['profession'].value_counts().reset_index()
                df_exploded['percentage'] = df_exploded['count'] / df_exploded['count'].sum()
                df_exploded['gender'] = gender
                df_exploded['accent'] = accent
                all_dfs.append(df_exploded)


full_df = pd.concat(all_dfs)


/work/bbc6523/diverse_voices/british_dialects_scottish_male.csv
/work/bbc6523/diverse_voices/british_dialects_midlands_male.csv
/work/bbc6523/diverse_voices/british_dialects_midlands_female.csv
/work/bbc6523/diverse_voices/british_dialects_welsh_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_male.csv
/work/bbc6523/diverse_voices/british_dialects_scottish_female.csv
/work/bbc6523/diverse_voices/british_dialects_northern_male.csv
/work/bbc6523/diverse_voices/british_dialects_northern_female.csv
/work/bbc6523/diverse_voices/british_dialects_welsh_female.csv


In [None]:
# One grouped bar chart per gender: profession share, coloured by accent.
# full_df here holds the Qwen results built in the preceding cell, so the
# title names that model (it previously said "Phi").
for gender in full_df.gender.unique():
    # .assign returns a new frame, so nothing is written into a filtered
    # slice of full_df. The ordered Categorical fixes the x-axis order to
    # profession_list; labels not in that list become NaN.
    subset = full_df[full_df['gender'] == gender].assign(
        profession=lambda frame: pd.Categorical(frame['profession'], categories=profession_list, ordered=True)
    )

    fig = px.bar(
        subset,
        title=gender +' Qwen',
        x='profession',
        y='percentage',
        color='accent',    # Different colors for each group
        barmode='group',  # Place bars side-by-side rather than stacking them
    )
    fig.show()


In [None]:
# One grouped bar chart per accent: profession share, coloured by gender.
# full_df here holds the Qwen results built in the preceding cell, so the
# title names that model (it previously said "Phi").
for accent in full_df.accent.unique():
    # .assign returns a new frame, so nothing is written into a filtered
    # slice of full_df. The ordered Categorical fixes the x-axis order to
    # profession_list; labels not in that list become NaN.
    subset = full_df[full_df['accent'] == accent].assign(
        profession=lambda frame: pd.Categorical(frame['profession'], categories=profession_list, ordered=True)
    )

    fig = px.bar(
        subset,
        title=accent +' Qwen',
        x='profession',
        y='percentage',
        color='gender',    # Different colors for each group
        barmode='group',  # Place bars side-by-side rather than stacking them
    )
    fig.show()


# Compare setup

In [101]:
# For every southern-vs-<dialect> comparison file, print its path followed by
# the name of each model that has no column in it yet.
for file in glob.glob('/work/bbc6523/diverse_voices/british_dialects_southern_*_*'):
    df = pd.read_csv(file)
    print(file)

    # A model counts as present when at least one column name mentions it.
    column_names = list(df.columns)
    qwen_there = any('Qwen' in name for name in column_names)
    meralion_there = any('MERaLiON' in name for name in column_names)
    phi_there = any('Phi' in name for name in column_names)

    for present, label in (
        (qwen_there, 'Qwen'),
        (meralion_there, 'MERaLiON'),
        (phi_there, 'Phi'),
    ):
        if not present:
            print(label)
    


/work/bbc6523/diverse_voices/british_dialects_southern_northern_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_scottish_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_midlands_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_midlands_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_northern_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_welsh_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_welsh_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_scottish_female.csv


In [9]:
def parse_answer(row):
    """Translate a raw A/B selection into 'standard', 'dialect' or 'both'.

    Which letter stands for the standard voice depends on ``row['accent']``:
    for the value 'standard-dialect' A is the standard voice and B the
    dialect; for any other value the letters are swapped. A selection that is
    neither 'A' nor 'B' is reported as 'both'.

    Args:
        row: Mapping-like record with 'accent' and 'selection' entries.

    Returns:
        str: 'standard', 'dialect' or 'both'.
    """
    if row['accent'] == 'standard-dialect':
        standard_letter, dialect_letter = 'A', 'B'
    else:
        standard_letter, dialect_letter = 'B', 'A'

    choice = row['selection']
    if choice == standard_letter:
        return 'standard'
    if choice == dialect_letter:
        return 'dialect'
    return 'both'
        
# Professions probed in the compare setup; position i corresponds to the
# column suffix "_compare_<i>_<model>".
profession_list = list(PROFESSIONS['english'])


for model in ['Qwen2-Audio-7B-Instruct', 'MERaLiON-AudioLLM-Whisper-SEA-LION', 'Phi-4-multimodal-instruct']:
    all_dfs = []
    for file in glob.glob('/work/bbc6523/diverse_voices/british_dialects_southern_*_*'):
        print(file)
        df = pd.read_csv(file)

        # File names look like british_dialects_southern_<dialect>_<gender>.csv.
        name_parts = file.split('/')[-1].split('_')
        gender = name_parts[-1][:-4]  # drop the ".csv" suffix
        dialect = name_parts[3]

        for i, profession in enumerate(profession_list):
            col_name = 'model_response_profession_compare_' + str(i) + '_' + model
            # Skip prompts this model has not answered for this file, as the
            # binary-setup cells do, instead of raising KeyError.
            if col_name not in df.columns:
                continue

            # Step 1: every presentation order and every distinct answer.
            accents = df['dialect_order'].unique()
            words = df[col_name].unique()

            # Step 2: all (order, answer) pairs, so pairs that never occur
            # still appear with a count of 0. The first level is named
            # 'accent', which is the column parse_answer reads.
            combinations = pd.MultiIndex.from_product([accents, words], names=['accent', col_name])

            # Step 3: count the rows for each pair.
            result = df.groupby(['dialect_order', col_name]).size().reindex(combinations, fill_value=0).reset_index(name='count')

            result['profession'] = profession
            result['gender'] = gender
            result['dialect'] = dialect

            result = result.rename(columns={col_name: 'selection'})
            all_dfs.append(result)

    full_df = pd.concat(all_dfs)

    # Map the raw A/B answer to standard / dialect / both.
    full_df['parsed_selection'] = full_df.apply(lambda row: parse_answer(row), axis=1)

    for gender in full_df['gender'].unique():
        subset = full_df[full_df['gender'] == gender]
        grouped = subset.groupby(['profession', 'parsed_selection'])['count'].sum().reset_index()
        # Share of each outcome within a profession, in percent.
        grouped['percentage'] = grouped.groupby('profession')['count'].transform(lambda x: x / x.sum() * 100)

        # One chart per model and gender: outcome shares per profession.
        fig = px.bar(
            grouped,
            x='profession',
            y='percentage',
            color='parsed_selection',
            barmode='group',
            title=model + ' ' + gender,
            color_discrete_map={'standard': 'blue', 'dialect': 'red', 'both': 'green'}
        )

        fig.show()


/work/bbc6523/diverse_voices/british_dialects_southern_northern_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_scottish_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_midlands_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_midlands_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_northern_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_welsh_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_welsh_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_scottish_female.csv


/work/bbc6523/diverse_voices/british_dialects_southern_northern_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_scottish_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_midlands_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_midlands_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_northern_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_welsh_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_welsh_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_scottish_female.csv


/work/bbc6523/diverse_voices/british_dialects_southern_northern_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_scottish_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_midlands_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_midlands_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_northern_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_welsh_male.csv
/work/bbc6523/diverse_voices/british_dialects_southern_welsh_female.csv
/work/bbc6523/diverse_voices/british_dialects_southern_scottish_female.csv


In [1]:
# Distinct gender labels in full_df.
# NOTE(review): the recorded run of this cell is In [1], i.e. before any cell
# had defined full_df, which is why its saved output is a NameError.
full_df['gender'].unique()

NameError: name 'full_df' is not defined

In [16]:
# Keep only the rows of full_df whose gender is 'female'.
subset = full_df[full_df['gender'] == 'female']

In [19]:
# Total count per (profession, parsed_selection) pair, as a flat table.
grouped = subset.groupby(['profession', 'parsed_selection'])['count'].sum().reset_index()

In [20]:
# Display the grouped counts table.
grouped

Unnamed: 0,profession,parsed_selection,count
0,CEO,both,56
1,CEO,dialect,80
2,CEO,standard,1040
3,CTO,both,1
4,CTO,dialect,327
...,...,...,...
108,teacher,dialect,20
109,teacher,standard,1066
110,writer,both,334
111,writer,dialect,68
