In [None]:
# Directory names for the rebalanced dataset.
# NOTE(review): neither name is referenced by any cell visible in this notebook — the
# loader lists 'train'/'val'/'test' directly and the export writes under 'output/'.
# Confirm whether these should replace those hard-coded paths or be removed.
input_dir = 'rebalanced/input'
output_dir = 'rebalanced/output'

In [1]:
import os
import pandas as pd
import json

# Load every annotated sample of each split into `df[split]`, build the combined
# `full_df`, derive the label/analysis column lists and print label statistics.
df = {}

all_dataframes = []
# The loop variable is `split_name` (not `type`) so the builtin `type` stays usable in later cells.
for split_name in ['train', 'val', 'test']:
    dataframes = []
    # Sorted so the row order of the resulting frames is the same on every run;
    # os.listdir() returns directory entries in arbitrary order.
    for entry in sorted(os.listdir(split_name)):
        if not entry.endswith('.json'):
            continue
        base_path = os.path.join(split_name, os.path.splitext(entry)[0])

        # Explicit encoding so reading does not depend on the platform default.
        # Assumes the sample files are UTF-8 — TODO confirm.
        with open(f'{base_path}.json', 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Use the contents of the text file instead of the json file
        if os.path.isfile(f'{base_path}.txt'):
            with open(f'{base_path}.txt', 'r', encoding='utf-8') as text_file:
                data['text'] = text_file.read().strip()

        dataframe = pd.json_normalize(data)
        dataframes.append(dataframe)
        all_dataframes.append(dataframe)

    if not dataframes:
        # pd.concat would raise a ValueError here anyway; this one names the directory.
        raise ValueError(f"No .json samples found in '{split_name}'")
    df[split_name] = pd.concat(dataframes, ignore_index=True)


full_df = pd.concat(all_dataframes, ignore_index=True)

# Column groups are discovered from their name prefixes on the combined frame.
label_columns = [col for col in full_df.columns if col.startswith('label_')]
analysis_columns = [col for col in full_df.columns if col.startswith('analysis_')]
categories = [col.replace('label_', '') for col in label_columns]
text_column = 'text'

print(f"Categories: {categories}")
print(f"Labels: {label_columns}")
print(f"Analysis: {analysis_columns}")
print(f"Input: {text_column}")

for split_name, dataframe in df.items():
    print(f"\nRows ({split_name}): {len(dataframe)}")
    for category in categories:
        label = f"label_{category}"
        # Explicit comparisons: a missing value matches neither True nor False,
        # so it is counted in neither bucket.
        bias = int((dataframe[label] == True).sum())
        unbiased = int((dataframe[label] == False).sum())
        print(f"\t{category}: {bias} biased, {unbiased} unbiased")

    # Vectorised "any label set" per row; astype(bool) applies the same truthiness
    # the previous row-wise any()/all() lambdas used.
    has_bias = dataframe[label_columns].astype(bool).any(axis=1)
    non_neutral = has_bias.sum()
    neutral = (~has_bias).sum()
    print(f'\tHas at least one bias category: {non_neutral} ({non_neutral/len(dataframe):.2%})')
    print(f'\tHas no bias categories: {neutral} ({neutral/len(dataframe):.2%})')

has_bias = full_df[label_columns].astype(bool).any(axis=1)
non_neutral = has_bias.sum()
neutral = (~has_bias).sum()

print(f'\nFULL-DATASET Rows: {len(full_df)}')
print(f'FULL-DATASET Has at least one bias category: {non_neutral} ({non_neutral/len(full_df):.2%})')
print(f'FULL-DATASET Has no bias categories: {neutral} ({neutral/len(full_df):.2%})')


Categories: ['age', 'disability', 'masculine', 'feminine', 'racial', 'sexuality', 'general']
Labels: ['label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general']
Analysis: ['analysis_age', 'analysis_disability', 'analysis_masculine', 'analysis_feminine', 'analysis_racial', 'analysis_sexuality', 'analysis_general']
Input: text

Rows (train): 520
	age: 40 biased, 480 unbiased
	disability: 41 biased, 479 unbiased
	masculine: 38 biased, 482 unbiased
	feminine: 39 biased, 481 unbiased
	racial: 39 biased, 481 unbiased
	sexuality: 39 biased, 481 unbiased
	general: 41 biased, 479 unbiased
	Has at least one bias category: 250 (48.08%)
	Has no bias categories: 270 (51.92%)

Rows (val): 1050
	age: 80 biased, 970 unbiased
	disability: 81 biased, 969 unbiased
	masculine: 81 biased, 969 unbiased
	feminine: 79 biased, 971 unbiased
	racial: 77 biased, 973 unbiased
	sexuality: 81 biased, 969 unbiased
	general: 81 biased, 969 unbiased
	Has at

# Plot Functions

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import json


def plot_by_category(dataframe, x_ideal=200):
    """Plot biased-sample counts per label category as stacked horizontal bars.

    Each bar is one category (``label_`` removed from the column name), split into
    real and synthetic samples according to the ``synthetic`` column. Bars are
    ordered by their real-sample count and a red dashed reference line is drawn
    at ``x_ideal``. The legend shows the total number of real and synthetic rows.

    Relies on the notebook-level ``label_columns`` list and on ``plt``.

    Args:
        dataframe: frame with a ``synthetic`` column and every column in ``label_columns``.
        x_ideal: x position of the vertical reference line.
    """
    melted_df = dataframe.melt(id_vars='synthetic', value_vars=label_columns, var_name='categories')
    melted_df = melted_df[melted_df['value'] == 1]

    # Every row is either "has a label" or "has none", so the per-group totals shown
    # in the legend are simply the row counts of each group. Counting directly also
    # works when one of the groups is empty.
    total_real_str = '{:,.0f}'.format(int((dataframe['synthetic'] == False).sum()))
    total_synthetic_str = '{:,.0f}'.format(int((dataframe['synthetic'] == True).sum()))

    pivot_df = melted_df.pivot_table(index='categories', columns='synthetic', aggfunc='size', fill_value=0)
    # Guarantee both columns in a fixed order: the sort key, the colour list and the
    # legend labels below all assume [real, synthetic], even if one group has no rows.
    pivot_df = pivot_df.reindex(columns=[False, True], fill_value=0)
    pivot_df.index = pivot_df.index.str.replace('label_', '')
    pivot_df = pivot_df.sort_values(by=False, ascending=True)
    pivot_df = pivot_df.rename(columns={True: 'Synthetic', False: 'Real'})

    # Create the axes explicitly and hand them to pandas: DataFrame.plot without
    # `ax` opens its own figure, which left the requested figure empty and unused.
    _, ax = plt.subplots(figsize=(20, 20))
    pivot_df.plot(kind='barh', stacked=True, color=['#6495ED', '#FFA500'], ax=ax)

    ax.set_title('Verified Biased Job Descriptions', fontsize=18)
    ax.set_xlabel('', fontsize=16)
    ax.set_ylabel('', fontsize=16)
    ax.tick_params(axis='both', labelsize=16)

    ax.legend([f'{total_real_str} Real', f'{total_synthetic_str} Synthetic'],
              loc='center left', bbox_to_anchor=(1, 0.5), fontsize=16, title='')

    # Write each segment's count at its centre; empty segments get no label.
    for rect in ax.patches:
        width, height = rect.get_width(), rect.get_height()
        if width <= 0:
            continue
        x, y = rect.get_xy()
        ax.text(width / 2 + x, y + height / 2,
                '{:.0f}'.format(width),
                ha='center',
                va='center',
                color='black',
                fontsize=18)

    # Leave 25% headroom beyond the longest *stacked* bar (row total, not the largest
    # single segment) and keep the reference line inside the visible range.
    longest_bar = pivot_df.sum(axis=1).max()
    ax.set_xlim(0, max(longest_bar, x_ideal) * 1.25)

    # Draw the vertical reference line at x = x_ideal
    ax.axvline(x=x_ideal, color="red", linestyle='--')

    plt.show()


def plot_neutral_vs_bias(dataframe):
    """Plot how many rows carry at least one bias label versus none.

    Draws two stacked horizontal bars ("Bias" and "Neutral"), each split into real
    and synthetic rows according to the ``synthetic`` column. The bar names include
    the row count of the bar, the title the overall count, and the legend the total
    number of real and synthetic rows.

    Relies on the notebook-level ``label_columns`` list and on ``plt`` / ``pd``.

    Args:
        dataframe: frame with a ``synthetic`` column and every column in ``label_columns``.
    """
    # Vectorised per-row flag; astype(bool) applies plain truthiness to each label.
    # Unlike a row-wise apply, this also yields scalar counts for an empty group.
    has_bias = dataframe[label_columns].astype(bool).any(axis=1)
    is_real = dataframe['synthetic'] == False
    is_synthetic = dataframe['synthetic'] == True

    non_neutral_real = int((has_bias & is_real).sum())
    neutral_real = int((~has_bias & is_real).sum())
    non_neutral_synthetic = int((has_bias & is_synthetic).sum())
    neutral_synthetic = int((~has_bias & is_synthetic).sum())

    total_real_str = '{:,.0f}'.format(non_neutral_real + neutral_real)
    total_synthetic_str = '{:,.0f}'.format(non_neutral_synthetic + neutral_synthetic)
    neutral_str = '{:,.0f} Neutral'.format(neutral_synthetic + neutral_real)
    bias_cont_str = '{:,.0f} Bias'.format(non_neutral_synthetic + non_neutral_real)
    total_str = '{:,.0f}'.format(non_neutral_synthetic + neutral_synthetic + neutral_real + non_neutral_real)

    # Build the plotted table directly with a fixed row and column order. Grouping on
    # the formatted bar names sorted them as strings, so the row order depended on
    # the digits of the counts.
    counts = pd.DataFrame(
        {'Real': [non_neutral_real, neutral_real],
         'Synthetic': [non_neutral_synthetic, neutral_synthetic]},
        index=[bias_cont_str, neutral_str],
    )

    # Create the axes explicitly and hand them to pandas: DataFrame.plot without
    # `ax` opens its own figure, which left the requested figure empty and unused.
    _, ax = plt.subplots(figsize=(20, 10))
    counts.plot.barh(stacked=True, color=['#6495ED', '#FFA500'], ax=ax)

    ax.set_title(f'{total_str} Verified Job Descriptions', fontsize=18)
    ax.set_xlabel('', fontsize=16)
    ax.set_ylabel('', fontsize=16)
    ax.tick_params(axis='both', labelsize=16)
    ax.legend([f'{total_real_str} Real', f'{total_synthetic_str} Synthetic'],
              loc='center left', bbox_to_anchor=(1, 0.5), fontsize=16, title='')

    # Label every segment with its own width, so a label can never be attached to
    # the wrong bar; empty segments get no label.
    for patch in ax.patches:
        width, height = patch.get_width(), patch.get_height()
        if width <= 0:
            continue
        x, y = patch.get_xy()
        ax.text(width / 2 + x, y + height / 2,
                '{:,.0f}'.format(width),
                ha='center',
                va='center',
                color='black',
                fontsize=18)

    plt.show()


def plot_by_models(dataframe):
    """Plot how many synthetic rows each generating model contributed.

    Only rows whose ``synthetic`` column is True are considered. For each of them
    the ``metadata`` value is parsed as JSON and its ``model`` entry is reduced to
    the part after the last ``:``. One horizontal bar is drawn per model name,
    ordered by name and labelled with its row count.

    Relies on ``plt``, ``pd`` and ``json`` from the notebook namespace.

    Args:
        dataframe: frame with ``synthetic`` and ``metadata`` columns.
    """
    synthetic_rows = dataframe[dataframe['synthetic'] == True]

    model_names = [
        json.loads(meta)['model'].split(':')[-1]
        for meta in synthetic_rows['metadata']
    ]

    # Sorted by model name. Bars and labels are both taken from this one series, so
    # they cannot get out of step (previously bars were name-sorted while labels
    # followed first-seen order).
    model_counts = pd.Series(model_names, dtype=object).value_counts().sort_index()

    if model_counts.empty:
        # Nothing to draw; plotting an empty series would fail inside pandas.
        print('No synthetic rows to plot')
        return

    _, ax = plt.subplots(figsize=(10, 5))
    model_counts.plot.barh(color='#6495ED', ax=ax)

    ax.set_title('Verified Synthetic Job Descriptions', fontsize=18)
    ax.set_xlabel('', fontsize=16)
    ax.set_ylabel('', fontsize=16)
    ax.tick_params(axis='both', labelsize=16)

    # Display the count in the middle of each bar
    for patch, count in zip(ax.patches, model_counts):
        width, height = patch.get_width(), patch.get_height()
        x, y = patch.get_xy()
        ax.text(width / 2 + x, y + height / 2,
                '{:,.0f}'.format(count),
                ha='center',
                va='center',
                color='black',
                fontsize=18)
    plt.show()

In [3]:
# Show which columns the training split ended up with after loading.
df['train'].columns

In [4]:
# Write each split to output/verified-<split>.parquet.
# Create the target directory first so a fresh checkout does not fail on the first write.
os.makedirs('output', exist_ok=True)
for split_name in ('train', 'val', 'test'):
    df[split_name].to_parquet(f'output/verified-{split_name}.parquet')

# Full Dataset

In [5]:
# Per-category biased counts over all splits combined; reference line at 200.
plot_by_category(full_df, 200)

In [6]:
# Rows with at least one bias label versus rows with none, over all splits combined.
plot_neutral_vs_bias(full_df)

In [7]:
# Synthetic rows per generating model, over all splits combined.
plot_by_models(full_df)

# Train Dataset

In [8]:
# Per-category biased counts for the train split; reference line at 40.
plot_by_category(df['train'], 40)

In [9]:
#plot_neutral_vs_bias(df['train'])

In [10]:
#plot_by_models(df['train'])

# Val Dataset

In [11]:
# Per-category biased counts for the val split; reference line at 80.
plot_by_category(df['val'], 80)

In [12]:
#plot_neutral_vs_bias(df['val'])

In [13]:
#plot_by_models(df['val'])

# Test Dataset

In [14]:
# Per-category biased counts for the test split; reference line at 80.
plot_by_category(df['test'], 80)

In [15]:
#plot_neutral_vs_bias(df['test'])

In [78]:
#plot_by_models(df['test'])