# Load Parquet

In [1]:
import os
from pathlib import Path

import pandas as pd

pd.set_option('display.float_format', '{:,.4f}'.format)

# Directory holding the benchmark dataset. Set the BENCHMARK_DATA_DIR environment
# variable to point somewhere else; the default is the path this notebook was
# originally run with. Named `data_dir` so the built-in `dir()` is not shadowed.
data_dir = Path(os.environ.get(
    'BENCHMARK_DATA_DIR',
    '/home/teveritt/Datasets/2024-mcm-everitt-ryan/datasets/benchmark',
))

df = pd.read_parquet(data_dir / 'job-bias-synthetic-human-benchmark.parquet.snappy')
print(f"Column Names: {df.columns.values}")
print(f"Columns: {len(df.columns)}")
print(f"Rows: {len(df)}")

In [2]:
# Group the columns by their name prefix.
label_columns = list(filter(lambda name: name.startswith('label_'), df.columns))
analysis_columns = list(filter(lambda name: name.startswith('analysis_'), df.columns))
# Category names are the label column names with 'label_' removed.
categories = list(map(lambda name: name.replace('label_', ''), label_columns))
text_column = 'text'

for caption, value in (
    ("Categories", categories),
    ("Labels", label_columns),
    ("Analysis", analysis_columns),
    ("Input", text_column),
):
    print(f"{caption}: {value}")

In [3]:
# Show the first three rows of the loaded frame
df.head(n=3)

In [4]:
# Show the job description text of the row labelled 0

df.loc[0, text_column]

# Synthetic vs Real vs Verified

In [5]:
import matplotlib.pyplot as plt

# Row counts per (verified, synthetic) pair; unstack() moves 'synthetic' into the
# columns so each bar (one per 'verified' value) is stacked by 'synthetic'.
counts = df.groupby(['verified', 'synthetic']).size().unstack(fill_value=0)

ax = counts.plot(kind='bar', stacked=True, figsize=(10, 7))
ax.set_title('Stacked Bar Chart of Verified and Synthetic')
ax.set_xlabel('Verified')
ax.set_ylabel('Count')
plt.show()

In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Contingency table: rows are 'verified' values, columns are 'synthetic' values
cross_tab = pd.crosstab(index=df['verified'], columns=df['synthetic'])


def custom_format(x):
    """Return ``x`` formatted with ``,`` as the thousands separator."""
    return f'{x:,}'


# Annotation strings with thousands separators, one per cell of the cross-tab.
# Uses the custom_format helper instead of repeating its body in a lambda.
cross_tab_fmt = cross_tab.map(custom_format)

fig, ax = plt.subplots(figsize=(10, 8))
# fmt='' because the annotations are already formatted strings
sns.heatmap(cross_tab, annot=cross_tab_fmt, fmt='', cmap="YlGnBu", ax=ax,
            annot_kws={"size": 18})

ax.invert_yaxis()
ax.tick_params(axis='both', which='major', labelsize=18)
ax.set_xlabel('Synthetic', fontsize=22)
ax.set_ylabel('Verified', fontsize=22)

plt.show()

In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

columns = 2
# Rows needed for one heatmap per category (ceiling division, at least one row)
rows = max(1, -(-len(categories) // columns))

# squeeze=False keeps `axs` two-dimensional even when the grid has a single row
fig, axs = plt.subplots(nrows=rows, ncols=columns, figsize=(20, 10 * rows), squeeze=False)

for i, category in enumerate(categories):
    cross_tab = pd.crosstab(index=[df['verified'], df['synthetic']],
                            columns=df[f'label_{category}'])

    # Fill the grid left-to-right, top-to-bottom, starting at the first cell.
    # (The previous `(i - 1)` arithmetic sent the first category to axs[-1, 1].)
    ax = axs[i // columns, i % columns]

    sns.heatmap(cross_tab, annot=True, fmt='d', cmap='YlGnBu', ax=ax, annot_kws={"size": 18})
    ax.invert_yaxis()
    ax.invert_xaxis()
    ax.tick_params(axis='both', which='major', labelsize=18)
    ax.tick_params(axis='both', which='minor', labelsize=18)
    ax.set_xlabel(category, fontsize=22)
    ax.set_ylabel('Verified - Synthetic', fontsize=22)

# Remove the unused subplots: the trailing grid cells that received no category
for unused_ax in axs.flat[len(categories):]:
    fig.delaxes(unused_ax)

plt.show()


# Bias Samples

In [8]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

fig, ax = plt.subplots(figsize=(12, 8))

# Row-wise sum over the label columns, drawn as a histogram with a log-scaled count axis
labels_per_sample = df[label_columns].sum(axis=1)
labels_per_sample.plot.hist(log=True, bins=len(categories), ax=ax)

# Plain numbers (no offset notation) on the y axis, integer ticks on the x axis
y_axis = ax.get_yaxis()
y_axis.set_major_formatter(mtick.ScalarFormatter())
y_axis.get_major_formatter().set_useOffset(False)
ax.xaxis.set_major_locator(mtick.MaxNLocator(integer=True))
for which in ('major', 'minor'):
    ax.tick_params(axis='both', which=which, labelsize=18)

ax.set_xlabel('', fontsize=22)
ax.set_ylabel('', fontsize=22)
ax.set_title('Number of Labels per Sample', fontsize=22)

plt.show()

In [9]:
import matplotlib.pyplot as plt

# Column sums of the label columns, indexed by bare category name, largest first
counts = (
    df[label_columns].sum()
    .rename(lambda name: name.replace('label_', ''))
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 8))
counts.plot(kind='barh', ax=ax)

for which in ('major', 'minor'):
    ax.tick_params(axis='both', which=which, labelsize=18)
ax.set_xlabel('', fontsize=22)
ax.set_ylabel('', fontsize=22)
ax.set_title('Bias Samples', size=22)
ax.set_xlim([0, None])

# barh draws the first entry at the bottom; invert so the highest count is at the top
ax.invert_yaxis()
plt.show()

In [10]:
import matplotlib.pyplot as plt
import pandas as pd

# Masks for the two provenance flags
synthetic_mask = df['synthetic']
verified_mask = df['verified']

# The four provenance subsets
df_real_verified = df[~synthetic_mask & verified_mask]
df_real_unverified = df[~synthetic_mask & ~verified_mask]
df_synthetic_verified = df[synthetic_mask & verified_mask]
df_synthetic_unverified = df[synthetic_mask & ~verified_mask]

dfs = [df_real_verified, df_real_unverified, df_synthetic_verified, df_synthetic_unverified]
titles = ['Real & Verified', 'Real & Machine Labelled', 'Synthetic & Verified', 'Synthetic & Not Verified']

fig, axs = plt.subplots(2, 2, figsize=(20, 16))
axs = axs.ravel()  # one flat sequence of four axes

# One horizontal bar chart per subset
for i, (df_sub, panel_title) in enumerate(zip(dfs, titles)):
    panel = axs[i]
    counts = (
        df_sub[label_columns].sum()
        .rename(lambda name: name.replace('label_', ''))
        .sort_values(ascending=False)
    )

    counts.plot(kind='barh', ax=panel)

    for which in ('major', 'minor'):
        panel.tick_params(axis='both', which=which, labelsize=18)
    panel.set_xlabel('', fontsize=22)
    panel.set_ylabel('', fontsize=22)

    panel.set_title(panel_title, size=22)
    panel.invert_yaxis()  # largest count at the top

plt.show()


In [30]:
# Per-label totals for the real & verified subset.
# The sum equals the number of positives only if labels are stored as True/False or 1/0.
for column in label_columns:
    print(f"{column}: {df_real_verified[column].sum()}")


In [33]:
labels_df = df_real_verified[label_columns]
# A sample is neutral when none of its labels is set. Cast to bool before negating:
# on 0/1 integer labels `~` is a bitwise NOT (~0 == -1, ~1 == -2), which would make
# every row look neutral. With boolean labels the result is unchanged.
neutral = (~labels_df.astype(bool).any(axis=1)).sum()
print(f'Neutral: {neutral}')

# Label Diversity

High entropy indicates that the label distribution is fairly uniform, and no single label dominates.
This suggests a balanced dataset.

The maximum entropy occurs when all labels are equally probable.
For $n$ labels, the maximum entropy is $\log_2(n)$.

In [11]:
# Masks for the two provenance flags
is_synthetic = df['synthetic']
is_verified = df['verified']

# The four provenance subsets
df_real_verified = df[~is_synthetic & is_verified]
df_real_unverified = df[~is_synthetic & ~is_verified]
df_synthetic_verified = df[is_synthetic & is_verified]
df_synthetic_unverified = df[is_synthetic & ~is_verified]

# Frames and their captions in matching order, full dataset first
diversity_df = [
    df,
    df_real_verified,
    df_real_unverified,
    df_synthetic_verified,
    df_synthetic_unverified,
]
diversity_captions = [
    'Full Dataset',
    'Real & Verified',
    'Real & Machine Labelled',
    'Synthetic & Verified',
    'Synthetic & Not Verified',
]

In [12]:
# Column-wise sum of every label column over the full dataset
df[label_columns].sum(axis=0)

In [13]:
import numpy as np

# Calculate entropy for labels in dataset.
# Source: https://colab.research.google.com/drive/1pddMaJJIHR0O8MND42hfzYRxOPMV82KA?usp=sharing#scrollTo=RkVuiK_loty4

def categorical_entropy(df: pd.DataFrame, labels: list):
    """Shannon entropy (in bits) of the label distribution.

    Each label's probability is its column sum divided by the sum over all
    ``labels`` columns.

    Args:
        df: Frame containing the label columns.
        labels: Names of the label columns to include.

    Returns:
        The entropy ``-sum(p * log2(p))`` over labels with non-zero
        probability, or ``0.0`` when no label is set at all.
    """
    # Frequency of each label
    label_frequency = df[labels].sum()

    total = label_frequency.sum()
    if total == 0:
        # No positive labels: avoid dividing by zero and report zero entropy
        return 0.0

    # Probability of each label
    label_probabilities = (label_frequency / total).to_numpy(dtype=float)

    # Labels that never occur contribute nothing (p * log2(p) -> 0 as p -> 0);
    # dropping them avoids evaluating log2(0).
    positive = label_probabilities[label_probabilities > 0]

    entropy = -np.sum(positive * np.log2(positive))

    return entropy

In [14]:
import pandas as pd
import math

# Reference row: log2 of the number of categories
max_entropy = math.log2(len(categories))
max_entropy_data = {"Dataset": "Maximum Entropy Possible", "Entropy": max_entropy}
diversity_data = [max_entropy_data]

# One row per subset, pairing each frame with its caption
for caption, sub_df in zip(diversity_captions, diversity_df):
    entropy = categorical_entropy(sub_df, label_columns)
    diversity_data.append({"Dataset": caption, "Entropy": entropy})

entropy_df = pd.DataFrame(diversity_data)
# Highest entropy first, renumbered from zero
sorted_entropy_df = (
    entropy_df
    .sort_values(by='Entropy', ascending=False)
    .reset_index(drop=True)
)

sorted_entropy_df

# Label Correlations

In [15]:
plt.figure(figsize=(20, 15))
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

# Pearson correlation between label columns, with the 'label_' text removed
# from both the row and the column names
short_names = {column: column.replace('label_', '') for column in label_columns}
corr = (
    df[label_columns]
    .corr(method='pearson')
    .rename(index=short_names, columns=short_names)
)

heatmap = sns.heatmap(
    corr,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='BrBG',
    annot_kws={"fontsize": 18},
)
heatmap.set_title('Correlation of Labels', fontdict={'fontsize': 22}, pad=16);

In [16]:
# Source: https://medium.com/@szabo.bibor/how-to-create-a-seaborn-correlation-heatmap-in-python-834c0686b88e

plt.figure(figsize=(20, 15))
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

# Pearson correlation between label columns, with the 'label_' text removed
# from both the row and the column names
short_names = {column: column.replace('label_', '') for column in label_columns}
corr = (
    df[label_columns]
    .corr(method='pearson')
    .rename(index=short_names, columns=short_names)
)

# True on and above the diagonal: those cells are hidden so each pair is shown once
mask = np.triu(np.ones_like(corr, dtype=np.bool_))
heatmap = sns.heatmap(
    corr,
    mask=mask,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='BrBG',
    annot_kws={"fontsize": 18},
)
heatmap.set_title('Triangle Correlation of Labels', fontdict={'fontsize':22}, pad=16);

# Countries

Country data is only available for real job postings; synthetic postings do not have a country.

In [17]:
# nunique() excludes missing values, so rows without a country are not
# counted as an extra "country" (len(unique()) would include NaN/None).
print(f"Countries: {df['country'].nunique()}")

In [18]:
# Rows per country, most frequent first; missing values are excluded (the value_counts default)
df['country'].value_counts(dropna=True)

In [19]:
import matplotlib.pyplot as plt

n = 20

# Top-n countries in ascending order: barh draws the first entry at the bottom,
# so the most frequent country ends up at the top of the chart.
counts = df['country'].value_counts().sort_values(ascending=True).tail(n)

fig, ax = plt.subplots(figsize=(15, 12))
counts.plot(kind='barh', ax=ax)

ax.tick_params(axis='both', which='both', labelsize=18)
ax.set_xlabel('', fontsize=22)
ax.set_ylabel('', fontsize=22)
ax.set_title(f'Top {n} Country Sample Counts (all)', size=22)
# The y axis is left as drawn: the previous two invert_yaxis() calls cancelled out.
ax.set_xlim([0, None])

plt.show()

In [20]:
import matplotlib.pyplot as plt

n = 20

# Top-n countries among verified rows, in ascending order: barh draws the first
# entry at the bottom, so the most frequent country ends up at the top.
counts = df.loc[df['verified'], 'country'].value_counts().sort_values(ascending=True).tail(n)

fig, ax = plt.subplots(figsize=(15, 12))
counts.plot(kind='barh', ax=ax)

ax.tick_params(axis='both', which='both', labelsize=18)
ax.set_xlabel('', fontsize=22)
ax.set_ylabel('', fontsize=22)
ax.set_title(f'Top {n} Country Sample Counts (verified)', size=22)
# The y axis is left as drawn: the previous two invert_yaxis() calls cancelled out.
ax.set_xlim([0, None])

plt.show()

# Website Sources

Only real job postings have website sources

In [21]:
# nunique() excludes missing values, so rows without a source are not
# counted as an extra "source" (len(unique()) would include NaN/None).
print(f"Website Sources: {df['source'].nunique()}")

In [22]:
# Rows per website source, most frequent first; missing values are excluded (the value_counts default)
df['source'].value_counts(dropna=True)

In [23]:
import matplotlib.pyplot as plt

n = 20

# Top-n website sources in ascending order: barh draws the first entry at the
# bottom, so the most frequent source ends up at the top of the chart.
counts = df['source'].value_counts().sort_values(ascending=True).tail(n)

fig, ax = plt.subplots(figsize=(15, 12))
counts.plot(kind='barh', ax=ax)

ax.tick_params(axis='both', which='both', labelsize=18)
ax.set_xlabel('', fontsize=22)
ax.set_ylabel('', fontsize=22)
ax.set_title(f'Top {n} Website Sample Counts (all)', size=22)
# The y axis is left as drawn: the previous two invert_yaxis() calls cancelled out.
ax.set_xlim([0, None])

plt.show()

In [24]:
import matplotlib.pyplot as plt

n = 20

# Top-n website sources among verified rows, in ascending order: barh draws the
# first entry at the bottom, so the most frequent source ends up at the top.
counts = df.loc[df['verified'], 'source'].value_counts().sort_values(ascending=True).tail(n)

fig, ax = plt.subplots(figsize=(15, 12))
counts.plot(kind='barh', ax=ax)

ax.tick_params(axis='both', which='both', labelsize=18)
ax.set_xlabel('', fontsize=22)
ax.set_ylabel('', fontsize=22)
ax.set_title(f'Top {n} Website Sample Counts (verified)', size=22)
# The y axis is left as drawn: the previous two invert_yaxis() calls cancelled out.
ax.set_xlim([0, None])

plt.show()

# Companies and Job Titles

Only real job postings have companies; synthetic postings do have job titles.

In [25]:
# nunique() excludes missing values, so rows without a company or position
# are not counted as an extra entry (len(unique()) would include NaN/None).
print(f"Companies: {df['company'].nunique()}")
print(f"Job Titles: {df['position'].nunique()}")

# Longest Text

Use different tokenisers to estimate the number of tokens

In [26]:

from transformers import AutoTokenizer


def print_max_tokens(model_id, text=None):
    """Print how many tokens a tokenizer produces for a text.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.
        text: Text to encode. When omitted, the notebook-level ``longest_text``
            variable is used, so existing ``print_max_tokens(model_id)`` calls
            keep working.
    """
    if text is None:
        # Backward-compatible fallback to the notebook global
        text = longest_text
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    max_tokens = len(tokenizer.encode(text))
    print(f"Max '{model_id}' tokens: {max_tokens}")


def print_encode_decoded(model_id, longest_text):
    """Print the encoded form of a text, then the result of decoding it again.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.
        longest_text: Text to encode.
    """
    tok = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    token_ids = tok.encode(longest_text)
    print(f"Tokens: {token_ids}")
    round_trip = tok.decode(token_ids)
    print(f"Decoded tokens: {round_trip}")


def print_tokens(model_id, longest_text):
    """Print the result of the tokenizer's ``tokenize`` call for a text.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.
        longest_text: Text to tokenize.
    """
    tok = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
    pieces = tok.tokenize(longest_text)
    print(f"Tokens: {pieces}")


In [27]:
# Replace missing texts with '' by assigning the result back. Calling
# fillna(..., inplace=True) on the selected column is chained assignment:
# pandas warns about it and, with Copy-on-Write enabled, `df` is not updated.
df[text_column] = df[text_column].fillna('')

# Longest text; equal-length ties resolve to the lexicographically greatest text,
# the same ordering as comparing (length, text) tuples.
longest_text = max(df[text_column], key=lambda text: (len(text), text))

max_char = len(longest_text)
max_words = len(longest_text.split())

print(f'Max characters: {max_char}')
print(f'Max words: {max_words}')
for model_id in ['roberta-base', 'bert-base-uncased', 'microsoft/deberta-v3-small']:
    print_max_tokens(model_id)


Explore what the tokens look like (in roberta)

In [28]:
# Print the roberta-base tokenization of the longest text
print_tokens(model_id='roberta-base', longest_text=longest_text)

In [29]:
# Display the longest text in full
longest_text

In [30]:
# Print the roberta-base encoding of the longest text and its decoded form
print_encode_decoded(model_id='roberta-base', longest_text=longest_text)