# Preprocessing of the fairness attributes data set

In [1]:
import pandas as pd

In [2]:
def _replace_with_expansion(attributes, col, expanded):
    '''
    Replace column `col` of `attributes` by the columns of `expanded`.

    :param attributes pandas.DataFrame: frame that still contains `col`
    :param col str: name of the column that is replaced
    :param expanded pandas.DataFrame: replacement columns, indexed like `attributes`
    :return: new DataFrame without `col`, with the renamed expanded columns appended on the right
    '''
    # every new column is called "<col>_<label>"; f-string formatting also
    # works when a label is not a string (plain `+` would raise a TypeError)
    expanded.columns = [f'{col}_{label}' for label in expanded.columns]
    return pd.concat([attributes.drop(columns=col), expanded], axis=1)


def _list_column_to_flags(values):
    '''
    Turn a Series of lists into one boolean indicator column per distinct list element.

    :param values pandas.Series: Series of lists; its index labels are expected to be unique
    :return: boolean DataFrame with one row per index label of `values`
    '''
    # explode produces one row per list element (an empty list becomes a single
    # NaN row, which get_dummies encodes as "no flag set"). Unlike joining the
    # elements with ',' and splitting again, elements that contain a comma
    # themselves stay intact.
    per_element = pd.get_dummies(values.explode())
    # collapse the per-element rows back into one row per original index label
    return per_element.groupby(level=0, sort=False).any()


def _dict_column_to_counts(values):
    '''
    Turn a Series of dicts into one integer column per (flattened) dict key.

    :param values pandas.Series: Series of dicts; entries that are not dicts are treated as empty
    :return: integer DataFrame indexed like `values`; keys missing in a row are 0
    '''
    # json_normalize cannot handle non-dict entries such as missing values
    records = [entry if isinstance(entry, dict) else {} for entry in values]
    # NOTE: the cast to int truncates any fractional part of the values
    counts = pd.json_normalize(records).fillna(0).astype(int)
    # json_normalize on a list yields a fresh 0..n-1 index; restore the real one
    # so that the later column-wise concat aligns row by row
    counts.index = values.index
    return counts


def read_and_preprocess_fairness_attributes(filepath='data/trec_2022_articles_discrete.json', subcont_regions_only=False, sample=False):
    '''
    Use this function to read the fairness attributes file and preprocess it.
    The preprocessing includes:
        - removing columns that are unrelated to fairness
        - removing attributes where similar easier to handle categorical attributes are available
        - one-hot-encoding categorical attributes
        - normalizing lists (one boolean column per list element)
        - normalizing nested json objects (one integer column per key)

    All generated columns are named "<original column>_<value or key>".
    Progress messages are printed to stdout while the function runs.

    :param filepath str: path of fairness attributes file (JSON lines format)
    :param subcont_regions_only bool: if True, only use data of subcontinental regions and drop countries
    :param sample bool/int: if integer value is specified, only read this amount of lines (for testing purposes)
    :return: pandas Dataframe with preprocessed fairness attributes, indexed by page_id
    '''

    # read fairness attributes data
    print('reading file... ', end='')
    read_options = {'lines': True}
    if sample:
        # only pass nrows when a sample size was requested
        read_options['nrows'] = sample
    attributes = pd.read_json(filepath, **read_options)

    # drop firstletter
    attributes = attributes.drop(columns=['first_letter', 'first_letter_category'])

    # some attributes are supplied as categorical values as well
    # for example: years and years_category
    # only keep categorical values for simplicity
    attributes = attributes.drop(columns=[
        'pred_qual',  # -> qual_cat
        'gender',  # -> gender_category
        'years',  # -> years_category
        'num_sitelinks',  # -> num_sitelinks_category
        'relative_pageviews',  # -> relative_pageviews_category
        'creation_date',  # -> creation_date_category
    ])

    # one-hot-encode categorical attributes
    print('one-hot-encoding categorical attributes... ', end='')
    cat_cols = [
        'qual_cat',
        'gender_category',
        'years_category',
        'num_sitelinks_category',
        'relative_pageviews_category',
        'creation_date_category',
    ]
    for col in cat_cols:
        attributes = _replace_with_expansion(attributes, col, pd.get_dummies(attributes[col]))

    # normalize lists
    print('normalizing lists... ', end='')
    list_cols = ['page_countries', 'page_subcont_regions', 'occupations']
    if subcont_regions_only:
        attributes = attributes.drop(columns='page_countries')
        list_cols.remove('page_countries')
    for col in list_cols:
        attributes = _replace_with_expansion(attributes, col, _list_column_to_flags(attributes[col]))

    # normalize nested dicts
    print('normalizing nested dicts... ', end='')
    dict_cols = ['source_countries', 'source_subcont_regions']
    if subcont_regions_only:
        attributes = attributes.drop(columns='source_countries')
        dict_cols.remove('source_countries')
    for col in dict_cols:
        attributes = _replace_with_expansion(attributes, col, _dict_column_to_counts(attributes[col]))

    # set page_id as index
    return attributes.set_index('page_id')

In [None]:
# preprocess the complete attribute set (countries and subcontinental regions)
# with the default arguments and store the result, including the index, as CSV
preprocessed_fairness_attributes = read_and_preprocess_fairness_attributes()
preprocessed_fairness_attributes.to_csv(
    'data/preprocessed_fairness_attributes.csv',
    index=True,
)

reading file... one-hot-encoding categorical attributes... normalizing lists... normalizing nested dicts... 

In [None]:
# preprocess the reduced attribute set (subcontinental regions only, country
# columns dropped) and store the result, including the index, as CSV
preprocessed_fairness_attributes_small = read_and_preprocess_fairness_attributes(
    subcont_regions_only=True,
)
preprocessed_fairness_attributes_small.to_csv(
    'data/preprocessed_fairness_attributes_small.csv',
    index=True,
)