# Preprocessing of the fairness attributes data set

In [1]:
import pandas as pd

In [2]:
def read_and_preprocess_fairness_attributes(filepath='data/trec_2022_articles_discrete.json', subcont_regions_only=False, sample=False):
    '''
    Read the fairness attributes file and preprocess it.

    The preprocessing includes:
        - removing columns that are unrelated to fairness
        - removing attributes for which a similar, easier to handle categorical attribute is available
        - normalizing lists and nested json objects
        - one-hot-encoding categorical attributes

    :param filepath str: path of the fairness attributes file (JSON lines, one record per line)
    :param subcont_regions_only bool: if True, only use data of subcontinental regions and drop countries
    :param sample bool/int: if integer value is specified, only read this amount of lines (for testing purposes)
    :return: pandas Dataframe with preprocessed fairness attributes
    '''

    # Separator used to flatten list cells before multi-hot encoding them.
    # A control character is used instead of ',' because labels may contain
    # commas themselves (e.g. "Korea, Republic of") and would otherwise be
    # split into several bogus indicator columns.
    list_sep = '\x1f'

    # read fairness attributes data; nrows=None reads the whole file
    attributes = pd.read_json(filepath, lines=True, nrows=sample if sample else None)

    # drop firstletter
    attributes = attributes.drop(columns=['first_letter', 'first_letter_category'])

    # some attributes are supplied as categorical values as well
    # for example: years and years_category
    # only keep categorical values for simplicity
    attributes = attributes.drop(columns=[
        'pred_qual',  # -> qual_cat
        'gender',  # -> gender_category
        'years',  # -> years_category
        'num_sitelinks',  # -> num_sitelinks_category
        'relative_pageviews',  # -> relative_pageviews_category
        'creation_date',  # -> creation_date_category
    ])

    # one-hot-encode categorical attributes
    cat_cols = [
        'qual_cat',
        'gender_category',
        'years_category',
        'num_sitelinks_category',
        'relative_pageviews_category',
        'creation_date_category',
    ]
    for col in cat_cols:
        # prefix=col yields "<col>_<label>" column names and, unlike manual
        # string concatenation, also works when the labels are not strings
        dummies = pd.get_dummies(attributes[col], prefix=col)
        attributes = pd.concat([attributes.drop(columns=col), dummies], axis=1)

    # normalize lists: one boolean indicator column per distinct list element
    if subcont_regions_only:
        attributes = attributes.drop(columns='page_countries')
        list_cols = ['page_subcont_regions', 'occupations']
    else:
        list_cols = ['page_countries', 'page_subcont_regions', 'occupations']
    for col in list_cols:
        unnested_data = attributes[col].str.join(list_sep).str.get_dummies(sep=list_sep).astype(bool)
        unnested_data.columns = [f'{col}_{inner_col}' for inner_col in unnested_data.columns]
        attributes = pd.concat([attributes.drop(columns=col), unnested_data], axis=1)

    # normalize nested dicts: one integer column per key, 0 where a key is absent
    if subcont_regions_only:
        attributes = attributes.drop(columns='source_countries')
        dict_cols = ['source_subcont_regions']
    else:
        dict_cols = ['source_countries', 'source_subcont_regions']
    for col in dict_cols:
        unnested_data = pd.json_normalize(attributes[col]).fillna(0).astype(int)
        unnested_data.columns = [f'{col}_{inner_col}' for inner_col in unnested_data.columns]
        attributes = pd.concat([attributes.drop(columns=col), unnested_data], axis=1)

    return attributes

In [3]:
# Smoke test: preprocess only the first 10000 lines and export the result.
# NOTE(review): this writes to the same path as the commented-out full export
# in the last cell, so whichever of the two runs last overwrites the other.
test = read_and_preprocess_fairness_attributes(sample=10000)
test.to_csv('data/preprocessed_fairness_attributes.csv')

In [4]:
# preview the first rows of the sampled, preprocessed frame
test.head()

Unnamed: 0,page_id,qual_cat_B,qual_cat_C,qual_cat_FA,qual_cat_GA,qual_cat_Start,qual_cat_Stub,gender_category_Man,gender_category_Unknown,gender_category_Woman,...,source_subcont_regions_South-eastern Asia,source_subcont_regions_Caribbean,source_subcont_regions_Western Africa,source_subcont_regions_Southern Africa,source_subcont_regions_Middle Africa,source_subcont_regions_Eastern Africa,source_subcont_regions_Central Asia,source_subcont_regions_Antarctica,source_subcont_regions_Melanesia,source_subcont_regions_Micronesia
0,12,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,25,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,39,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,290,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,303,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# list every column of the sampled frame with its non-null count and dtype
test.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 533 columns):
 #    Column                                                         Non-Null Count  Dtype
---   ------                                                         --------------  -----
 0    page_id                                                        10000 non-null  int64
 1    qual_cat_B                                                     10000 non-null  uint8
 2    qual_cat_C                                                     10000 non-null  uint8
 3    qual_cat_FA                                                    10000 non-null  uint8
 4    qual_cat_GA                                                    10000 non-null  uint8
 5    qual_cat_Start                                                 10000 non-null  uint8
 6    qual_cat_Stub                                                  10000 non-null  uint8
 7    gender_category_Man                                            100

In [6]:
# apply the function to the full data set and save the result (uncomment the two lines below to run)
#preprocessed_fairness_attributes = read_and_preprocess_fairness_attributes()
#preprocessed_fairness_attributes.to_csv('data/preprocessed_fairness_attributes.csv')