In [None]:
# Add src/ to the import path so the local percentparser module can be found
# (presumably src/percentparser.py — confirm the location).
import sys
sys.path.append('src/')
from percentparser import parse_percentage

import os
import json
import pandas as pd
import glob
import numpy as np
import datetime
# NOTE: `re` below is the third-party `regex` package, not the stdlib `re` module.
import regex as re

# Suppress all warnings for the rest of the session (this hides pandas warnings too).
import warnings
warnings.filterwarnings("ignore")

# Allow up to 100 rows in displayed DataFrames before pandas truncates the output.
pd.set_option("display.max_rows", 100)

In [None]:
# Wall-clock start of the run; the final cell subtracts it from the current time to report elapsed time.
start_run = datetime.datetime.now()

# Data Cleaning

In [None]:
# Pick the batch-output archive. glob returns matches in arbitrary order, so
# sort for a deterministic choice, and fail with a clear message (instead of a
# bare IndexError) when nothing matches.
output_files = sorted(glob.glob("output_data/*.jsonl.zip"))
if not output_files:
    raise FileNotFoundError("No file matching output_data/*.jsonl.zip")
file = output_files[0]

df_raw = pd.read_json(file, lines=True)

# Flatten the nested records: one row per entry of response.body.choices,
# carrying custom_id and response.body.model along as metadata columns.
df_choices = pd.json_normalize(
    df_raw.to_dict(orient='records'),
    record_path=['response', 'body', 'choices'],
    meta=[['custom_id'], ['response', 'body', 'model']],
    record_prefix='choices.'
)

# Keep only the three columns needed downstream; renaming by mapping ties each
# new name to its source column explicitly rather than by position.
df_final = df_choices[['custom_id', 'response.body.model', 'choices.message.content']].rename(
    columns={'response.body.model': 'model', 'choices.message.content': 'content'}
)

# Seed rows are keyed by run_id; build the matching 'task-<run_id>' custom_id.
df_seed = pd.read_csv("input_data/age_name_edu_seed.csv")
df_seed['custom_id'] = 'task-' + df_seed['run_id'].astype(str)

# Inner join: only responses that have a matching seed row are kept.
df_merged = pd.merge(df_final, df_seed, on='custom_id', how='inner')

# from percentparser.py: return nan for values <0 or >100
df_merged['query_response'] = df_merged['content'].apply(parse_percentage)

# Build the working frame without mutating df_merged in place:
# drop any existing 'query_response_raw' column (errors="ignore" tolerates a
# seed file that does not have one), then store the raw model text under that name.
df = (
    df_merged
    .drop(columns="query_response_raw", errors="ignore")
    .rename(columns={"content": "query_response_raw"})
)
len(df)

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# Number of distinct raw response strings (missing values count as one value).
df['query_response_raw'].nunique(dropna=False)

In [None]:
# Show the distinct parsed values to spot-check the parser output.
df['query_response'].unique()

In [None]:
# Number of distinct parsed values (NaN counts as one value).
df['query_response'].nunique(dropna=False)

In [None]:
# Rows whose parsed query_response is missing (NaN).
df.loc[df['query_response'].isna()]

In [None]:
# Relabel 'None-Control' gender values as 'Gender-Neutral'.
# regex=False pins literal matching: the default for `regex` differs between
# pandas 1.x and 2.x, so being explicit keeps the result version-independent.
df['gender'] = df['gender'].str.replace('None-Control', 'Gender-Neutral', regex=False)
# Spot-check a few of the relabelled rows.
df[df['gender'] == 'Gender-Neutral'].head()

In [None]:
# Columns written to the processed dataset, in output order.
keepcols = ['custom_id', 'model', 'name', 'gender', 'race', 'education', 'age', 'query_response_raw', 'query_response']
# to_csv does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout) before writing.
os.makedirs("processed_data", exist_ok=True)
df[keepcols].to_csv("processed_data/age_name_edu_data.csv.zip", index=False, compression='zip')

In [None]:
# Report the time elapsed since start_run was recorded.
print("Elapsed time:", datetime.datetime.now() - start_run)