# Preparing the dataset

Pull the manual data classifications and place them into train/validate datasets.

In [110]:
import pandas as pd

# Read the CSV that every later cell works on.
df_chi = pd.read_csv(filepath_or_buffer='../data/chi2.csv')

In [111]:
# Missing 'selftext' values become '' so the concatenation below does not
# propagate NaN into 'text'.
df_chi['selftext'] = df_chi['selftext'].fillna('')
# 'text' is the title, a newline, then the (possibly empty) selftext.
df_chi['text'] = df_chi['title'].add('\n').add(df_chi['selftext'])

In [112]:
# A row is treated as manually classified when its 'negative' cell is a string
# that does not start with "0.".
manual_bool = ~df_chi['negative'].str.startswith('0.')
df_manual = df_chi[manual_bool]

In [113]:
# Summary (count / unique / top / freq) of the three label columns of the manual rows.
df_manual[['tone', 'emotion', 'theme']].describe()

Unnamed: 0,tone,emotion,theme
count,151,151,151
unique,4,8,8
top,neutral,fear,question
freq,95,74,71


In [116]:
# One list per classification task. Each list is later handed to
# rows_to_jsonl as `score_keys`, i.e. it names columns of df_chi.
tone_labels = [
  'negative',
  'neutral',
  'positive',
]
emotion_labels = [
  'sadness',
  'joy',
  'love',
  'anger',
  'fear',
  'surprise',
]
theme_labels = [
  'clinical update',
  'community',
  'question',
  'education',
  'advocating',
  'dissuading',
  'other',
]

In [133]:
import json
import os

def rows_to_jsonl(df, label_key, score_keys):
  """Export the rows of ``df`` that are not manually annotated to a JSONL file.

  A row counts as manually annotated when its ``'negative'`` cell does not
  start with ``'0.'``. Every other row is written to
  ``../datasets/<label_key>.jsonl`` as one JSON object per line with the keys
  ``text``, ``meta``, ``label``, ``score`` and ``answer``; ``meta`` is a nested
  object holding ``annotated``, ``score``, ``id`` and ``label``.

  Args:
    df: DataFrame with at least the columns ``'id'``, ``'text'``,
      ``'negative'`` (strings), ``label_key`` and every name in
      ``score_keys``. It is modified in place: the ``'label'``, ``'score'``,
      ``'answer'`` and ``'meta'`` columns are (re)written on every call.
    label_key: Name of the column holding the label; also used as the stem
      of the output file name.
    score_keys: Names of the columns whose row-wise maximum becomes the
      row's ``score``.
  """
  manual_bool = ~df.loc[:, 'negative'].str.startswith('0.')
  pending = ~manual_bool

  # Take the maximum over numbers, not strings: on string columns the max is
  # lexicographic and would rank '9.5e-05' above '0.98'. Cells that are not
  # numeric become NaN and are ignored by max().
  numeric_scores = df.loc[:, score_keys].apply(pd.to_numeric, errors='coerce')
  df['score'] = numeric_scores.max(axis=1).where(pending)

  df['label'] = df[label_key]
  df['answer'] = pd.NA
  df.loc[manual_bool, 'answer'] = 'accept'

  # Serialise meta with json.dumps instead of hand-assembled f-strings so
  # that quotes/backslashes in ids or labels are escaped and a missing score
  # is written as null rather than the invalid token `nan`.
  meta = []
  for is_manual, row_id, score, label in zip(manual_bool, df['id'], df['score'], df[label_key]):
    if is_manual:
      entry = {"annotated": True, "id": str(row_id)}
    else:
      entry = {
        "annotated": False,
        "score": None if pd.isna(score) else float(score),
        "id": str(row_id),
        "label": str(label),
      }
    meta.append(json.dumps(entry))
  df['meta'] = meta

  # The label of a pending row lives only inside its meta object.
  df.loc[pending, 'label'] = pd.NA

  # Let pandas render the records (keeps its number/NA formatting), then
  # swap each meta string for the object it encodes. Everything is built in
  # memory first, so a failure cannot leave a half-written file behind.
  payload = df.loc[pending, ['text', 'meta', 'label', 'score', 'answer']].to_json(orient='records', lines=True)
  out_lines = []
  for raw in payload.split('\n'):
    if not raw:
      continue
    row = json.loads(raw)
    row['meta'] = json.loads(row['meta'])
    out_lines.append(json.dumps(row) + "\n")

  filename = f'../datasets/{label_key}.jsonl'
  os.makedirs(os.path.dirname(filename), exist_ok=True)
  with open(filename, 'w') as f:
    f.writelines(out_lines)


In [134]:
# Write one JSONL file per classification task, pairing each label column
# with the list that is passed as its score columns.
for task, task_scores in [('tone', tone_labels), ('emotion', emotion_labels), ('theme', theme_labels)]:
  rows_to_jsonl(df_chi, task, task_scores)