In [3]:
import pandas as pd
import json

In [10]:
LF_DIR = "/data/karen/LLaMA-Factory/"

In [2]:
BASE_DIR = "/data/karen/debate-divisiveness/code/LM/revision"

In [4]:
EXT = "_maskedAll"
DATA_TYPES = ["debates", "sotu", "campaign"]
ORIG_TEXT = "text_orig" if "unmasked" in EXT else "text_orig_masked" 



In [7]:
def add_sent_ids(lm_data):
    
    grouped = lm_data.groupby('speech_id').size()
    speech_lengths =  dict(grouped)
    
    lm_dict = lm_data.to_dict("records")
        
    spch = 0
    sent_id = 0
    prev_spch = -1
    spch_len = -1

    for row in lm_dict:
        spch = row["speech_id"]
        if spch == prev_spch:
            sent_id += 1
            row["sent_id"] = sent_id / spch_len * 100
        else:
            prev_spch = spch
            spch_len = speech_lengths[spch]
            sent_id = 0
            row["sent_id"] = sent_id / spch_len * 100

    lm_data = pd.DataFrame(lm_dict)

    return lm_data

def load_data_and_speakers(PROJ_DIR, DATA_TYPE, IN_DATA, load_metadata=False):

    with open(f"{PROJ_DIR}/speakers.json", "r") as f:
        speakers = json.load(f)
        del speakers["Other"]

    if DATA_TYPE == "sotu": # or DATA_TYPE == "campaign":
        with open(f"{PROJ_DIR}/speakers_metadata.json", "r") as f:
            metadata = json.load(f)
            speakers = {v["clean"]: v["lm_id"] for _, v in metadata.items()}

    if DATA_TYPE == "campaign":
        with open(f"{PROJ_DIR}/speakers_metadata.json", "r") as f:
            metadata = json.load(f)
            speakers = {v["clean"]: v["lm_id"] for _, v in metadata.items()
                        if len(v["year"]) >0 and int(v["year"][-1]) >= 2008}

    spk_data = None
    if load_metadata:
        with open(f"{PROJ_DIR}/speakers_metadata.json", "r") as f:
            if DATA_TYPE == "sotu" or DATA_TYPE == "campaign":
                metadata = json.load(f)
                spk_data = {v["clean"]: v for _, v in metadata.items()}
            else:
                spk_data = json.load(f)

    if "Filtered" in EXT and DATA_TYPE in ["sotu", "debates"]:
        alt_ext = "_maskedAll" if EXT == "_maskedFiltered" else "_unmaskedAll"
        in_file = f"{PROJ_DIR}/data/data{alt_ext}/lm_data{alt_ext}_ppl.csv"
    else:
        in_file = f"{IN_DATA}/lm_data{EXT}.csv"
    lm_data = pd.read_csv(in_file)
    lm_data = add_sent_ids(lm_data)


    return lm_data, speakers, spk_data





In [8]:
lm_datas = {}
data_speakers = {}
metadatas = {}
spk_dfs = {}
for DATA_TYPE in DATA_TYPES:
    # PROJ_DIR = f"{BASE_DIR}/{DATA_TYPE}_{EXT}" 
    ppl_dir = f"{BASE_DIR}/{DATA_TYPE}{EXT}" # if args.base_dir else f"/data/karen/debate-divisiveness/code/LM/ppl_{data_type}"
    in_data = f"{BASE_DIR}/{DATA_TYPE}{EXT}/data" # if args.base_dir else f"/data/karen/debate-divisiveness/code/LM/ppl_{data_type}/data/data{EXT}"


    lm_data, speakers, spk_data = load_data_and_speakers(ppl_dir, DATA_TYPE, in_data, load_metadata=True)
    lm_data["data_type"] = DATA_TYPE
    lm_data["type"] = lm_data["mentions_opponent"]
    lm_data.rename(columns={"speaker_clean": "speaker"}, inplace=True)
    lm_data["is_trump"] = lm_data["speaker"] == "Donald Trump"  
    lm_data["party"] = lm_data["speaker"].apply(lambda x: spk_data[x]["party"] if x in spk_data else None)

    lm_datas[DATA_TYPE] = lm_data
    data_speakers[DATA_TYPE] = speakers
    metadatas[DATA_TYPE] = spk_data
    spk_dfs[DATA_TYPE] = lm_data[lm_data["speaker"].isin(data_speakers[DATA_TYPE].keys())]

In [13]:
grouped_dfs = {}
for DATA_TYPE in DATA_TYPES:
    print(DATA_TYPE)
    df = lm_datas[DATA_TYPE]
    grouped_dfs[DATA_TYPE] = df.groupby('speech_id')['text'].apply('\n'.join).reset_index()


debates
sotu
campaign


In [14]:
grouped_dfs["debates"]

Unnamed: 0,speech_id,text
0,0,<DEBATE_START> Other: Former Vice President <E...
1,2,<DEBATE_START> Other: Former Vice President <E...
2,16,<DEBATE_START> Other: Former Secretary of Stat...
3,17,<DEBATE_START> Other: Former Secretary of Stat...
4,19,<DEBATE_START> Other: Former Secretary of Stat...
5,49,<DEBATE_START> Other: Good evening from the ca...
6,50,<DEBATE_START> Other: .\nOther: Good evening f...
7,52,<DEBATE_START> Other: Good evening from the Ma...
8,73,<DEBATE_START> Other: Good evening.\nOther: An...
9,74,<DEBATE_START> Other: Good evening from Belmon...


For the **pre-training datasets**, only the `prompt` column will be used for training, for example:

```json
[
  {"text": "document"},
  {"text": "document"}
]
```

Regarding the above dataset, the description in `dataset_info.json` should be:

```json
"dataset_name": {
  "file_name": "data.json",
  "columns": {
    "prompt": "text"
  }
}
```

In [28]:
def print_config_for_lf(dt, outfile):
    config = {
        "file_name": outfile.split("\\")[-1],
        "columns": {
        "prompt": "text"
        }
    }
    print(json.dumps(config))

def convert_to_lf_format(lm_datas, data_type, outfile):
    df = lm_datas[data_type]
    grouped_dfs[data_type] = df.groupby('speech_id')['text'].apply('\n'.join).reset_index()

    res = [ {"text": doc} for doc in grouped_dfs[data_type]["text"].tolist() ]
    # print(len(res))
    
    with open(outfile, "w") as fout:
        json.dump(res, fout, indent=2)

    print_config_for_lf(data_type, outfile)
    




In [29]:
for DATA_TYPE in DATA_TYPES:
    # print(DATA_TYPE)
    outfile = f"{LF_DIR}/data/{DATA_TYPE}{EXT}.json"
    convert_to_lf_format(lm_datas, DATA_TYPE, outfile)

{"debates": {"file_name": "/data/karen/LLaMA-Factory//data/debates_maskedAll.json", "columns": {"prompt": "text"}}}
{"sotu": {"file_name": "/data/karen/LLaMA-Factory//data/sotu_maskedAll.json", "columns": {"prompt": "text"}}}
{"campaign": {"file_name": "/data/karen/LLaMA-Factory//data/campaign_maskedAll.json", "columns": {"prompt": "text"}}}
