Loading the JSON-lines files with summary scores and human annotations into DataFrames, and keeping only non-reference summaries that have post-edits.

In [1]:
import pandas as pd
import numpy as np
import sqlite3

# Load the raw JSON-lines exports (one JSON record per line).
scores_df = pd.read_json("../data/scores.jsonl", lines=True)
summaries_df = pd.read_json("../data/summaries.jsonl", lines=True)

# Keep only system-generated summaries. `.copy()` detaches the filtered frame
# from the original so the column assignments below operate on a real frame
# rather than on a slice (which triggers pandas' SettingWithCopyWarning).
scores_df = scores_df[scores_df["system"] != "reference"].copy()
# Flatten the nested `prompts -> hter` dict into plain columns.
scores_df["hter_gold"] = [prompt["hter"]["gold"] for prompt in scores_df["prompts"]]
scores_df["sim"] = [prompt["hter"]["sim"] for prompt in scores_df["prompts"]]
scores_df = scores_df.set_index(["id", "system"])

# A summary "has edits" when its output carries an `edit` entry and it was not
# produced by the reference system. Zipping the two columns avoids building a
# Series per row the way `iterrows()` does.
summaries_df["has_edits"] = [
    output["edit"] is not None and inp["contents"]["system"] != "reference"
    for output, inp in zip(summaries_df["output"], summaries_df["input"])
]
summaries_df["system"] = [inp["contents"]["system"] for inp in summaries_df["input"]]
# The id is cast to int so that it can line up with the `id` level of scores_df.
summaries_df["id"] = [int(inp["contents"]["id"]) for inp in summaries_df["input"]]
summaries_df = summaries_df.set_index(["id", "system"])

summaries_df = summaries_df[summaries_df["has_edits"]]

# Restrict both frames to the (id, system) pairs present in each of them.
matches = scores_df.index.intersection(summaries_df.index)

scores_df = scores_df.loc[matches.values]
summaries_df = summaries_df.loc[matches.values]

Putting all the relevant data into one DataFrame, with one row per post-edit.

In [2]:
# Build one record per (summary, post-edit) pair and assemble them into
# `edits_df` in a single DataFrame construction at the end.
rows_list = []
for _, row in summaries_df.iterrows():
    human_eval_data = row["output"]["_responses"]
    input_data = row["input"]["contents"]
    post_edits = human_eval_data["edit"]
    # Report summaries that do not have exactly two post-edits.
    if len(post_edits) != 2:
        print(row.name)

    # `row.name` is the (id, system) index tuple. Look the score up once per
    # summary instead of twice per post-edit.
    sim = scores_df.at[row.name, "sim"]
    # Weird case: separate score records exist for the same system and id, so
    # the lookup is not a scalar. Older pandas hands back an ndarray here;
    # newer versions are believed to hand back a Series -- skip in both cases.
    if isinstance(sim, (np.ndarray, pd.Series)):
        continue

    for post_edit_index, post_edit in enumerate(post_edits):
        # The per-annotator lists are indexed in parallel with `post_edits`.
        rows_list.append({
            "system": row.name[1],
            "id": row.name[0],
            "edit": post_edit,
            "hter": human_eval_data["hter"][post_edit_index],
            "annotator_id": human_eval_data["worker_ids"][post_edit_index],
            "reference": input_data["reference"],
            "system_summary": input_data["text"],
            "sim": sim,
            "overall": human_eval_data["overall"][post_edit_index],
            "grammar": human_eval_data["grammar"][post_edit_index],
            "redundancy": human_eval_data["redundancy"][post_edit_index],
        })

edits_df = pd.DataFrame(rows_list)


(2192, 'ml+rl')
(2199, 'pointer')
(2263, 'pointer')
(2501, 'pointer')
(2828, 'pointer')
(2836, 'ml')
(3194, 'ml')
(3457, 'ml+rl')
(3953, 'pointer')
(4856, 'ml')
(8116, 'ml')
(8283, 'seq2seq')
(10549, 'ml+rl')
(10929, 'ml+rl')
(10930, 'seq2seq')


Writing the new data into a SQLite table (`cdm_postedits`)

In [3]:
# Column order of the `cdm_postedits` table created below. The INSERT is
# positional, so the DataFrame columns must be selected in exactly this order.
# Relying on `edits_df`'s own column order is unsafe: pandas >= 0.25 keeps dict
# insertion order for frames built from a list of dicts, whereas older versions
# sorted the columns alphabetically (which is what the table layout mirrors).
POSTEDIT_COLUMNS = [
    "annotator_id", "edit", "grammar", "hter", "id", "overall",
    "redundancy", "reference", "sim", "system", "system_summary",
]
# An empty frame has no columns to select from; insert nothing in that case.
rows = [] if edits_df.empty else edits_df[POSTEDIT_COLUMNS].values.tolist()

conn = sqlite3.connect('../data/cdm_postedits.db')
try:
    c = conn.cursor()

    # Rebuild the table from scratch so re-running the cell is idempotent.
    c.execute('DROP TABLE IF EXISTS cdm_postedits;')

    # Create table
    c.execute('''CREATE TABLE cdm_postedits
             (annotator_id integer, edit text, grammar integer, hter integer, id integer, overall integer, redundancy integer, reference text, sim real, system text, system_summary text)''')

    # Insert all rows in one batch
    c.executemany('INSERT INTO cdm_postedits VALUES (?,?,?,?,?,?,?,?,?,?,?)', rows)

    # Save (commit) the changes
    conn.commit()
finally:
    # Always release the database handle, even if a statement above failed.
    conn.close()
