In [8]:
# QMSum
from datasets import load_dataset

dataset = load_dataset(
    'json',
    data_files={
        'train': ['data/qmsum/raw_data/train.jsonl'],
        'validation': ['data/qmsum/raw_data/val.jsonl'],
        'test': ['data/qmsum/raw_data/test.jsonl']
    }
)
print(dataset)

def process(exs):
    dialogues = []
    summaries = []
    ids = []
    query_list = exs['specific_query_list']
    meetings = exs['meeting_transcripts']
    for i, meeting in enumerate(meetings):
        meeting = list(map(lambda x: x['speaker'] + ": " + x['content'], meeting))
        for j, pair in enumerate(query_list[i]):
            query = pair['query']
            summary = pair['answer']
            context = "\r\n".join(['\r\n'.join(meeting[int(st):int(ed)+1]) for st, ed in pair['relevant_text_span']])
            dialogue = query + ' ##\r\n' + context
            dialogues.append(dialogue)
            summaries.append(summary)
            ids.append(str(i)+'_'+str(j))
    return {
        "dialogue": dialogues,
        "summary": summaries,
        "id": ids
    }
processed_dataset = dataset.map(
    process,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running process on dataset",
)

processed_dataset.save_to_disk('data/qmsum/qmsum')
print(processed_dataset)

Using custom data configuration default-de61977f3e5a3574
Reusing dataset json (/home/v-yichengzou/.cache/huggingface/datasets/json/default-de61977f3e5a3574/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['topic_list', 'general_query_list', 'specific_query_list', 'meeting_transcripts'],
        num_rows: 162
    })
    validation: Dataset({
        features: ['topic_list', 'general_query_list', 'specific_query_list', 'meeting_transcripts'],
        num_rows: 35
    })
    test: Dataset({
        features: ['topic_list', 'general_query_list', 'specific_query_list', 'meeting_transcripts'],
        num_rows: 35
    })
})


Running process on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running process on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running process on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 1095
    })
    validation: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 237
    })
    test: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 244
    })
})


In [13]:
# dialsumm
from datasets import load_dataset, DatasetDict

dataset = DatasetDict({
    "train": load_dataset('json', data_files='data/dialsumm/raw_data/dialogsum.train.jsonl', split='train'),
    "validation": load_dataset('json', data_files='data/dialsumm/raw_data/dialogsum.dev.jsonl', split='train'),
    "test": load_dataset('json', data_files='data/dialsumm/raw_data/dialogsum.test.jsonl', split='train')
})

print(dataset)

def process(exs):
    summaries = []
    if "summary1" in exs:
        for s1, s2, s3 in zip(exs['summary1'], exs['summary2'], exs['summary3']):
            s = s1
            if len(s1) < len(s2):
                s = s2
            if len(s2) < len(s3):
                s = s3
            summaries.append(s)
    return {
        "dialogue": exs['dialogue'],
        "summary": exs['summary'] if "summary1" not in exs else summaries,
        "id": exs['fname']
    }
process_train = dataset['train'].map(
    process,
    batched=True,
    num_proc=1,
    remove_columns=dataset['train'].column_names,
    load_from_cache_file=False,
    desc="Running process on dataset",
)
process_dev = dataset['validation'].map(
    process,
    batched=True,
    num_proc=1,
    remove_columns=dataset['validation'].column_names,
    load_from_cache_file=False,
    desc="Running process on dataset",
)
process_test = dataset['test'].map(
    process,
    batched=True,
    num_proc=1,
    remove_columns=dataset['test'].column_names,
    load_from_cache_file=False,
    desc="Running process on dataset",
)
processed_dataset = DatasetDict({
    "train": process_train,
    "validation": process_dev,
    "test": process_test
})
processed_dataset.save_to_disk('data/dialsumm/dialsumm')
print(processed_dataset)

Using custom data configuration default-00a44acd03064a31
Reusing dataset json (/home/v-yichengzou/.cache/huggingface/datasets/json/default-00a44acd03064a31/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
Using custom data configuration default-c4407731ccbc3b5b
Reusing dataset json (/home/v-yichengzou/.cache/huggingface/datasets/json/default-c4407731ccbc3b5b/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
Using custom data configuration default-a4e55bb77cf59c64
Reusing dataset json (/home/v-yichengzou/.cache/huggingface/datasets/json/default-a4e55bb77cf59c64/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


DatasetDict({
    train: Dataset({
        features: ['fname', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['fname', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['fname', 'dialogue', 'summary1', 'topic1', 'summary2', 'topic2', 'summary3', 'topic3'],
        num_rows: 500
    })
})


Running process on dataset:   0%|          | 0/13 [00:00<?, ?ba/s]

Running process on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running process on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 500
    })
    test: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 500
    })
})


In [7]:
# summscreen
from datasets import load_dataset, DatasetDict, concatenate_datasets

dataset = DatasetDict({
    "train": concatenate_datasets([
        load_dataset('json', data_files='data/summscreen/raw_data/ForeverDreaming/fd_train.json', split='train'),
        load_dataset('json', data_files='data/summscreen/raw_data/TVMegaSite/tms_train.json', split='train'),
    ]),
    "validation": concatenate_datasets([
        load_dataset('json', data_files='data/summscreen/raw_data/ForeverDreaming/fd_dev.json', split='train'),
        load_dataset('json', data_files='data/summscreen/raw_data/TVMegaSite/tms_dev.json', split='train'),
    ]),
    "test": concatenate_datasets([
        load_dataset('json', data_files='data/summscreen/raw_data/ForeverDreaming/fd_test.json', split='train'),
        load_dataset('json', data_files='data/summscreen/raw_data/TVMegaSite/tms_test.json', split='train'),
    ])
})

print(dataset)

def process(exs):
    dialogues = []
    summaries = []
    for d in exs['Transcript']:
        newd = "\r\n".join([s.replace("@@ ", "").replace(" '", "'") for s in d])
        dialogues.append(newd)
    for d in exs['Recap']:
        newd = " ".join([s.replace("@@ ", "").replace(" '", "'") for s in d])
        summaries.append(newd)
    return {
        "dialogue": dialogues,
        "summary": summaries,
        "id": exs['filename']
    }
processed_dataset = dataset.map(
    process,
    batched=True,
    num_proc=1,
    remove_columns=dataset['train'].column_names,
    load_from_cache_file=False,
    desc="Running process on dataset",
)
processed_dataset.save_to_disk('data/summscreen/summscreen')
print(processed_dataset)

Using custom data configuration default-b0aa02ed7e9cb25a
Reusing dataset json (/home/v-yichengzou/.cache/huggingface/datasets/json/default-b0aa02ed7e9cb25a/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
Using custom data configuration default-c89997efc37619e6
Reusing dataset json (/home/v-yichengzou/.cache/huggingface/datasets/json/default-c89997efc37619e6/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
Using custom data configuration default-e9ed5e55bfc11921
Reusing dataset json (/home/v-yichengzou/.cache/huggingface/datasets/json/default-e9ed5e55bfc11921/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
Using custom data configuration default-f290029cf0075597
Reusing dataset json (/home/v-yichengzou/.cache/huggingface/datasets/json/default-f290029cf0075597/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
Using custom data configuration default-32c81f4204f23a49
Reusing dataset json (/home/v-y

DatasetDict({
    train: Dataset({
        features: ['Recap', 'Transcript', 'filename'],
        num_rows: 22588
    })
    validation: Dataset({
        features: ['Recap', 'Transcript', 'filename'],
        num_rows: 2133
    })
    test: Dataset({
        features: ['Recap', 'Transcript', 'filename'],
        num_rows: 2130
    })
})


Running process on dataset:   0%|          | 0/23 [00:00<?, ?ba/s]

Running process on dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Running process on dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 22588
    })
    validation: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 2133
    })
    test: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 2130
    })
})


In [1]:
# tweetsumm
import json
from datasets import Dataset, DatasetDict, concatenate_datasets
from data.tweetsumm.raw_data.Tweetsumm.tweet_sum_processor import TweetSumProcessor

processor = TweetSumProcessor("/home/v-yichengzou/projects/summ/data/tweetsumm/raw_data/twcs/twcs.csv")
test_set = {"dialogue": [], "summary": []}
with open("/home/v-yichengzou/projects/summ/data/tweetsumm/raw_data/Tweetsumm/tweet_sum_data_files/final_test_tweetsum.jsonl") as f:
    dialog_with_summaries = processor.get_dialog_with_summaries(f.readlines())
    for dialog_with_summary in dialog_with_summaries:
        json_format = dialog_with_summary.get_json()
        json_data = json.loads(json_format)
        test_set['dialogue'].append(json_data['dialog'])
        test_set['summary'].append(json_data['summaries']['abstractive_summaries'])

eval_set = {"dialogue": [], "summary": []}
with open("/home/v-yichengzou/projects/summ/data/tweetsumm/raw_data/Tweetsumm/tweet_sum_data_files/final_valid_tweetsum.jsonl") as f:
    dialog_with_summaries = processor.get_dialog_with_summaries(f.readlines())
    for dialog_with_summary in dialog_with_summaries:
        json_format = dialog_with_summary.get_json()
        json_data = json.loads(json_format)
        eval_set['dialogue'].append(json_data['dialog'])
        eval_set['summary'].append(json_data['summaries']['abstractive_summaries'])

train_set = {"dialogue": [], "summary": []}
with open("/home/v-yichengzou/projects/summ/data/tweetsumm/raw_data/Tweetsumm/tweet_sum_data_files/final_train_tweetsum.jsonl") as f:
    dialog_with_summaries = processor.get_dialog_with_summaries(f.readlines())
    for dialog_with_summary in dialog_with_summaries:
        json_format = dialog_with_summary.get_json()
        json_data = json.loads(json_format)
        train_set['dialogue'].append(json_data['dialog'])
        train_set['summary'].append(json_data['summaries']['abstractive_summaries'])

dataset = DatasetDict({
    "train": Dataset.from_dict(train_set),
    "validation": Dataset.from_dict(eval_set),
    "test": Dataset.from_dict(test_set)
})

print(dataset)

def process(exs):
    dialogues = []
    summaries = []
    ids = []
    for d in exs['dialogue']:
        ids.append(d['dialog_id'])
        newd = "\r\n".join([' '.join(s['sentences']) for s in d['turns']])
        dialogues.append(newd)
    for summary in exs['summary']:
        best_s, len_s = "", 0
        for s in summary:
            news = " ".join(s)
            if len(news) > len_s:
                best_s, len_s = news, len(news)
        summaries.append(best_s)
    return {
        "dialogue": dialogues,
        "summary": summaries,
        "id": ids
    }
processed_dataset = dataset.map(
    process,
    batched=True,
    num_proc=1,
    remove_columns=dataset['train'].column_names,
    load_from_cache_file=False,
    desc="Running process on dataset",
)
processed_dataset.save_to_disk('data/tweetsumm/tweetsumm')
print(processed_dataset)

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary'],
        num_rows: 879
    })
    validation: Dataset({
        features: ['dialogue', 'summary'],
        num_rows: 110
    })
    test: Dataset({
        features: ['dialogue', 'summary'],
        num_rows: 110
    })
})


Running process on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running process on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running process on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 879
    })
    validation: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 110
    })
    test: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 110
    })
})


In [1]:
# emailsum
import json
from datasets import Dataset, DatasetDict, concatenate_datasets

train_set = {"dialogue": [], "summary": [], "id": []}
with open("data/emailsum/raw_data/avocado/data_email_long/train.source.txt", "r") as f:
    for line in f:
        train_set['dialogue'].append(line.replace("|||", '\r\n'))
with open("data/emailsum/raw_data/avocado/data_email_long/train.target.txt", "r") as f:
    for line in f:
        train_set['summary'].append(line)
with open("data/emailsum/raw_data/avocado/data_email_long/train.id.txt", "r") as f:
    for line in f:
        train_set['id'].append(line)

eval_set = {"dialogue": [], "summary": [], "id": []}
with open("data/emailsum/raw_data/avocado/data_email_long/dev.source.txt", "r") as f:
    for line in f:
        eval_set['dialogue'].append(line.replace("|||", '\r\n'))
with open("data/emailsum/raw_data/avocado/data_email_long/dev.target.txt", "r") as f:
    for line in f:
        eval_set['summary'].append(line)
with open("data/emailsum/raw_data/avocado/data_email_long/dev.id.txt", "r") as f:
    for line in f:
        eval_set['id'].append(line)

test_set = {"dialogue": [], "summary": [], "id": []}
with open("data/emailsum/raw_data/avocado/data_email_long/test.source.txt", "r") as f:
    for line in f:
        test_set['dialogue'].append(line.replace("|||", '\r\n'))
with open("data/emailsum/raw_data/avocado/data_email_long/test.target.txt", "r") as f:
    for line in f:
        test_set['summary'].append(line)
with open("data/emailsum/raw_data/avocado/data_email_long/test.id.txt", "r") as f:
    for line in f:
        test_set['id'].append(line)

dataset = DatasetDict({
    "train": Dataset.from_dict(train_set),
    "validation": Dataset.from_dict(eval_set),
    "test": Dataset.from_dict(test_set)
})

dataset.save_to_disk('data/emailsum/emailsum_long')

train_set = {"dialogue": [], "summary": [], "id": []}
with open("data/emailsum/raw_data/avocado/data_email_short/train.source.txt", "r") as f:
    for line in f:
        train_set['dialogue'].append(line.replace("|||", '\r\n'))
with open("data/emailsum/raw_data/avocado/data_email_short/train.target.txt", "r") as f:
    for line in f:
        train_set['summary'].append(line)
with open("data/emailsum/raw_data/avocado/data_email_short/train.id.txt", "r") as f:
    for line in f:
        train_set['id'].append(line)

eval_set = {"dialogue": [], "summary": [], "id": []}
with open("data/emailsum/raw_data/avocado/data_email_short/dev.source.txt", "r") as f:
    for line in f:
        eval_set['dialogue'].append(line.replace("|||", '\r\n'))
with open("data/emailsum/raw_data/avocado/data_email_short/dev.target.txt", "r") as f:
    for line in f:
        eval_set['summary'].append(line)
with open("data/emailsum/raw_data/avocado/data_email_short/dev.id.txt", "r") as f:
    for line in f:
        eval_set['id'].append(line)

test_set = {"dialogue": [], "summary": [], "id": []}
with open("data/emailsum/raw_data/avocado/data_email_short/test.source.txt", "r") as f:
    for line in f:
        test_set['dialogue'].append(line.replace("|||", '\r\n'))
with open("data/emailsum/raw_data/avocado/data_email_short/test.target.txt", "r") as f:
    for line in f:
        test_set['summary'].append(line)
with open("data/emailsum/raw_data/avocado/data_email_short/test.id.txt", "r") as f:
    for line in f:
        test_set['id'].append(line)

dataset = DatasetDict({
    "train": Dataset.from_dict(train_set),
    "validation": Dataset.from_dict(eval_set),
    "test": Dataset.from_dict(test_set)
})

dataset.save_to_disk('data/emailsum/emailsum_short')
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 1800
    })
    validation: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 249
    })
    test: Dataset({
        features: ['dialogue', 'summary', 'id'],
        num_rows: 500
    })
})


In [2]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [4]:
from datasets import load_from_disk
import pandas as pd
from nltk.tokenize import word_tokenize
from pandas import ExcelWriter

datalist = [
            '/home/gaya/group1/OLDS/data/tweetsumm/tweetsumm/omission.save',
            '/home/gaya/group1/OLDS/data/qmsum_omission/omission.save',
            ]
# datalist = ['data/emailsum/emailsum_short', 'data/emailsum/emailsum_long']
output_excel = ExcelWriter("data_statistics.xlsx")

results = []
for data_path in datalist:
    dataset = load_from_disk(data_path)
    for split in ['train', 'validation', 'test']:
        num = len(dataset[split])
        turns = 0
        tokens = 0
        summary_tokens = 0
        max_turns = 0
        max_tokens = 0
        max_summary_tokens = 0
        nAll = 0
        nbartL = 0
        nbartB = 0
        nt5B = 0
        nt5S = 0
        nbaseline = 0
        npegasus = 0
        omitRateAll = 0
        omitRatebartL = 0
        omitRatebartB = 0
        omitRatet5B = 0.
        omitRatet5S = 0
        omitRatebaseline = 0
        omitRatepegasus = 0
        turnAll = 0
        turnbartL = 0
        turnbartB = 0
        turnt5B = 0
        turnt5S = 0
        turnbaseline = 0
        turnpegasus = 0
        SomitRateAll = 0
        SomitRatebartL = 0
        SomitRatebartB = 0
        SomitRatet5B = 0.
        SomitRatet5S = 0
        SomitRatebaseline = 0
        SomitRatepegasus = 0
        oracleAll = 0
        oraclebartL = 0
        oraclebartB = 0
        oraclet5B = 0
        oraclet5S = 0
        oraclebaseline = 0
        oraclepegasus = 0
        for ex in dataset[split]:
            turn_num = len(ex['dialogue'].replace('\r\n', '\n').split('\n'))
            turns += turn_num
            max_turns = max(max_turns, turn_num)
            dial = word_tokenize(ex['dialogue'])
            summ = word_tokenize(ex['summary'])
            tokens += len(dial)
            summary_tokens += len(summ)
            max_tokens = max(max_tokens, len(dial))
            max_summary_tokens = max(max_summary_tokens, len(summ))
            labels = ex['omission_labels']
            oracles = ex['oracle_labels']
            for i, p in enumerate(ex['preds']):
                nAll += 1
                turnAll += turn_num
                oracleAll += len(oracles)
                if len(labels[i]) > 0:
                    omitRateAll += 1
                    SomitRateAll += len(labels[i])
                if p['source'] == 'bart_large':
                    nbartL += 1
                    turnbartL += turn_num
                    oraclebartL += len(oracles)
                    if len(labels[i]) > 0:
                        omitRatebartL += 1
                        SomitRatebartL += len(labels[i])
                elif p['source'] == 'bart_base':
                    nbartB += 1
                    turnbartB += turn_num
                    oraclebartB += len(oracles)
                    if len(labels[i]) > 0:
                        omitRatebartB += 1
                        SomitRatebartB += len(labels[i])
                elif p['source'] == 't5_base':
                    nt5B += 1
                    turnt5B += turn_num
                    oraclet5B += len(oracles)
                    if len(labels[i]) > 0:
                        omitRatet5B += 1
                        SomitRatet5B += len(labels[i])
                elif p['source'] == 't5_small':
                    nt5S += 1
                    turnt5S += turn_num
                    oraclet5S += len(oracles)
                    if len(labels[i]) > 0:
                        omitRatet5S += 1
                        SomitRatet5S += len(labels[i])
                elif p['source'] == 'baseline':
                    nbaseline += 1
                    turnbaseline += turn_num
                    oraclebaseline += len(oracles)
                    if len(labels[i]) > 0:
                        omitRatebaseline += 1
                        SomitRatebaseline += len(labels[i])
                else:
                    npegasus += 1
                    turnpegasus += turn_num
                    oraclepegasus += len(oracles)
                    if len(labels[i]) > 0:
                        omitRatepegasus += 1
                        SomitRatepegasus += len(labels[i])

        print("%s %s:\nnumber: %d\nAvg. turns: %.2f\nAvg. dialog length: %.2f\nAvg. turn length: %.2f\nAvg. summary length: %.2f" %
            (data_path, split, num, turns / num, tokens / num, tokens / turns, summary_tokens / num))
        print("Max turns: %d\nMax dialog length: %d\nMax summary length: %d" %
            (max_turns, max_tokens, max_summary_tokens))
        print("All/bartL/bartB/t5B/t5S/baseline/pegasus: %d/%d/%d/%d/%d/%d/%d" %
            (nAll, nbartL, nbartB, nt5B, nt5S, nbaseline, npegasus))
        print("All/bartL/bartB/t5B/t5S/baseline/pegasus: %.2f/%.2f/%.2f/%.2f/%.2f/%.2f/%.2f" %
            (omitRateAll / (nAll+1e-8) * 100,
             omitRatebartL / (nbartL+1e-8) * 100,
             omitRatebartB / (nbartB+1e-8) * 100,
             omitRatet5B / (nt5B+1e-8) * 100,
             omitRatet5S / (nt5S+1e-8) * 100,
             omitRatebaseline / (nbaseline+1e-8) * 100,
             omitRatepegasus / (npegasus+1e-8) * 100))
        print("All/bartL/bartB/t5B/t5S/baseline/pegasus: %.2f/%.2f/%.2f/%.2f/%.2f/%.2f/%.2f\n" %
            (SomitRateAll / (turnAll+1e-8) * 100,
             SomitRatebartL / (turnbartL+1e-8) * 100,
             SomitRatebartB / (turnbartB+1e-8) * 100,
             SomitRatet5B / (turnt5B+1e-8) * 100,
             SomitRatet5S / (turnt5S+1e-8) * 100,
             SomitRatebaseline / (turnbaseline+1e-8) * 100,
             SomitRatepegasus / (turnpegasus+1e-8) * 100))
        print("All/bartL/bartB/t5B/t5S/baseline/pegasus: %.2f/%.2f/%.2f/%.2f/%.2f/%.2f/%.2f\n" %
            (SomitRateAll / (oracleAll+1e-8) * 100,
             SomitRatebartL / (oraclebartL+1e-8) * 100,
             SomitRatebartB / (oraclebartB+1e-8) * 100,
             SomitRatet5B / (oraclet5B+1e-8) * 100,
             SomitRatet5S / (oraclet5S+1e-8) * 100,
             SomitRatebaseline / (oraclebaseline+1e-8) * 100,
             SomitRatepegasus / (oraclepegasus+1e-8) * 100))
        results.append([data_path.split('/')[1], split, num, turns / num, tokens / num, tokens / turns, summary_tokens / num,
                        max_turns, max_tokens, max_summary_tokens, nAll, nbartL, nbartB, nt5B, nt5S, nbaseline, npegasus,
                        omitRateAll / (nAll+1e-8) * 100,
                        omitRatebartL / (nbartL+1e-8) * 100,
                        omitRatebartB / (nbartB+1e-8) * 100,
                        omitRatet5B / (nt5B+1e-8) * 100,
                        omitRatet5S / (nt5S+1e-8) * 100,
                        omitRatebaseline / (nbaseline+1e-8) * 100,
                        omitRatepegasus / (npegasus+1e-8) * 100,
                        SomitRateAll / (turnAll+1e-8) * 100,
                        SomitRatebartL / (turnbartL+1e-8) * 100,
                        SomitRatebartB / (turnbartB+1e-8) * 100,
                        SomitRatet5B / (turnt5B+1e-8) * 100,
                        SomitRatet5S / (turnt5S+1e-8) * 100,
                        SomitRatebaseline / (turnbaseline+1e-8) * 100,
                        SomitRatepegasus / (turnpegasus+1e-8) * 100,
                        SomitRateAll / (oracleAll+1e-8) * 100,
                        SomitRatebartL / (oraclebartL+1e-8) * 100,
                        SomitRatebartB / (oraclebartB+1e-8) * 100,
                        SomitRatet5B / (oraclet5B+1e-8) * 100,
                        SomitRatet5S / (oraclet5S+1e-8) * 100,
                        SomitRatebaseline / (oraclebaseline+1e-8) * 100,
                        SomitRatepegasus / (oraclepegasus+1e-8) * 100])
        df = pd.DataFrame(results)
df.to_excel(output_excel, "Sheet1", index=False)
output_excel.close()

/home/gaya/group1/OLDS/data/tweetsumm/tweetsumm/omission.save train:
number: 879
Avg. turns: 10.50
Avg. dialog length: 243.99
Avg. turn length: 23.23
Avg. summary length: 48.16
Max turns: 25
Max dialog length: 716
Max summary length: 113
All/bartL/bartB/t5B/t5S/baseline/pegasus: 8790/0/2426/0/1986/4378/0
All/bartL/bartB/t5B/t5S/baseline/pegasus: 97.42/0.00/92.13/0.00/98.19/100.00/0.00
All/bartL/bartB/t5B/t5S/baseline/pegasus: 26.08/0.00/19.71/0.00/22.56/31.35/0.00

All/bartL/bartB/t5B/t5S/baseline/pegasus: 41.79/0.00/31.23/0.00/36.27/50.48/0.00

/home/gaya/group1/OLDS/data/tweetsumm/tweetsumm/omission.save validation:
number: 110
Avg. turns: 10.17
Avg. dialog length: 226.05
Avg. turn length: 22.22
Avg. summary length: 48.41
Max turns: 19
Max dialog length: 453
Max summary length: 86
All/bartL/bartB/t5B/t5S/baseline/pegasus: 660/0/220/0/220/220/0
All/bartL/bartB/t5B/t5S/baseline/pegasus: 96.67/0.00/92.27/0.00/97.73/100.00/0.00
All/bartL/bartB/t5B/t5S/baseline/pegasus: 25.83/0.00/20.96/0

  df.to_excel(output_excel, "Sheet1", index=False)


In [None]:
# -*- encoding: utf-8 -*-
 
# 0.0~0.20 (slight), 0.21~0.40 (fair), 0.41~0.60 (moderate)
# 0.61~0.80 (substantial), 0.81~1 (almost perfect)
 
import numpy as np
import pandas as pd

 
def kappa(testData, k):
    dataMat = np.mat(testData)
    P0 = 0.0
    for i in range(k):
        P0 += dataMat[i, i]*1.0
    xsum = np.sum(dataMat, axis=1)
    ysum = np.sum(dataMat, axis=0)

    Pe  = float(ysum*xsum)/k**2
    P0 = float(P0/k*1.0)
    cohens_coefficient = float((P0-Pe)/(1-Pe))
    return cohens_coefficient
 
def fleiss_kappa(testData, N, k, n): # N samples, k classes, n annotators.
    dataMat = np.mat(testData, float)
    oneMat = np.ones((k, 1))
    sum = 0.0
    P0 = 0.0
    for i in range(N):
        temp = 0.0
        for j in range(k):
            sum += dataMat[i, j]
            temp += 1.0*dataMat[i, j]**2
        temp -= n
        temp /= (n-1)*n
        P0 += temp
    P0 = 1.0*P0/N
    ysum = np.sum(dataMat, axis=0)
    for i in range(k):
        ysum[0, i] = (ysum[0, i]/sum)**2
    Pe = ysum*oneMat*1.0
    ans = (P0-Pe)/(1-Pe)
    return ans[0, 0]
 
 
if __name__ == "__main__":

    data = [[0, 0] for _ in range(200)]
    # A annotator
    excel_a = pd.read_excel('annotation/A/data_dialsumm-A.xlsx')
    # Read the values of the file in the dataframe
    data_a = pd.DataFrame(excel_a, columns=['Omission Labels', 'Omission Words', 'Accept (Y/N)',
                                          'New Omission Labels', 'New Omission Words', 'Omission Error'])
    cnt_a = 0
    data_a = list(zip(*[data_a[key] for key in data_a.keys()]))
    for idx, ex in enumerate(data_a):
        la, w, acc, new_la, new_w, err = ex
        if acc == 'Y':
            cnt_a += 1
            data[idx][1] += 1
        elif la == new_la:
            cnt_a += 1
            data[idx][1] += 1
        else:
            data[idx][0] += 1
    print(cnt_a)

    # B annotator
    excel_b = pd.read_excel('annotation/B/data_dialsumm-B.xlsx')
    # Read the values of the file in the dataframe
    data_b = pd.DataFrame(excel_b, columns=['Omission Labels', 'Omission Words', 'Accept (Y/N)',
                                          'New Omission Labels', 'New Omission Words', 'Omission Error'])
    cnt_b = 0
    data_b = list(zip(*[data_b[key] for key in data_b.keys()]))
    for idx, ex in enumerate(data_b):
        la, w, acc, new_la, new_w, err = ex
        if acc == 'Y':
            cnt_b += 1
            data[idx][1] += 1
        elif la == new_la:
            cnt_b += 1
            data[idx][1] += 1
        else:
            data[idx][0] += 1
    print(cnt_b)

    # C annotator
    excel_c = pd.read_excel('annotation/C/data_dialsumm-C.xlsx')
    # Read the values of the file in the dataframe
    data_c = pd.DataFrame(excel_c, columns=['Omission Labels', 'Omission Words', 'Accept (Y/N)',
                                          'New Omission Labels', 'New Omission Words', 'Omission Error'])
    cnt_c = 0
    data_c = list(zip(*[data_c[key] for key in data_c.keys()]))
    for idx, ex in enumerate(data_c):
        la, w, acc, new_la, new_w, err = ex
        if acc == 'Y':
            cnt_c += 1
            data[idx][1] += 1
        elif la == new_la:
            cnt_c += 1
            data[idx][1] += 1
        else:
            data[idx][0] += 1
    print(cnt_c)
    print("%.4f, %.4f%%" % ((cnt_a + cnt_b + cnt_c) / 3., (cnt_a + cnt_b + cnt_c) / (200 * 3.) * 100))

    res = fleiss_kappa(data, 200, 2, 3)
    print(res)
 

In [6]:
# oracle omission distributions
import json
import pandas as pd
from collections import defaultdict
from datasets import load_from_disk
from pandas import ExcelWriter

dataset = load_from_disk("data/tweetsumm/omission.save")
output_excel = ExcelWriter("test_results.xlsx")
results = []

# uttrnum = defaultdict(int)
# oraclenum = defaultdict(int)
# omitnum = defaultdict(int)
# for ex in dataset['test']:
#     cand_num = len(ex['preds'])
#     turn_num = len(ex['dialogue'].replace('\r\n', '\n').split('\n'))
#     for idx in range(turn_num):
#         uttrnum[idx] += cand_num
#     for item in ex['oracle_labels']:
#         oraclenum[item] += cand_num
#     for la in ex['omission_labels']:
#         for item in la:
#             omitnum[item] += 1

# uttrnum = sorted(uttrnum.items(), key=lambda x: x[0])
# oraclenum = sorted(oraclenum.items(), key=lambda x: x[0])
# omitnum = sorted(omitnum.items(), key=lambda x: x[0])

for ex in dataset['test']:
    turn_num = len(ex['dialogue'].replace('\r\n', '\n').split('\n'))
    turn_class = min((turn_num-1) // 2 + 1, 10)
    for la in ex['omission_labels']:
        for item in la:
            results.append([turn_class, item / (turn_num-1)])

df = pd.DataFrame(results)
df.to_excel(output_excel, "Sheet1", index=False)
output_excel.save()

In [6]:
%cd ..

/home/gaya/group1/OLDS


In [13]:
!pip install git+https://github.com/google-research/bleurt.git

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-3vru4wqw
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /tmp/pip-req-build-3vru4wqw
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25ldone
Collecting tensorflow (from BLEURT==0.0.2)
  Downloading tensorflow-2.18.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tf-slim>=1.1 (from BLEURT==0.0.2)
  Downloading tf_slim-1.1.0-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow->BLEURT==0.0.2)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow->BLEURT==0.0.2)
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,

In [14]:
# Candidate Summary Evaluation
import json
import nltk
from statistics import mean
from src.others.metric import Metric
from datasets import load_from_disk, load_metric
from pandas import ExcelWriter
from tqdm.auto import tqdm

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

def word_tokenize(preds, labels):
    preds = [nltk.word_tokenize(pred) for pred in preds]
    labels = [nltk.word_tokenize(label) for label in labels]
    return preds, labels

stop_word_set = set()
with open('src/others/stop_word_list', 'r') as f:
    for word in f:
        stop_word_set.add(word.strip())

dataset = load_from_disk("/home/gaya/group1/OLDS/data/tweetsumm/tweetsumm/omission.save")

metric = Metric()
refs = []
cands = []

omit_rates = []

for ex in dataset['test']:
    turns = ex['dialogue'].replace('\r\n', '\n').split('\n')
    oracle_word = set([w for w in nltk.word_tokenize(ex['summary'].lower()) if w not in stop_word_set])
    oracle_texts = " ".join([turns[idx].lower() for idx in ex['oracle_labels']])
    oracle_num = len(set([w for w in nltk.word_tokenize(oracle_texts) if w in oracle_word]))
    ref = ex['summary']
    # print(f"ref: {ref}")
    # print(f"preds: {ex['preds']}")
    # print(f"omission_words: {ex['omission_words']}")
    for i, p in enumerate(ex['preds']):
        print(p["pred"])
        cand = p['pred']
        omit_rates.append(len(sum(ex['omission_words'][i], [])) / (oracle_num+1e-8))
        print(omit_rates)
        refs.append(ref)
        cands.append(cand)

print("Total pairs: %d" % len(refs))
print("Avg omit rate %.4f" % mean(omit_rates))

cands, refs = postprocess_text(cands, refs)

tokenized_cands, tokenized_refs = word_tokenize(cands, refs)

# compute bleu
bleu_score = metric.bleu._compute(tokenized_cands, [[ref] for ref in tokenized_refs])
bleu_score = {'bleu': round(bleu_score['bleu'] * 100, 2)}

# compute rouge
metric.rouge.add_batch(predictions=cands, references=refs)
rouge_score = metric.rouge.compute(use_stemmer=True)
rouge_score = {key: value.mid.fmeasure * 100 for key, value in rouge_score.items()}
rouge_score = {k: round(v, 2) for k, v in rouge_score.items()}

# compute bertscore roberta-base
metric.bertscore.add_batch(predictions=cands, references=refs)
bert_score_base = metric.bertscore.compute(model_type='roberta-base')
bert_score_base = {'precision': round(mean(bert_score_base['precision']) * 100, 2),
                   'recall': round(mean(bert_score_base['recall']) * 100, 2),
                   'f1': round(mean(bert_score_base['f1']) * 100, 2)}

# compute bertscore roberta-large
metric.bertscore.add_batch(predictions=cands, references=refs)
bert_score_large = metric.bertscore.compute(model_type='roberta-large')
bert_score_large = {'precision': round(mean(bert_score_large['precision']) * 100, 2),
                    'recall': round(mean(bert_score_large['recall']) * 100, 2),
                    'f1': round(mean(bert_score_large['f1']) * 100, 2)}

# compute bleurt score
bleurt_metric = load_metric('bleurt', "BLEURT-20", keep_in_memory=True)
bleurt_score = bleurt_metric._compute(cands, refs)
bleurt_score = {'score': round(mean(bleurt_score['scores']) * 100, 2)}

print({
    'rouge': rouge_score,
    'bleu': bleu_score,
    'bertscore_base': bert_score_base,
    'bertscore_large': bert_score_large,
    'bleurt': bleurt_score
})
print(mean(omit_rates)* 100)
print(rouge_score['rouge1'])
print(rouge_score['rouge2'])
print(rouge_score['rougeLsum'])
print(bleu_score['bleu'])
print(bert_score_base['f1'])
print(bert_score_large['f1'])
print(bleurt_score['score'])

The customer says that his watchlist is not updating with new episodes (past couple days).
The agent asks to try navigating to season / episode manually and suggests to check the show page for these shows as the new eps will be there soon.
[0.4444444439506172]
Customer is complaining that their watchlist is not updating with new episodes (past couple days).
Agent updates that they are looking into this and suggests to check the show page for these shows as the new eps will be there soon.
[0.4444444439506172, 0.5555555549382716]
Customer is complaining that their watchlist is not updating with new episodes (past couple days).
Agent updates that their team is working hard to investigate and hopes to have a fix ready soon.
[0.4444444439506172, 0.5555555549382716, 0.4444444439506172]
Customer is complaining that the watchlist is not updating with new episodes.
Agent updates that the problem seems to be resolved and recommends checking the show page for these shows as the new eps will be th

2024-12-01 15:18:09.790442: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 15:18:09.802643: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733033889.816519 4034223 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733033889.820558 4034223 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 15:18:09.835653: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

ValueError: Value returned by __array__ is not a NumPy array

In [10]:
dataset = load_from_disk("/home/gaya/group1/OLDS/data/tweetsumm/tweetsumm/omission.save")
dataset

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', 'id', 'preds', 'oracle_labels', 'pred_labels', 'omission_labels', 'omission_words'],
        num_rows: 879
    })
    validation: Dataset({
        features: ['dialogue', 'summary', 'id', 'preds', 'oracle_labels', 'pred_labels', 'omission_labels', 'omission_words'],
        num_rows: 110
    })
    test: Dataset({
        features: ['dialogue', 'summary', 'id', 'preds', 'oracle_labels', 'pred_labels', 'omission_labels', 'omission_words'],
        num_rows: 110
    })
})

In [6]:
from datasets import load_from_disk 

tw_omission = load_from_disk("/home/gaya/group1/OLDS/data/tweetsumm/tweetsumm/omission.save")

tw_omission["train"][0]

import json

with open("tweetsum_omission.txt", "w") as f:
    json.dump(tw_omission["train"][0], f, ensure_ascii=False, indent=4)


In [7]:
!pwd

/home/gaya/group1/OLDS/process
