# QMSum

In [7]:
import json
# Load one split of the dataset: every line of the .jsonl file is one
# JSON-encoded meeting.
# Point `path` at the directory that holds your copy of the data.
split = 'val'  # alternatives: 'train', 'test'
path = "/home"
data_path = f'{path}/QMSum-main/data/ALL/jsonl/{split}.jsonl'
data = []
with open(data_path) as f:
    # decode the file line by line, keeping the meetings in file order
    data.extend(map(json.loads, f))
n_meetings = len(data)
print(f'Total {n_meetings} meetings in the {split} set.')

Total 35 meetings in the val set.


In [2]:
from nltk import word_tokenize
# tokenize a sentence
def tokenize(sent):
    """Lower-case `sent`, split it with `word_tokenize`, and return the
    tokens joined back into one string separated by single spaces."""
    lowered = sent.lower()
    return ' '.join(word_tokenize(lowered))

In [3]:
# remove noise introduced by speech recognition
def clean_data(text):
    """Return `text` with transcription markers removed and a few
    underscore-spelled abbreviations rewritten.

    The substitutions are applied one after another in the order listed
    below, so a later pattern may match text produced by an earlier removal.
    """
    substitutions = (
        ('{ vocalsound }', ''),
        ('{ disfmarker }', ''),
        ('{ comment }', ''),
        ('a_m_i_', 'ami'),
        ('l_c_d_', 'lcd'),
        ('p_m_s', 'pms'),
        ('t_v_', 'tv'),
        ('{ pause }', ''),
        ('{ nonvocalsound }', ''),
        ('{ gap }', ''),
    )
    for pattern, replacement in substitutions:
        text = text.replace(pattern, replacement)
    return text

In [8]:
# Build Seq2Seq examples.
# Every query (general and specific) is paired with the ENTIRE meeting
# transcript as its input.
bart_data = []
for meeting in data:
    # Flatten the meeting into one string of "speaker: tokenized content" turns.
    src = ' '.join(
        turn['speaker'].lower() + ': ' + tokenize(turn['content'])
        for turn in meeting['meeting_transcripts']
    )
    # The cleaned transcript is the same for every query of this meeting, so
    # it is computed once here rather than once per query.
    cleaned_src = clean_data(src)
    # General queries are emitted before specific ones for each meeting.
    for query_list in (meeting['general_query_list'], meeting['specific_query_list']):
        for item in query_list:
            bart_data.append({
                'instruction': clean_data(tokenize(item['query'])),
                'input': cleaned_src,
                # answers are tokenized only; clean_data is not applied to them
                'output': tokenize(item['answer']),
            })

print('Total {} query-summary pairs in the {} set'.format(len(bart_data), split))
print(bart_data[2])
# One JSON object per line.
with open(f'{path}/datasets/QMSum/' + split + '.jsonl', 'w', encoding='utf-8') as f:
    for example in bart_data:
        f.write(json.dumps(example) + '\n')

Total 1257 query-summary pairs in the train set
{'instruction': 'how did marketing design the product evaluation ?', 'input': "project manager: yep . soon as i get this . okay . this is our last meeting . um i 'll go ahead and go through the minutes from the previous meeting . uh and then we 'll have a , the prototype presentation .  um then we will um do an evaluation . uh or we 'll see what , what we need to have under the criteria for the evaluation . then we 'll go through the finance and see if we fall within the budget . um then we 'll do the evaluation , and then we can finish up after that with um any changes that we 'll need to make , or hopefully everything will fall right in line . um let 's see , minutes from the last meeting . um we looked at uh the the trends . we had uh the fashion trends that people want a fancy look-and-feel . it was twice as important as anything else . um they liked fruit and vegetables in the new styles . um and a spongy feel . so we were talking ab

In [8]:
# Build BART examples from gold evidence spans.
# General queries are paired with the entire meeting transcript; specific
# queries are paired only with the turns inside their relevant text spans.
bart_data_gold = []
for meeting in data:
    # Tokenize every turn exactly once; the full transcript and every
    # per-query span below are assembled from this list.
    turns = [
        turn['speaker'].lower() + ': ' + tokenize(turn['content'])
        for turn in meeting['meeting_transcripts']
    ]
    entire_src = ' '.join(turns)
    # Identical for all general queries of this meeting, so clean it once.
    cleaned_entire_src = clean_data(entire_src)
    for item in meeting['general_query_list']:
        bart_data_gold.append({
            'instruction': clean_data(tokenize(item['query'])),
            'input': cleaned_entire_src,
            # answers are tokenized only; clean_data is not applied to them
            'output': tokenize(item['answer']),
        })
    for item in meeting['specific_query_list']:
        # Collect the turns covered by each gold span of this query.
        span_turns = []
        for span in item['relevant_text_span']:
            assert len(span) == 2
            st, ed = int(span[0]), int(span[1])
            # Both endpoints are included. Index turn by turn (no slicing) so
            # an out-of-range span raises IndexError instead of being
            # silently truncated.
            span_turns.extend(turns[k] for k in range(st, ed + 1))
        src = ' '.join(span_turns)
        bart_data_gold.append({
            'instruction': clean_data(tokenize(item['query'])),
            'input': clean_data(src),
            'output': tokenize(item['answer']),
        })

print('Total {} query-summary pairs in the {} set'.format(len(bart_data_gold), split))
print(bart_data_gold[2])
# One JSON object per line.
with open(f'{path}/datasets/QMSum_gold_clean/' + split + '.jsonl', 'w', encoding='utf-8') as f:
    for example in bart_data_gold:
        f.write(json.dumps(example) + '\n')

Total 272 query-summary pairs in the val set
{'instruction': 'what was said on speech overlap ?', 'input': "phd d: uh - huh . ok . professor e: so .  right . phd f: as opposed to the rest of us . phd d: well -   ok . i  i remind that me  my first objective eh , in the project is to  to study difference parameters to  to find a  a good solution to detect eh , the overlapping zone in eh speech recorded . but eh ,  tsk ,   ehhh  in that way  i   i   i begin to  to study and to analyze the ehn  the recorded speech eh the different session to  to find and to locate and to mark eh the  the different overlapping zone . and eh so eh i was eh  i am transcribing the  the first session and i  i have found eh , eh one thousand acoustic events , eh besides the overlapping zones , eh i  i  i mean the eh breaths eh aspiration eh , eh , talk eh , eh , clap , eh   i do n't know what is the different names eh you use to  to name the  the  n speech phd a: nonspeech sounds ? phd d: yeah . grad g: oh , i d

In [7]:
import nltk
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# read the 'punkt' package, while recent releases look for 'punkt_tab' and
# raise LookupError if only 'punkt' is installed, so request both.
# nltk.download() reports a package missing from the index by returning False
# rather than raising, so the extra request is harmless on older versions.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /home1/caojie/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
# Read the processed 'train' split back from datasets/QMSum, one JSON object
# per line.
# NOTE: this rebinds `split` and `data`; from here on `data` holds the
# decoded records from this file.
split = 'train'
with open(f'{path}/datasets/QMSum/{split}.jsonl', 'r') as f:
    lines = list(f)
data = list(map(json.loads, lines))

In [15]:
# Whitespace-delimited token counts for every example's input and output.
input_lens = [len(example['input'].split()) for example in data]
output_lens = [len(example['output'].split()) for example in data]

# Mean input length, then mean output length.
print(sum(input_lens) / len(input_lens))
print(sum(output_lens) / len(output_lens))

10571.826571201273
77.92521877486078


In [4]:
# Parse the extracted-span file. Each line is split on '</s>': the first piece
# (with '<s>' removed) is the query and the second piece is the input text.
with open(f'{path}/QMSum-main/extracted_span/'+split+'.txt', 'r') as f:
    lines = f.readlines()
querys = []
inputs = []
for line in lines:
    # A whitespace-only line has no '</s>' separator and would otherwise
    # fail with IndexError when the second piece is read.
    if not line.strip():
        continue
    parts = line.split('</s>')
    # Local names deliberately avoid `input`, which would shadow the builtin
    # input() for the rest of the kernel session.
    querys.append(parts[0].replace('<s>', ''))
    inputs.append(parts[1])
    

In [12]:
# Print the number of loaded examples next to the number of extracted-span
# inputs (presumably to confirm the two line up one-to-one).
print(len(data),len(inputs))

1257 1257


# CovidET

In [None]:
import json
# Select the CovidET split and load its JSON file.
tag = 'train'  # alternative: 'test'
path = "/home"
datafile = f'{path}/CovidET-main/data/train_val_test_anonymized-WITH_POSTS/{tag}_anonymized-WITH_POSTS.json'
# NOTE(review): the name `all` shadows the builtin all(); it is kept because
# the following cell reads the loaded mapping under this name.
with open(datafile, 'r') as f:
    all = json.load(f)

In [None]:
# Flatten the CovidET annotations into JSONL: one record per annotation that
# has a non-empty 'Abstractive' field.
save_path = f'{path}/datasets/CovidET/{tag}.jsonl'
# The file is opened once, in write mode, so re-running the cell rewrites it
# instead of appending a second copy of every record. ensure_ascii=False
# emits raw non-ASCII characters, hence the explicit UTF-8 encoding.
with open(save_path, 'w', encoding='utf-8') as f:
    for v in all.values():
        article = v['Reddit Post']
        for annotations in v['Annotations'].values():
            for x in annotations:
                if x.get('Abstractive'):
                    # A fresh dict per record; the notebook-wide `data`
                    # variable is left untouched.
                    record = {
                        'article': article,
                        'abstract': x['Abstractive'],
                        'phrases': x['Emotion'],
                    }
                    f.write(json.dumps(record, ensure_ascii=False) + '\n')

# SQuALITY Dataset

In [None]:
import json
# Select the SQuALITY split and decode its .jsonl file, one JSON object per
# line.
tag = 'test'
path = "/home"
with open(f'{path}/SQuALITY/{tag}.jsonl', 'r') as f:
    lines = list(f)
data = list(map(json.loads, lines))

In [None]:
# Flatten SQuALITY into JSONL: one record per (document, question, response).
# The file is opened once, in write mode, so re-running the cell rewrites it
# instead of appending a second copy of every record. ensure_ascii=False
# emits raw non-ASCII characters, hence the explicit UTF-8 encoding.
with open(f'{path}/datasets/SQuALITY/{tag}.jsonl', 'w', encoding='utf-8') as f:
    for x in data:
        document = x['document']
        for question in x['questions']:
            query = question['question_text']
            for response in question['responses']:
                one = {
                    'input': document,
                    'instruction': query,
                    'output': response['response_text'],
                }
                f.write(json.dumps(one, ensure_ascii=False) + '\n')