In [64]:
import json

# Read the dataset (JSON Lines: one JSON-encoded meeting record per line).
# Set `split` / `data_path` to match the location of your data.
split = 'test'
data_path = './data/Product/jsonl/' + split + '.jsonl'
# Open with an explicit UTF-8 encoding: the default text encoding of open()
# is platform dependent, which can corrupt or reject non-ASCII characters.
with open(data_path, encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing empty line at the end of the file),
    # which would otherwise make json.loads raise a JSONDecodeError.
    docs = [json.loads(line) for line in f if line.strip()]
n_meetings = len(docs)
print('Total {} meetings in the {} set.'.format(n_meetings, split))

Total 20 meetings in the test set.


In [65]:
# Display the first loaded meeting record to inspect its structure.
docs[0]

{'topic_list': [{'topic': 'Tool training on the conceptual design of remote control related animal characteristics',
   'relevant_text_span': [['10', '97']]},
  {'topic': 'Project plan and target revenue',
   'relevant_text_span': [['84', '215']]},
  {'topic': 'Brainstorming functional design on technical function and detailed design',
   'relevant_text_span': [['97', '236']]}],
 'general_query_list': [{'query': 'Summarize the whole meeting.',
   'answer': 'Project Manager invites teammates to brainstorm animal characteristics on remote control function and suggest a reflection on user experience to improve technical function design. Price goal is 25 Euro, profit aim is fifty million Euro, and sales goal is four million. To summarize, the function design will be user-friendly and practical, in which the technical function will be portable phone based, hand-sized, light with limited but all-functioned buttons.'}],
 'specific_query_list': [{'query': 'What did the group discuss about anim

In [66]:
from nltk import word_tokenize
# Tokenize a sentence.
def tokenize(sent):
    """Lower-case ``sent`` and tokenize it with NLTK's ``word_tokenize``.

    Returns the tokens joined by single spaces, i.e. a string rather than
    a list of tokens.
    """
    lowered = sent.lower()
    return ' '.join(word_tokenize(lowered))

In [67]:
import re
# Filter out noise introduced by speech recognition.
def clean_data(text):
    """Return ``text`` with transcription markers removed and a few
    underscore-spelled acronyms collapsed.

    The substitutions are applied one after another, in the order listed.
    A ``{...}`` marker is only removed when it is immediately followed by
    a single space (the space is removed together with the marker).
    """
    substitutions = (
        ('{vocalsound} ', ''),
        ('{disfmarker} ', ''),
        ('a_m_i_', 'ami'),
        ('l_c_d_', 'lcd'),
        ('p_m_s', 'pms'),
        ('t_v_', 'tv'),
        ('{pause} ', ''),
        ('{nonvocalsound} ', ''),
        ('{gap} ', ''),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

In [68]:
import pandas as pd
import nltk

# Build two tables from the loaded meetings:
#   * train_data -> one row per sentence of every transcript turn
#   * test_data  -> one row per (topic, relevant text span) pair
# NOTE(review): clean_data() and tokenize() defined earlier are not applied
# to the captions here -- confirm whether that is intended.
train_data = {'meeting_id': [], 'caption': [], 'speaker': [], 'caption_group_id': []}
test_data = {'meeting_id': [], 'st': [], 'en': [], 'topic': []}

# meeting_id is the 0-based position of the meeting in `docs`.
for meeting_id, data in enumerate(docs):
    # caption_group_id numbers the turns of a meeting starting from 1; every
    # sentence split out of the same turn shares that id and speaker.
    for caption_id, transcript in enumerate(data['meeting_transcripts'], start=1):
        speaker = transcript['speaker']
        caption = transcript['content']
        for s in nltk.sent_tokenize(caption):
            train_data['meeting_id'].append(meeting_id)
            train_data['caption'].append(s)
            train_data['speaker'].append(speaker)
            train_data['caption_group_id'].append(caption_id)

    # Each topic may list several [start, end] spans, stored as strings in
    # the source data; emit one row per span with integer bounds.
    # NOTE(review): the sample output shows spans starting at 0 while
    # caption_group_id starts at 1 -- confirm how st/en map onto
    # caption_group_id before joining the two tables.
    for topic in data['topic_list']:
        for span in topic['relevant_text_span']:
            st, en = map(int, span)
            test_data['meeting_id'].append(meeting_id)
            test_data['topic'].append(topic['topic'])
            test_data['st'].append(st)
            test_data['en'].append(en)

# Create train and test dataframes; topic spans are ordered by meeting and
# then by span start.
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data).sort_values(by=['meeting_id', 'st'])

In [69]:
# Preview the first rows of the sentence-level table.
train_df.head()

Unnamed: 0,meeting_id,caption,speaker,caption_group_id
0,0,Here we go .,Project Manager,1
1,0,Welcome everybody .,Project Manager,1
2,0,"Um , I'm Abigail Claflin .",Project Manager,1
3,0,You can call me Abbie .,Project Manager,1
4,0,'S see .,Project Manager,1


In [70]:
# Preview the first rows of the topic-span table.
test_df.head()

Unnamed: 0,meeting_id,st,en,topic
0,0,10,97,Tool training on the conceptual design of remo...
1,0,84,215,Project plan and target revenue
2,0,97,236,Brainstorming functional design on technical f...
3,1,0,47,Introduction of a prototype
4,1,48,76,Evaluation criteria of remote control


In [71]:
# Write both tables to CSV files in the current working directory;
# index=False leaves the DataFrame index out of the files.
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)