##### Install libraries

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m124.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [2]:
!git clone https://github.com/Yale-LILY/QMSum.git

Cloning into 'QMSum'...
remote: Enumerating objects: 806, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 806 (delta 1), reused 0 (delta 0), pack-reused 804[K
Receiving objects: 100% (806/806), 13.76 MiB | 18.27 MiB/s, done.
Resolving deltas: 100% (446/446), done.


### PREPROCESSING

##### PREPROCESSING FUNCTIONS

In [3]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
import torch

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# filter some noises caused by speech recognition
def clean_data(text):
    """Strip transcription artefacts from a transcript string.

    Bracketed annotation tags (each followed by one space) are deleted and a
    few spelled-out acronyms are collapsed. The substitutions are applied one
    after another, in the order listed, and matching is case-sensitive.

    Args:
        text: raw transcript or query string.

    Returns:
        The string with every listed pattern replaced.
    """
    # (pattern, replacement) pairs; order is significant and mirrors the
    # sequence in which they are applied.
    substitutions = (
        ('{ vocalsound } ', ''),
        ('{ disfmarker } ', ''),
        ('a_m_i_', 'ami'),
        ('l_c_d_', 'lcd'),
        ('p_m_s', 'pms'),
        ('t_v_', 'tv'),
        ('{ pause } ', ''),
        ('{ nonvocalsound } ', ''),
        ('{ gap } ', ''),
    )
    for pattern, replacement in substitutions:
        text = text.replace(pattern, replacement)
    return text

In [5]:
import re

# Function to preprocess and clean text
def preprocess_text(text):
    """Normalise a piece of text for embedding.

    Steps, in order: run ``clean_data``, drop every character that is not an
    ASCII letter, digit or whitespace, lowercase, and trim surrounding
    whitespace.

    Args:
        text: raw string.

    Returns:
        The normalised string.
    """
    cleaned = clean_data(text)
    # Keep only [a-zA-Z0-9] and whitespace; punctuation and underscores go.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned)
    return alnum_only.lower().strip()

##### LOAD AND PREPROCESS TRAINING DATASET

In [10]:
import json
import pandas as pd

# read the dataset
# please enter the path of your data

# Which QMSum split to read; the path is relative to the cloned repository.
split = 'train'
data_path = 'QMSum/data/ALL/jsonl/' + split + '.jsonl'

# The file holds one JSON document per line; parse each into a dict.
with open(data_path) as f:
    data = [json.loads(line) for line in f]

# Convert JSON data to a dataframe
df = pd.DataFrame(data)

In [11]:
# Preprocess the data and create a new dataframe
# Accumulates one cleaned record per meeting.
preprocessed_data = []

for item in data:
    preprocessed_item = {}

    # Topics: clean the topic label, keep the span annotation untouched.
    preprocessed_topic_list = [
        {'topic': preprocess_text(topic['topic']),
         'relevant_text_span': topic['relevant_text_span']}
        for topic in item['topic_list']
    ]
    preprocessed_item['topic_list'] = preprocessed_topic_list

    # General queries: clean both the query and its reference answer.
    preprocessed_general_query_list = [
        {'query': preprocess_text(query['query']),
         'answer': preprocess_text(query['answer'])}
        for query in item['general_query_list']
    ]
    preprocessed_item['general_query_list'] = preprocessed_general_query_list

    # Specific queries: as above, plus the untouched span annotation.
    preprocessed_specific_query_list = [
        {'query': preprocess_text(query['query']),
         'answer': preprocess_text(query['answer']),
         'relevant_text_span': query['relevant_text_span']}
        for query in item['specific_query_list']
    ]
    preprocessed_item['specific_query_list'] = preprocessed_specific_query_list

    # Meeting transcripts: clean each turn's content exactly once and keep
    # only turns with more than 5 words after cleaning.
    # NOTE(review): dropping turns shifts the positions of the remaining
    # ones, while 'relevant_text_span' is copied unchanged -- confirm the
    # spans are still meant to index this filtered list.
    preprocessed_meeting_transcripts = []
    for transcript in item['meeting_transcripts']:
        content = preprocess_text(transcript['content'])
        if len(content.split()) > 5:
            preprocessed_meeting_transcripts.append(
                {'speaker': preprocess_text(transcript['speaker']),
                 'content': content}
            )
    preprocessed_item['meeting_transcripts'] = preprocessed_meeting_transcripts

    preprocessed_data.append(preprocessed_item)

# Create a new dataframe from the preprocessed data
prep_df = pd.DataFrame(preprocessed_data)

### RUN TO LOAD LONGFORMER

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import LongformerModel, LongformerTokenizer

# Compute device: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Hugging Face checkpoint identifier for the Longformer encoder.
model_name = "allenai/longformer-base-4096"

# Load the pretrained encoder, then move its weights to the chosen device.
model = LongformerModel.from_pretrained(model_name)
model = model.to(device)

# Tokenizer that belongs to the same checkpoint.
tokenizer = LongformerTokenizer.from_pretrained(model_name)

### RUN TO LOAD ELECTRA

In [6]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import ElectraModel, ElectraTokenizer

# Compute device: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Hugging Face checkpoint identifier for the ELECTRA discriminator encoder.
model_name = "google/electra-base-discriminator"

# Load the pretrained encoder, then move its weights to the chosen device.
model = ElectraModel.from_pretrained(model_name)
model = model.to(device)

# Tokenizer that belongs to the same checkpoint.
tokenizer = ElectraTokenizer.from_pretrained(model_name)

cuda


Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

### RUN TO LOAD BERT

In [15]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import BertModel, BertTokenizer

# Compute device: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Hugging Face checkpoint identifier for the BERT encoder.
model_name = "bert-base-uncased"

# Load the pretrained encoder, then move its weights to the chosen device.
model = BertModel.from_pretrained(model_name)
model = model.to(device)

# Tokenizer that belongs to the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

cuda


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

### RUN TO LOAD ROBERTA



In [19]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import RobertaModel, RobertaTokenizer

# Compute device: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Hugging Face checkpoint identifier for the RoBERTa encoder.
model_name = "roberta-base"

# Load the pretrained encoder, then move its weights to the chosen device.
model = RobertaModel.from_pretrained(model_name)
model = model.to(device)

# Tokenizer that belongs to the same checkpoint.
tokenizer = RobertaTokenizer.from_pretrained(model_name)

cuda


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

## TOKENIZATION, EMBEDDINGS

In [20]:
# Put whichever encoder is currently bound to `model` into evaluation mode
# (this turns off the Dropout layers listed in the repr below).
model.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

##### TURN EMBEDDING, AVERAGING

- TURN EMBEDDING FUNCTION

In [7]:
def turn_embedding(turn):
    """Encode one meeting turn with the currently loaded encoder.

    The speaker name and the utterance are joined with a single space,
    tokenized with truncation and ``padding='max_length'``, and passed
    through the module-level ``model`` without gradient tracking.

    Args:
        turn: mapping with string entries ``'speaker'`` and ``'content'``.

    Returns:
        The encoder's ``last_hidden_state`` with the leading batch dimension
        squeezed away.
    """
    # One-element batch holding "<speaker> <utterance>".
    text_batch = [turn['speaker'] + ' ' + turn['content']]

    encoded = tokenizer(text_batch, truncation=True, padding='max_length', return_tensors='pt')
    ids = encoded['input_ids'].to(device)          # same device as the model
    mask = encoded['attention_mask'].to(device)

    # Feature extraction only: no autograd graph is needed.
    with torch.no_grad():
        hidden = model(input_ids=ids, attention_mask=mask).last_hidden_state

    return hidden.squeeze(0)

- QUERY EMBEDDING & SPAN PROCESSING FUNCTION

In [8]:
def query_span_embedding(meeting):
    """Embed every specific query of a meeting and collect its span target.

    Args:
        meeting: iterable of query records, each with a ``'query'`` string
            and a ``'relevant_text_span'`` list of pairs.

    Returns:
        ``(query_list, span_list)``: ``query_list`` holds one encoder output
        per query with a leading batch dimension of 1; ``span_list`` holds
        one float tensor per query built from the FIRST pair of
        ``'relevant_text_span'`` only.
    """
    query_list, span_list = [], []

    for entry in meeting:
        specific_query, spans = entry['query'], entry['relevant_text_span']

        # ---- span target: only spans[0] is used; later pairs are ignored ----
        span_values = [float(bound) for bound in spans[0]]
        span_list.append(torch.tensor(span_values, device=device, requires_grad=True))

        # ---- query embedding ----
        encoded = tokenizer(specific_query, truncation=True, padding='max_length', return_tensors='pt')
        ids = encoded['input_ids'].to(device)
        mask = encoded['attention_mask'].to(device)

        # Embed the query with the currently loaded encoder, gradients off.
        with torch.no_grad():
            hidden = model(input_ids=ids, attention_mask=mask).last_hidden_state
            query_embedding = hidden.squeeze(0).to(device)

        # Restore the batch dimension and mark the tensor as requiring grad.
        query_list.append(query_embedding.unsqueeze(0).to(device).requires_grad_(True))

    return query_list, span_list

In [9]:
def dictionary(ds):
    """Build the per-meeting embedding dictionary for a preprocessed frame.

    For each row the turns are embedded with ``turn_embedding``, stacked and
    averaged over the turn axis; the specific queries and their spans come
    from ``query_span_embedding``.

    Args:
        ds: frame-like object with ``'meeting_transcripts'`` and
            ``'specific_query_list'`` columns, indexable by row position.

    Returns:
        Dict keyed ``'Meeting <row>'``. Each value is a dict with
        ``'Length'`` (number of turns), ``'Transcript'`` (averaged embedding
        with a leading batch dimension), ``'Query'`` and ``'Spans'``.
    """
    general_dct = {}

    for ind in range(len(ds)):
        print('Processing meeting: ', ind)
        transcripts = ds['meeting_transcripts'][ind]
        specific_query_meeting = ds['specific_query_list'][ind]

        # ---- transcript: embed each turn, stack, average over turns ----
        per_turn = [turn_embedding(turn) for turn in transcripts]
        transcript_tensor = torch.stack(per_turn, dim=0).to(device)
        average_transcript = torch.mean(transcript_tensor, dim=0)
        average_transcript.requires_grad_(True)
        embedded_transcript_3D = average_transcript.unsqueeze(0).to(device).requires_grad_(True)

        # ---- queries and span targets ----
        embedded_query_list, span_list = query_span_embedding(specific_query_meeting)

        # ---- assemble the record (key order: Length, Transcript, Query, Spans) ----
        general_dct[f'Meeting {str(ind)}'] = {
            'Length': len(transcripts),
            'Transcript': embedded_transcript_3D,
            'Query': embedded_query_list,
            'Spans': span_list,
        }

    print('Dictionary created!')
    return general_dct

##### TRAINING DICTIONARY

In [24]:
torch.save(final_dictionary, '/content/drive/MyDrive/NCU/Data Science and Machine Learning/Final Project/Locator/dictionary_final_lc_ELECTRA.pt')

In [None]:
final_dictionary = dictionary(prep_df)

#### TRASH

In [None]:
# NOTE(review): scratch cell kept under "TRASH". It calls `cnn`, which is not
# defined anywhere in the visible notebook, and it does not move the tokenized
# inputs to `device` -- confirm before reusing any of it.
count = 0
query_list = []

# Walk the per-meeting lists of specific queries; `count` stops the walk
# after two meetings.
for i, meeting in enumerate(prep_df['specific_query_list']):
    print('Embedding meeting:', i)

    for ind, llista in enumerate(meeting):

        print('Embedding query: ', ind)
        specific_query = llista['query']

        tokenized_query = tokenizer(specific_query, truncation=True, padding='max_length', return_tensors='pt') # singular query
        
        input_ids = tokenized_query['input_ids']
        attention_mask = tokenized_query['attention_mask']


        #print(input_ids.shape, attention_mask.shape)

        # EMBED EACH QUERY WITH THE CURRENTLY LOADED ENCODER
        with torch.no_grad():
            outputs_query = model(input_ids=input_ids, attention_mask=attention_mask)
            query_embedding = outputs_query.last_hidden_state.squeeze(0)

        # Restore the leading batch dimension.
        query_embedding_3D = query_embedding.unsqueeze(0)
        print('Query embedded!')

                    ##### CNN #####       INPUT SHAPE: [1,512,768], OUTPUT SHAPE: [1,50]

        
        # PASS THE QUERY EMBEDDING INTO THE CNN AND GET THE OUTPUT
        cnn_output = cnn(query_embedding_3D)

        # `query_list` is never reset, so it accumulates across meetings.
        query_list.append(cnn_output)

    print('query_list: ',query_list)
    count += 1

    if count == 2:
        break 

In [None]:
def transcript_embedding(prep_df):
    """Debug helper: embed the first few turns of the first transcript.

    Only the first entry of ``prep_df['meeting_transcripts']`` is processed,
    because the function returns inside the outer loop, and at most three
    turns of it are embedded. The turn embeddings are stacked, averaged over
    the turn axis and returned with a leading batch dimension. Progress is
    reported with ``print``.

    Returns:
        The averaged embedding, or ``None`` when there are no transcripts.
    """
    embedded_turns = 0  # running total of turns embedded so far

    for idx, transcript in enumerate(prep_df['meeting_transcripts']):
        print("Embedding transcript:", idx)

        per_turn = []
        for turn_idx, turn in enumerate(transcript):
            print('Embedding turn: ', turn_idx)
            per_turn.append(turn_embedding(turn))
            print('Embedded turn added to list!')
            embedded_turns += 1

            # Debug cap: stop after three turns.
            if embedded_turns == 3:
                break

        # Stack along a new first dimension, then average over it.
        stacked = torch.stack(per_turn, dim=0)

        print('Averaging transcript tensors...')
        embedded_transcript_3D = torch.mean(stacked, dim=0).unsqueeze(0)
        print('Embedded transcript shape:', embedded_transcript_3D.shape, 'Embedded transcript:', embedded_transcript_3D)

        # Returning here means later transcripts are never visited.
        return embedded_transcript_3D


In [None]:
# TRASH

# EXTRACT INPUT IDS AND ATTENTION MASKS FROM SPEAKERS AND UTTERANCES
# NOTE(review): orphaned fragment -- the indentation suggests it was cut out of
# a function body, and `tokenized_utterance` / `tokenized_speaker` are not
# defined at this level. It cannot run as a standalone cell.
    input_ids_utterance = tokenized_utterance['input_ids'].to(device)
    attention_mask_utterance = tokenized_utterance['attention_mask'].to(device)

    input_ids_speaker = tokenized_speaker['input_ids'].to(device)
    attention_mask_speaker = tokenized_speaker['attention_mask'].to(device)

In [None]:
import torch

# Toy token-id sequence used to try out the tensor-concatenation pattern.
lista = [0, 231, 234, 12, 245, 64, 124, 52, 2, 1, 1, 1, 1, 1, 1]
oscar = torch.tensor(lista)
print(oscar.shape, len(oscar))

# Start from an empty tensor and append one [1, 1] tensor per id.
concat_words = torch.tensor([], dtype=torch.int32)
print(concat_words)
for value in oscar:
    tensor_value = torch.tensor([[value]])
    concat_words = torch.cat((concat_words, tensor_value))
    print(tensor_value.shape,tensor_value)

# Add a leading dimension; the recorded output shows the result as [1, 15, 1].
concat_words_final = concat_words.unsqueeze(0)
print(concat_words_final.shape, concat_words_final)

torch.Size([15]) 15
tensor([], dtype=torch.int32)
torch.Size([1, 1]) tensor([[0]])
torch.Size([1, 1]) tensor([[231]])
torch.Size([1, 1]) tensor([[234]])
torch.Size([1, 1]) tensor([[12]])
torch.Size([1, 1]) tensor([[245]])
torch.Size([1, 1]) tensor([[64]])
torch.Size([1, 1]) tensor([[124]])
torch.Size([1, 1]) tensor([[52]])
torch.Size([1, 1]) tensor([[2]])
torch.Size([1, 1]) tensor([[1]])
torch.Size([1, 1]) tensor([[1]])
torch.Size([1, 1]) tensor([[1]])
torch.Size([1, 1]) tensor([[1]])
torch.Size([1, 1]) tensor([[1]])
torch.Size([1, 1]) tensor([[1]])
torch.Size([1, 15, 1]) tensor([[[  0],
         [231],
         [234],
         [ 12],
         [245],
         [ 64],
         [124],
         [ 52],
         [  2],
         [  1],
         [  1],
         [  1],
         [  1],
         [  1],
         [  1]]])


##### UTTERANCE TOKENIZE, UTTERANCE EMBED, CNN, SPEAKER TOKENIZE, CONCATENATE

In [None]:
def tokenize_embed_cnn(turn):
    """Scratch experiment: embed an utterance token by token and feed it to a CNN.

    NOTE(review): relies on the module-level `tokenizer` and `model`, and on
    `cnn_model`, which is not defined anywhere in the visible notebook.
    `tokenized_speaker` is computed but never used, the token tensors are not
    moved to `device`, and nothing is returned (results are only printed).
    """

    # EXTRACT UTTERANCES AND SPEAKERS FROM EACH TURN
    speakers, utterances = turn['speaker'], turn['content']

    # TOKENIZE UTTERANCES, WITH PADDING = MAX LENGTH = 512
    tokenized_utterance = tokenizer(utterances, truncation=True, padding='max_length', return_tensors='pt') # singular utterance
    
    print(tokenized_utterance)
    # TOKENIZED SPEAKERS MAX_LENGTH = 10 ([0,word1, word2, ... ,word8, 2])
    tokenized_speaker = tokenizer(speakers, truncation=True, padding='max_length',max_length=10, return_tensors='pt') # singular speaker

    # CREATE EMPTY TENSOR TO COLLECT THE EMBEDDED TOKENS LATER
    concat_words = torch.tensor([], dtype=torch.int32)

    # TOKEN EMBEDDING LOOP: EACH TOKEN ID IS EMBEDDED ON ITS OWN AND APPENDED TO THE TENSOR ABOVE
    for word in tokenized_utterance['input_ids'][0]:

        # WRAP THE TOKEN ID IN TWO DIMENSIONS --> SHAPE: [1,1]
        word_2D = torch.tensor([[word]])

        # EMBED EACH TOKEN WITH THE CURRENTLY LOADED ENCODER
        with torch.no_grad():
            outputs_word = model(word_2D)
            word_embedding = outputs_word.last_hidden_state.squeeze(0)

        # APPEND THE TOKEN EMBEDDING TO THE RUNNING TENSOR
        concat_words = torch.cat((concat_words, word_embedding))

    # ADD A NEW DIMENSION TO THE TENSOR SO IT HAS A SHAPE ([1,512,768])
    concat_words_final = concat_words.unsqueeze(0) 

    print(concat_words_final.shape, concat_words_final)



                            ##### CNN #####       INPUT SHAPE: [1,512,768], OUTPUT SHAPE: [1,50]

    
    # PASS THE UTTERANCE EMBEDDING INTO THE CNN AND GET THE OUTPUT

    output = cnn_model(concat_words_final)
    print('this is test output', output)
    print(output.shape)

##### SPEAKERS + UTTERANCES, TOKENIZE, EMBED, CNN

In [None]:
def turn_embed_cnn(turn):
    """Scratch experiment: embed a whole turn token by token.

    The speaker and the utterance are joined with one space and tokenized
    (truncation, ``padding='max_length'``). Every token id is then passed
    through the module-level ``model`` on its own, and the per-token outputs
    are concatenated along the first dimension.

    Fixes relative to the earlier draft: the per-token embeddings are now
    actually accumulated (previously each iteration concatenated onto a
    still-empty tensor, so only the last token survived), token ids are
    moved to ``device`` before the forward pass, and the result is returned
    as well as printed.

    Args:
        turn: mapping with string entries ``'speaker'`` and ``'content'``.

    Returns:
        The concatenated token embeddings with a leading batch dimension.
    """
    # EXTRACT UTTERANCES AND SPEAKERS FROM EACH TURN
    speakers, utterances = turn['speaker'], turn['content']

    # JOIN SPEAKERS AND UTTERANCES TO GET THE TURN
    turn = [speakers + ' ' + utterances]

    # TOKENIZE THE TURN
    tokenized_turn = tokenizer(turn, truncation=True, padding='max_length', return_tensors='pt') # singular turn

    # Collect per-token embeddings in a list and concatenate once at the end.
    word_embeddings = []

    for word in tokenized_turn['input_ids'][0]:
        # Shape [1, 1]: a batch of one sequence holding one token id.
        word_2D = word.reshape(1, 1).to(device)

        # EMBED EACH TOKEN WITH THE CURRENTLY LOADED ENCODER
        with torch.no_grad():
            outputs_word = model(word_2D)
            word_embedding = outputs_word.last_hidden_state.squeeze(0)

        word_embeddings.append(word_embedding)

    # CONCATENATE ALL TOKEN EMBEDDINGS IN A NEW TENSOR
    concatenated_turn = torch.cat(word_embeddings, dim=0)
    print('concatenated turn: ',concatenated_turn.shape, tokenized_turn)

    # ADD A LEADING BATCH DIMENSION
    turn_embed = concatenated_turn.unsqueeze(0)

    print(turn_embed.shape, turn_embed)

    return turn_embed



## VALIDATION

##### PREPROCESS THE VALIDATION MEETING DATA

In [10]:
def val_prep(data, min_words=5):
    """Preprocess one raw meeting record for validation.

    Queries, answers, speakers and turn contents are normalised with
    ``preprocess_text``; ``'relevant_text_span'`` is copied unchanged.

    Turns are kept only when their *preprocessed* content has more than
    ``min_words`` words, which is the same rule the training preprocessing
    cell applies. The earlier version compared the character length of the
    raw content against 4, so training and validation data were filtered
    differently.

    NOTE(review): dropping turns shifts the positions of the remaining ones
    while the span annotations are left untouched -- confirm downstream code
    expects that.

    Args:
        data: mapping with ``'specific_query_list'`` and
            ``'meeting_transcripts'`` entries.
        min_words: a turn is kept when its cleaned content has strictly more
            words than this (default 5, as in the training pipeline).

    Returns:
        A one-element list holding a dict with the keys
        ``'specific_query_list'`` and ``'meeting_transcripts'``.
    """
    ############# PREPROCESS SPECIFIC QUERIES #################
    preprocessed_specific_query_list = [
        {
            'query': preprocess_text(query['query']),
            'answer': preprocess_text(query['answer']),
            'relevant_text_span': query['relevant_text_span'],
        }
        for query in data['specific_query_list']
    ]

    ############### PREPROCESS TRANSCRIPTS #####################
    preprocessed_transcript_list = []
    for turn in data['meeting_transcripts']:
        # Clean once; the cleaned text is used for both the filter and the record.
        content = preprocess_text(turn['content'])
        if len(content.split()) > min_words:
            preprocessed_transcript_list.append(
                {'speaker': preprocess_text(turn['speaker']), 'content': content}
            )

    preprocessed_item = {
        'specific_query_list': preprocessed_specific_query_list,
        'meeting_transcripts': preprocessed_transcript_list,
    }

    # Wrapped in a list so the caller can build a one-row DataFrame from it.
    return [preprocessed_item]

##### VALIDATION EMBEDDING

In [21]:
import os
import json
import pandas as pd


##### LOAD THE VALIDATION MEETING DATA

# Folder with one JSON file per meeting (note: this is the repository's
# `test` split, used here as validation data).
directory = 'QMSum/data/ALL/test/'

# Maps 'meeting: <filename>' -> embedding dictionary for that file.
all_dict = {}

for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)

    # Ignore anything that is not a regular file (e.g. sub-directories).
    if not os.path.isfile(filepath):
        continue

    print('Preprocessing meeting:', filename)

    with open(filepath) as f:
        data = json.load(f)

    # Clean the meeting, wrap it as a one-row frame, and embed it.
    preprocessed_data = val_prep(data)
    val_meeting = pd.DataFrame(preprocessed_data)
    validation_dictionary = dictionary(val_meeting)

    all_dict[f'meeting: {filename}'] = validation_dictionary

Preprocessing meeting: ES2011b.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: Bed008.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: IS1003b.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: ES2011d.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: ES2004c.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: Bmr006.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: TS3004a.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: IS1003a.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: education_13.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: covid_4.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: covid_9.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: education_9.json
Processing meeting:  0
Dictionary created!
Preprocessing meeting: education_4.json
Proce

In [22]:
# Persist the per-file embedding dictionaries built in the previous cell.
# NOTE(review): the file name is hard-coded to "ROBERTA" and is not derived
# from the encoder that is actually loaded -- confirm before running.
torch.save(all_dict, '/content/drive/MyDrive/NCU/Data Science and Machine Learning/Final Project/Data/VAL_TENSOR_ROBERTA.pt')

## LOCATOR MODEL

In [15]:
import torch
import math
import numpy as np
from math import sqrt
from torch import nn
from torch.nn import init
from torch.nn import functional as F
from torchsummary import summary
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm, trange

### MLP

In [16]:
# Here's MLP scoring test
class MLP(nn.Module):
    """Two-layer perceptron that maps a (transcript, query) pair to a span.

    The input features are the two vectors themselves, their cosine
    similarity and the transcript length, concatenated along dim 1. For this
    to match ``fc1``, ``input_dim`` must equal
    ``x1.shape[1] + x2.shape[1] + 2``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.leakyrelu = nn.LeakyReLU(0.2)
        # self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x1, x2, max_len):
        """Score a batch of (transcript, query) feature pairs.

        Args:
            x1: tensor of shape (batch, d1).
            x2: tensor of shape (batch, d2), with d2 == d1 so the cosine
                similarity is defined.
            max_len: scalar transcript length, broadcast to every row.

        Returns:
            Tensor of shape (batch, output_dim) with non-negative entries
            (absolute value of the final linear layer).
        """
        # Keep the similarity inside the autograd graph and on the inputs'
        # device. Rebuilding it with torch.Tensor([[...]]) detached it, forced
        # it onto the CPU and only worked for a batch of one.
        tensor_similarity = F.cosine_similarity(x1, x2).unsqueeze(1)

        # Constant feature column: same dtype and device as x1, no gradient.
        tensor_max_len = x1.new_full((x1.shape[0], 1), float(max_len))

        x = torch.cat((x1, x2, tensor_similarity, tensor_max_len), dim=1)
        x = self.fc1(x)
        x = self.leakyrelu(x)
        x = self.fc2(x)

        # Span boundaries are non-negative, so fold the sign away.
        x = torch.abs(x)

        return x

### LOCATOR AND CNN

In [17]:
# Build a Locater with CNN+MLP
''' CNN accept transcript or query and convert it into average embedding, 
    MLP accept query embedding & transcript embedding and convert it into a 2-D vector as <start, end> of relevant spans '''

class testCNNModel(nn.Module):
    """Pointwise Conv1d followed by a linear projection to 50 features.

    ``conv1d`` expects 512 input channels and emits 256; the linear layer
    expects 196608 (= 256 * 768) flattened features, so inputs must have the
    shape (batch, 512, 768).
    """

    def __init__(self):
        super().__init__()

        # kernel_size=1: mixes the 512 channels independently at each position.
        self.conv1d = nn.Conv1d(in_channels=512, out_channels=256, kernel_size=1)
        # self.relu = nn.ReLU()
        # self.drop = nn.Dropout(0.2)
        # A window of 1 leaves the values unchanged; kept as a named layer.
        self.maxpool = nn.MaxPool1d(kernel_size=1)
        # Fully-connected layer over the flattened (256, 768) feature map.
        self.linear = nn.Linear(196608, 50)

    def forward(self, x):
        # Marks the caller's tensor as requiring grad (in place).
        x = x.requires_grad_(True)
        # x = self.drop(x)
        pooled = self.maxpool(self.conv1d(x))
        # Flatten everything except the batch dimension.
        flat = torch.flatten(pooled, start_dim=1)
        return self.linear(flat)

class Locater(nn.Module):
    """Span locator: one shared CNN encoder feeding an MLP scorer.

    The same ``testCNNModel`` instance encodes both the transcript embedding
    and the query embedding; the ``MLP`` turns the two encodings plus the
    transcript length into the predicted span.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.CNN = testCNNModel()
        self.MLP = MLP(input_dim, hidden_dim, output_dim)

    def forward(self, transcript_emb, query_emb, max_len):
        # Transcript is encoded first, then the query, with shared weights.
        return self.MLP(self.CNN(transcript_emb), self.CNN(query_emb), max_len)


In [13]:
# Mount Google Drive so that the '/content/drive/...' paths used in this
# notebook resolve (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
# Restore the trained locator from Drive. The file is used as a callable
# module further down, so it is presumably a whole pickled model; the
# MLP / testCNNModel / Locater classes must be defined before this cell runs.
# map_location='cpu': spans_to_text moves every input to the CPU before
# calling the locator, so its weights must be on the CPU too, whatever device
# it was saved from.
locator = torch.load(
    '/content/drive/MyDrive/NCU/Data Science and Machine Learning/Final Project/Locator/Copy of locater(BERT).pth',
    map_location='cpu',
)

## SPAN TO TEXT

In [20]:
import pickle
import json

##### TENSOR MEETING DATA ######
# NOTE(review): pickle.load can execute arbitrary code while loading -- only
# open files that you produced yourself.
input_file = '/content/drive/MyDrive/NCU/Data Science and Machine Learning/Final Project/Data/all_dict_tensor.pkl'
with open(input_file, 'rb') as f:
    all_dict = pickle.load(f)

# Alias under the name that spans_to_text is later called with.
tensorMeeting = all_dict

In [21]:
##### TEXT MEETING DATA ######
# Text counterpart of the tensor data: the lookup cell below shows it is keyed
# by 'meeting: <file>.json', each value being a list whose first element holds
# the meeting's 'specific_query_list'.
file_path = '/content/drive/MyDrive/NCU/Data Science and Machine Learning/Final Project/Data/all_dict.json'
with open(file_path, 'r') as file:
    json_data = json.load(file)
    
# Alias under the name that spans_to_text is later called with.
textMeeting = json_data

In [58]:
# Sanity check on one meeting: number of specific queries, and the third query record.
len(textMeeting['meeting: Bed003.json'][0]['specific_query_list']), textMeeting['meeting: Bed003.json'][0]['specific_query_list'][2]

(6,
 {'query': 'summarize what was said on intentionality',
  'answer': 'there was a demonstration of the structure and the function of a toy version of the beliefnet for the intentionality task the features nodes include things like prosody discourse verb choice landmarkiness of a building time of day and whether the admission fee was discussed',
  'relevant_text_span': [['54', '76']]})

In [22]:
def spans_to_text(tensor_meeting, textMeeting, model=None):
    """Turn predicted spans into the text that will be summarised.

    For every query of every meeting the locator predicts ``[start, end]``;
    the turns whose position ``i`` satisfies ``start <= i <= end`` are joined
    as ``"speaker: content. "`` after a ``"<s> query </s> "`` prefix, and the
    string is closed with ``"</s> "``.

    Args:
        tensor_meeting: mapping ``file key -> {meeting name -> record}``,
            where each record has ``'Transcript'`` (tensor), ``'Length'``
            and ``'Query'`` (list of tensors).
        textMeeting: mapping with the same file keys; ``textMeeting[key][0]``
            must hold ``'meeting_transcripts'`` (list of dicts with
            ``'speaker'`` and ``'content'``) and ``'specific_query_list'``
            (dicts with ``'query'`` and ``'answer'``), aligned by position
            with the ``'Query'`` tensors.
        model: callable ``(transcript, query, length) -> tensor`` whose
            ``tolist()[0]`` starts with ``start, end``. Defaults to the
            notebook-level ``locator``.

    Returns:
        Dict ``file key -> {'span<i>': text, 'answer<i>': reference answer}``.
    """
    if model is None:
        # Backward-compatible default: the locator loaded earlier in the notebook.
        model = locator

    device2 = 'cpu'
    meetings_to_summarize = {}

    for smeeting, meetings in tensor_meeting.items():
        for meeting_name, meeting_data in meetings.items():

            length = meeting_data['Length']
            query_set = meeting_data['Query']

            meeting_n = {}

            for ind, tensor_query in enumerate(query_set):

                ######### EXTRACT SPAN WITH LOCATOR ###########

                # Inference only: do not record an autograd graph.
                with torch.no_grad():
                    transcript = meeting_data['Transcript'].to(device2)
                    spanita = model(transcript, tensor_query.to(device2), length)

                spanita_list = spanita.tolist()
                start = spanita_list[0][0]
                end = spanita_list[0][1]

                ########## TURN PREPROCESSING #################

                text_record = textMeeting[smeeting][0]
                turns = text_record['meeting_transcripts']
                query = text_record['specific_query_list'][ind]['query']
                answer = text_record['specific_query_list'][ind]['answer']

                # start / end are floats; a turn is kept when its position
                # lies inside the closed interval.
                selected_turns = [
                    turn['speaker'] + ": " + turn['content'] + ". "
                    for i, turn in enumerate(turns)
                    if start <= i <= end
                ]

                concatenated_content = "<s> " + query + " </s> " + "".join(selected_turns) + "</s> "

                meeting_n[f'span{ind}'] = concatenated_content
                meeting_n[f'answer{ind}'] = answer

            meetings_to_summarize[smeeting] = meeting_n

            print(f'Done with {smeeting}')
    print('Completed!')

    return meetings_to_summarize

In [23]:
# Run the locator over every meeting and collect, per query, the selected
# transcript text together with the reference answer.
FINAL = spans_to_text(tensorMeeting, textMeeting)

Done with meeting: IS1003d.json
Done with meeting: Bro027.json
Done with meeting: TS3011d.json
Done with meeting: TS3004b.json
Done with meeting: Bed008.json
Done with meeting: Bed016.json
Done with meeting: Bmr014.json
Done with meeting: ES2004b.json
Done with meeting: Bro019.json
Done with meeting: covid_9.json
Done with meeting: IS1003c.json
Done with meeting: ES2004c.json
Done with meeting: Bed003.json
Done with meeting: education_17.json
Done with meeting: education_4.json
Done with meeting: ES2004d.json
Done with meeting: covid_4.json
Done with meeting: ES2004a.json
Done with meeting: IS1003b.json
Done with meeting: TS3011a.json
Done with meeting: ES2011a.json
Done with meeting: TS3004c.json
Done with meeting: ES2011b.json
Done with meeting: education_9.json
Done with meeting: TS3011b.json
Done with meeting: ES2011c.json
Done with meeting: education_13.json
Done with meeting: Bmr006.json
Done with meeting: TS3004d.json
Done with meeting: ES2011d.json
Done with meeting: TS3011c.js

In [24]:
import json

# Destination on the mounted Drive.
# NOTE(review): the file name is hard-coded to "BERT" -- confirm it matches
# the locator checkpoint that produced FINAL.
output_folder = '/content/drive/MyDrive/NCU/Data Science and Machine Learning/Final Project/Data'
output_file = f'{output_folder}/final_data_BERT.json'

# Assuming FINAL is a dictionary containing the data
final_data = FINAL

# Serialise the span/answer dictionary as a single JSON document.
with open(output_file, 'w') as f:
    json.dump(final_data, f)

print(f'Final data saved as JSON file: {output_file}')


Final data saved as JSON file: /content/drive/MyDrive/NCU/Data Science and Machine Learning/Final Project/Data/final_data_BERT.json
