In [1]:
"""
This notebook uses approach from paper: https://arxiv.org/abs/2106.12978
"""

'\nThis notebook uses approach from paper: https://arxiv.org/abs/2106.12978\n'

In [2]:
import time
import numpy as np
import pandas as pd
import torch

from sentence_transformers import SentenceTransformer

PARALLEL_INFERENCE_INSTANCES = 20
model = SentenceTransformer('all-mpnet-base-v2')

from enum import Enum
from typing import NamedTuple, Optional


class TopicSegmentationAlgorithm(Enum):
    RANDOM = 0
    EVEN = 1
    BERT = 2
    SBERT = 3


class TextTilingHyperparameters(NamedTuple):
    SENTENCE_COMPARISON_WINDOW: int = 15
    SMOOTHING_PASSES: int = 2
    SMOOTHING_WINDOW: int = 1
    TOPIC_CHANGE_THRESHOLD: float = 0.6


class TopicSegmentationConfig(NamedTuple):
    TEXT_TILING: Optional[TextTilingHyperparameters] = None
    MAX_SEGMENTS_CAP: bool = True
    MAX_SEGMENTS_CAP__AVERAGE_SEGMENT_LENGTH: int = 60


def PrintMessage(msg, x):
    print(msg)
    print(x)


def depth_score(timeseries):
    """
    The depth score corresponds to how strongly the cues for a subtopic changed on both sides of a
    given token-sequence gap and is based on the distance from the peaks on both sides of the valleyto that valley.

    returns depth_scores
    """
    depth_scores = []
    for i in range(1, len(timeseries) - 1):
        left, right = i - 1, i + 1
        while left > 0 and timeseries[left - 1] > timeseries[left]:
            left -= 1
        while (
                right < (len(timeseries) - 1) and timeseries[right + 1] > timeseries[right]
        ):
            right += 1
        depth_scores.append(
            (timeseries[right] - timeseries[i]) + (timeseries[left] - timeseries[i])
        )
    return depth_scores


def smooth(timeseries, n, s):
    smoothed_timeseries = timeseries[:]
    for _ in range(n):
        for index in range(len(smoothed_timeseries)):
            neighbours = smoothed_timeseries[
                         max(0, index - s): min(len(timeseries) - 1, index + s)
                         ]
            smoothed_timeseries[index] = sum(neighbours) / len(neighbours)
    return smoothed_timeseries


def sentences_similarity(first_sentence_features, second_sentence_features) -> float:
    """
    Given two senteneces embedding features compute cosine similarity
    """
    similarity_metric = torch.nn.CosineSimilarity()
    return float(similarity_metric(first_sentence_features, second_sentence_features))


def compute_window(timeseries, start_index, end_index):
    """given start and end index of embedding, compute pooled window value

    [window_size, 768] -> [1, 768]
    """
    stack = torch.stack(timeseries[start_index:end_index])
    stack = stack.unsqueeze(
        0
    )  # https://jbencook.com/adding-a-dimension-to-a-tensor-in-pytorch/
    stack_size = end_index - start_index
    pooling = torch.nn.MaxPool2d((stack_size - 1, 1))
    return pooling(stack)


def block_comparison_score(timeseries, k):
    """
    comparison score for a gap (i)

    cfr. docstring of block_comparison_score
    """
    res = []
    for i in range(k, len(timeseries) - k):
        first_window_features = compute_window(timeseries, i - k, i + 1)
        second_window_features = compute_window(timeseries, i + 1, i + k + 2)
        res.append(
            sentences_similarity(first_window_features[0], second_window_features[0])
        )

    return res


def get_features_from_sentence(batch_sentences, layer=-2):
    """
    extracts the BERT semantic representation
    from a sentence, using an averaged value of
    the `layer`-th layer

    returns a 1-dimensional tensor of size 758
    """

    return model.encode(batch_sentences, convert_to_numpy=False)


def arsort2(array1, array2):
    x = np.array(array1)
    y = np.array(array2)

    sorted_idx = x.argsort()[::-1]
    return x[sorted_idx], y[sorted_idx]


def get_local_maxima(array):
    local_maxima_indices = []
    local_maxima_values = []
    for i in range(1, len(array) - 1):
        if array[i - 1] < array[i] and array[i] > array[i + 1]:
            local_maxima_indices.append(i)
            local_maxima_values.append(array[i])
    return local_maxima_indices, local_maxima_values


def depth_score_to_topic_change_indexes(
        depth_score_timeseries,
        meeting_duration=60 * 3600,
        topic_segmentation_configs=TopicSegmentationConfig,
):
    """
    capped add a max segment limit so there are not too many segments, used for UI improvements on the Workplace TeamWork product
    """

    capped = topic_segmentation_configs.MAX_SEGMENTS_CAP
    average_segment_length = (
        topic_segmentation_configs.MAX_SEGMENTS_CAP__AVERAGE_SEGMENT_LENGTH
    )
    threshold = topic_segmentation_configs.TEXT_TILING.TOPIC_CHANGE_THRESHOLD * max(
        depth_score_timeseries
    )

    # print("DEPTH_SCORE_TIMESERIES:")
    # print(list(depth_score_timeseries))

    if depth_score_timeseries == []:
        return []

    local_maxima_indices, local_maxima = get_local_maxima(depth_score_timeseries)

    if local_maxima == []:
        return []

    if capped:  # capped is segmentation used for UI
        # sort based on maxima for pruning
        local_maxima, local_maxima_indices = arsort2(local_maxima, local_maxima_indices)

        # local maxima are sorted by depth_score value and we take only the first K
        # where the K+1th local maxima is lower then the threshold
        for thres in range(len(local_maxima)):
            if local_maxima[thres] <= threshold:
                break

        max_segments = int(meeting_duration / average_segment_length)
        slice_length = min(max_segments, thres)

        local_maxima_indices = local_maxima_indices[:slice_length]
        local_maxima = local_maxima[:slice_length]

        # after pruning, sort again based on indices for chronological ordering
        local_maxima_indices, _ = arsort2(local_maxima_indices, local_maxima)

    else:  # this is the vanilla TextTiling used for Pk optimization
        filtered_local_maxima_indices = []
        filtered_local_maxima = []

        for i, m in enumerate(local_maxima):
            if m > threshold:
                filtered_local_maxima.append(m)
                filtered_local_maxima_indices.append(local_maxima_indices[i])

        local_maxima = filtered_local_maxima
        local_maxima_indices = filtered_local_maxima_indices

    # print("LOCAL_MAXIMA_INDICES:")
    # print(list(local_maxima_indices))

    return local_maxima_indices


def get_timeseries(caption_indexes, features):
    timeseries = []
    for caption_index in caption_indexes:
        timeseries.append(features[caption_index])
    return timeseries


def flatten_features(batches_features):
    res = []
    for batch_features in batches_features:
        res += batch_features
    return res


def split_list(a, n):
    k, m = divmod(len(a), n)
    return (
        a[i * k + min(i, m): (i + 1) * k + min(i + 1, m)]
        for i in range(min(len(a), n))
    )


def topic_segmentation(
        topic_segmentation_algorithm: TopicSegmentationAlgorithm,
        df: pd.DataFrame,
        meeting_id_col_name: str,
        start_col_name: str,
        end_col_name: str,
        caption_col_name: str,
        topic_segmentation_config: TopicSegmentationConfig,
):
    """
    Input:
        df: dataframe with meeting captions
    Output:
        {meeting_id: [list of topic change indexes]}
    """

    if topic_segmentation_algorithm == TopicSegmentationAlgorithm.BERT:
        return topic_segmentation_bert(
            df,
            meeting_id_col_name,
            start_col_name,
            end_col_name,
            caption_col_name,
            topic_segmentation_config,
        )
    elif topic_segmentation_algorithm == TopicSegmentationAlgorithm.RANDOM:
        return topic_segmentation_random(
            df, meeting_id_col_name, start_col_name, end_col_name, caption_col_name
        )
    elif topic_segmentation_algorithm == TopicSegmentationAlgorithm.EVEN:
        return topic_segmentation_even(
            df, meeting_id_col_name, start_col_name, end_col_name, caption_col_name
        )
    else:
        return topic_segmentation_bert(
            df,
            meeting_id_col_name,
            start_col_name,
            end_col_name,
            caption_col_name,
            topic_segmentation_config,
        )


default_meeting_id_column = 'meeting_id'
default_st_column = 'st'
default_en_column = 'en'
default_caption_column = 'caption'


def topic_segmentation_bert(
        df: pd.DataFrame,
        meeting_id_col_name: str = default_meeting_id_column,
        start_col_name: str = default_st_column,
        end_col_name: str = default_en_column,
        caption_col_name: str = default_caption_column,
        topic_segmentation_configs: TopicSegmentationConfig = TopicSegmentationConfig(),
):
    textiling_hyperparameters = topic_segmentation_configs.TEXT_TILING

    # parallel inference
    features = get_features_from_sentence(df[caption_col_name])

    # meeting_id -> list of topic change start times
    segments = {}
    task_idx = 0
    print("meeting_id -> task_idx")
    for meeting_id in set(df[meeting_id_col_name]):
        print("%s -> %d" % (meeting_id, task_idx))
        task_idx += 1

        meeting_data = df[df[meeting_id_col_name] == meeting_id]
        caption_indexes = list(meeting_data.index)

        timeseries = get_timeseries(caption_indexes, features)
        block_comparison_score_timeseries = block_comparison_score(
            timeseries, k=textiling_hyperparameters.SENTENCE_COMPARISON_WINDOW
        )

        block_comparison_score_timeseries = smooth(
            block_comparison_score_timeseries,
            n=textiling_hyperparameters.SMOOTHING_PASSES,
            s=textiling_hyperparameters.SMOOTHING_WINDOW,
        )

        depth_score_timeseries = depth_score(block_comparison_score_timeseries)

        meeting_start_time = meeting_data[start_col_name].iloc[0]
        meeting_end_time = meeting_data[end_col_name].iloc[-1]
        meeting_duration = meeting_end_time - meeting_start_time
        segments[meeting_id] = depth_score_to_topic_change_indexes(
            depth_score_timeseries,
            meeting_duration,
            topic_segmentation_configs=topic_segmentation_configs,
        )

    return segments


from random import random

import pandas as pd


def topic_segmentation_random(
        df: pd.DataFrame,
        meeting_id_col_name: str,
        start_col_name: str,
        end_col_name: str,
        caption_col_name: str,
        random_threshold: float = 0.9,
):
    # meeting_id -> list of topic change start times
    segments = {}
    task_idx = 0
    print("meeting_id -> task_idx")
    for meeting_id in set(df[meeting_id_col_name]):
        print("%s -> %d" % (meeting_id, task_idx))
        task_idx += 1

        meeting_data = df[df[meeting_id_col_name] == meeting_id]
        meeting_start_times = meeting_data[start_col_name]
        random_segmentation = []
        for i, _ in enumerate(meeting_start_times):
            if random() > random_threshold:
                random_segmentation.append(i)
        print(random_segmentation)
        segments[meeting_id] = random_segmentation
    return segments


def topic_segmentation_even(
        df: pd.DataFrame,
        meeting_id_col_name: str,
        start_col_name: str,
        end_col_name: str,
        caption_col_name: str,
):
    # meeting_id -> list of topic change start times
    segments = {}
    task_idx = 0
    print("meeting_id -> task_idx")
    for meeting_id in set(df[meeting_id_col_name]):
        print("%s -> %d" % (meeting_id, task_idx))
        task_idx += 1

        meeting_data = df[df[meeting_id_col_name] == meeting_id]
        meeting_start_times = meeting_data[start_col_name]
        even_segmentation = []
        for i, _ in enumerate(meeting_start_times):
            if i % 30 == 0:
                even_segmentation.append(i)
        print(even_segmentation)
        segments[meeting_id] = even_segmentation
    return segments


In [3]:
#!/usr/bin/env python3
import logging
from bisect import bisect
from typing import Dict
import pandas as pd
from nltk.metrics.segmentation import pk, windowdiff


def compute_metrics(prediction_segmentations, binary_labels, metric_name_suffix=""):
    print(prediction_segmentations)
    indices = {k: [i for i, v in enumerate(binary_labels[k]) if v == 1] for k in binary_labels.keys()}
    print(f'expected: {indices}')
    _pk, _windiff = [], []
    for meeting_id, reference_segmentation in binary_labels.items():

        predicted_segmentation_indexes = prediction_segmentations[meeting_id]
        # we need to convert from topic changes indexes to topic changes binaries
        predicted_segmentation = [0] * len(reference_segmentation)
        for topic_change_index in predicted_segmentation_indexes:
            predicted_segmentation[topic_change_index] = 1

        reference_segmentation = "".join(map(str, reference_segmentation))
        predicted_segmentation = "".join(map(str, predicted_segmentation))

        _pk.append(pk(reference_segmentation, predicted_segmentation))

        # setting k to default value used in CoAP (pk) function for both evaluation functions
        k = int(
            round(
                len(reference_segmentation) / (reference_segmentation.count("1") * 2.0)
            )
        )
        _windiff.append(windowdiff(reference_segmentation, predicted_segmentation, k))

    avg_pk = sum(_pk) / len(binary_labels)
    avg_windiff = sum(_windiff) / len(binary_labels)

    print("Pk on {} meetings: {}".format(len(binary_labels), avg_pk))
    print("WinDiff on {} meetings: {}".format(len(binary_labels), avg_windiff))

    return {
        "average_Pk_" + str(metric_name_suffix): avg_pk,
        "average_windiff_" + str(metric_name_suffix): avg_windiff,
    }


def binary_labels_flattened(
        input_df,
        labels_df,
        meeting_id_col_name: str,
        start_col_name: str,
        end_col_name: str,
        caption_col_name: str,
):
    """
    Binary Label [0, 0, 1, 0] for topic changes as ntlk format.
    Hierarchical topic strutcure flattened.
    see https://www.XXXX.com/intern/anp/view/?id=434543
    """
    labels_flattened = {}
    meeting_ids = list(set(input_df[meeting_id_col_name]))

    for meeting_id in meeting_ids:
        logging.info("\n\nMEETING ID:{}".format(meeting_id))

        if meeting_id not in list(labels_df[meeting_id_col_name]):
            logging.info("{} not found in `labels_df`".format(meeting_id))
            continue

        meeting_data = input_df[
            input_df[meeting_id_col_name] == meeting_id
            ].sort_values(by=[start_col_name])
        meeting_sentences = [*map(lambda s: s.lower(), list(meeting_data["caption"]))]

        caption_start_times = list(meeting_data[start_col_name])
        segment_start_times = list(
            labels_df[labels_df[meeting_id_col_name] == meeting_id][start_col_name]
        )

        meeting_labels_flattened = [0] * len(caption_start_times)

        # we skip first and last labaled segment cause they are naive segments
        for sst in segment_start_times[1:]:
            try:
                topic_change_index = caption_start_times.index(sst)
            except ValueError:
                topic_change_index = bisect(caption_start_times, sst)
                if topic_change_index == len(meeting_labels_flattened):
                    topic_change_index -= 1  # bisect my go out of boundary
            meeting_labels_flattened[topic_change_index] = 1

        labels_flattened[meeting_id] = meeting_labels_flattened

        logging.info("MEETING TRANSCRIPTS")
        for i, sentence in enumerate(meeting_sentences):
            if meeting_labels_flattened[i] == 1:
                logging.info("\n\n<<------ Topic Change () ------>>\n")
            logging.info(sentence)

    return labels_flattened


def binary_labels_top_level(
        input_df,
        labels_df,
        meeting_id_col_name: str,
        start_col_name: str,
        end_col_name: str,
        caption_col_name: str,
):
    """
    Binary Label [0, 0, 1, 0] for topic changes as ntlk format.
    Hierarchical topic strutcure only top level topics
    see https://www.XXXX.com/intern/anp/view/?id=434543
    """
    labels_top_level = {}
    meeting_ids = list(set(input_df[meeting_id_col_name]))

    for meeting_id in meeting_ids:
        logging.info("\n\nMEETING ID:{}".format(meeting_id))

        if meeting_id not in list(labels_df[meeting_id_col_name]):
            logging.info("{} not found in `labels_df`".format(meeting_id))
            continue

        meeting_data = input_df[
            input_df[meeting_id_col_name] == meeting_id
            ].sort_values(by=[start_col_name])
        meeting_sentences = [*map(lambda s: s.lower(), list(meeting_data["caption"]))]

        caption_start_times = list(meeting_data[start_col_name])
        segment_start_times = list(
            labels_df[labels_df[meeting_id_col_name] == meeting_id][start_col_name]
        )
        segment_end_times = list(
            labels_df[labels_df[meeting_id_col_name] == meeting_id][end_col_name]
        )

        meeting_labels_top_level = [0] * len(caption_start_times)

        high_level_topics_indexes = []
        i = 0
        while i < len(segment_end_times):
            end = segment_end_times[i]
            high_level_topics_indexes.append(i)
            if segment_end_times.count(end) == 2:
                # skip all the subtopics of this high level topic
                i = (
                        segment_end_times.index(end)
                        + segment_end_times[segment_end_times.index(end) + 1:].index(end)
                        + 2
                )
            else:
                i += 1

        segment_start_times_high_level = [
            segment_start_times[i] for i in high_level_topics_indexes
        ]

        # we skip first and last labaled segment cause they are naive segments
        for sst in segment_start_times_high_level[1:]:
            try:
                topic_change_index = caption_start_times.index(sst)
            except ValueError:
                topic_change_index = bisect(caption_start_times, sst)
                if topic_change_index == len(meeting_labels_top_level):
                    topic_change_index -= 1  # bisect my go out of boundary
            meeting_labels_top_level[topic_change_index] = 1

        labels_top_level[meeting_id] = meeting_labels_top_level

        logging.info("MEETING TRANSCRIPTS")
        for i, sentence in enumerate(meeting_sentences):
            if meeting_labels_top_level[i] == 1:
                logging.info("\n\n<<------ Topic Change () ------>>\n")
            logging.info(sentence)

    return labels_top_level


MEETING_ID_COL_NAME = "meeting_id"
START_COL_NAME = "st"
EN_COL_NAME = "en"
CAPTION_COL_NAME = "caption"


def eval_topic_segmentation(
        input_df: pd.DataFrame,
        label_df: pd.DataFrame,
        topic_segmentation_algorithm: TopicSegmentationAlgorithm,
        topic_segmentation_config: TopicSegmentationConfig,
) -> Dict[str, float]:
    prediction_segmentations = topic_segmentation(
        topic_segmentation_algorithm,
        input_df,
        MEETING_ID_COL_NAME,
        START_COL_NAME,
        EN_COL_NAME,
        CAPTION_COL_NAME,
        topic_segmentation_config,
    )

    flattened = binary_labels_flattened(
        input_df,
        label_df,
        MEETING_ID_COL_NAME,
        START_COL_NAME,
        EN_COL_NAME,
        CAPTION_COL_NAME,
    )

    top_level = binary_labels_top_level(
        input_df,
        label_df,
        MEETING_ID_COL_NAME,
        START_COL_NAME,
        EN_COL_NAME,
        CAPTION_COL_NAME,
    )

    flattened_metrics = compute_metrics(
        prediction_segmentations, flattened, metric_name_suffix="flattened"
    )
    top_level_metrics = compute_metrics(
        prediction_segmentations, top_level, metric_name_suffix="top_level"
    )

    def merge_metrics(*metrics):
        res = {}
        for m in metrics:
            for k, v in m.items():
                res[k] = v
        return res

    return merge_metrics(flattened_metrics, top_level_metrics)


In [4]:
import sys
import pandas as pd


def preprocessing(df, caption_col_name):
    fillers = ["um", "uh", "oh", "hmm", "you know", "like"]
    fillers += list(
        map(lambda filler: filler + " ", fillers)
    )  # filler inside caption with other words
    fillers = list(
        map(lambda filler: "(?i)" + filler, fillers)
    )  # make it case-insensitive
    df[caption_col_name].replace(fillers, [""] * len(fillers), regex=True, inplace=True)

    captions_with_multiple_setences = len(df.loc[df[caption_col_name].isin(["."])])
    if captions_with_multiple_setences > 0:
        print(
            f"WARNING: Found {captions_with_multiple_setences} captions with multiple sentences; sentence embeddings may be inaccurate.",
            file=sys.stderr,
        )

    df = df[df[caption_col_name].str.len() > 20]
    df.reset_index(inplace=True)

    return df


def icsi_dataset():
    pass


def ami_dataset():
    """See XXXX for label generation and XXXX for input analysis

            SELECT
                fb_meeting_id AS meeting_id,
                st,
                en,
                caption,
                speaker
            FROM {ami}
            WHERE ds = '2021-01-12'

            SELECT
                fb_meeting_id AS meeting_id,
                st,
                en,
                topic
            FROM {labels}
            WHERE ds = '2021-01-10'
    """
    train = pd.read_csv('data/train_ami.csv')
    test = pd.read_csv('data/test_ami.csv')
    # train = train[train['meeting_id'] < 1]
    # test = test[test['meeting_id'] < 1]
    train = preprocessing(train, 'caption')
    return train, test


def measure(f):
    start_time = time.perf_counter()
    f()
    end_time = time.perf_counter()

    elapsed_time = end_time - start_time
    print("Elapsed time:", elapsed_time)

In [5]:
# def test_ami():
#     input_df, label_df = ami_dataset()
#     return eval_topic_segmentation(
#         input_df,
#         label_df,
#         TopicSegmentationAlgorithm.SBERT,
#         TopicSegmentationConfig(TextTilingHyperparameters()),
#     )
#
# measure(test_ami)

In [6]:
"""
Pk on 20 meetings: 0.42860887474411713
WinDiff on 20 meetings: 0.591437437455864
"""

'\nPk on 20 meetings: 0.42860887474411713\nWinDiff on 20 meetings: 0.591437437455864\n'

In [34]:
import pandas as pd
import re

def convert_time_to_seconds(time_str):
    time_parts = time_str.split(':')
    hours = 0
    minutes = 0
    seconds = 0
    milliseconds = 0

    if len(time_parts) == 2:
        minutes, seconds = map(float, time_parts)
    elif len(time_parts) == 3:
        hours, minutes, seconds = map(float, time_parts)

    seconds += (hours * 3600) + (minutes * 60)

    if '.' in str(seconds):
        seconds, milliseconds = map(int, str(seconds).split('.'))

    return (seconds + milliseconds/1000)


def vvt_to_df(file_path):
    with open(file_path, 'r') as f:
        data = f.read()
    meeting_id = file_path  # assuming meeting ID is in the file name and just an integer
    utterances = re.findall(r'(\d+:\d+.\d+).+?(\d+:\d+.\d+)\n(.*?)\n\n', data, re.DOTALL)
    result = []
    utterance_id = 1
    for utterance in utterances:
        st, en, caption = utterance
        st_sec = convert_time_to_seconds(st)
        en_sec = convert_time_to_seconds(en)
        result.append(
            {'meeting_id': meeting_id, 'st': st_sec, 'en': en_sec, 'caption': caption, 'speaker': utterance_id})
        utterance_id += 1
    return pd.DataFrame(result)

In [35]:
file_path = './data/episode_001_large.vtt'
df = vvt_to_df(file_path)
print(df.head())
segments = topic_segmentation_bert(
    df=df,
    topic_segmentation_configs=TopicSegmentationConfig(TextTilingHyperparameters(TOPIC_CHANGE_THRESHOLD=0.5))
)
segments

                     meeting_id      st      en  \
0  ./data/episode_001_large.vtt   0.000   4.002   
1  ./data/episode_001_large.vtt   4.002   6.006   
2  ./data/episode_001_large.vtt   6.006   8.068   
3  ./data/episode_001_large.vtt   8.068  11.092   
4  ./data/episode_001_large.vtt  11.092  16.096   

                                             caption  speaker  
0   As part of MIT course 6S099, Artificial Gener...        1  
1   I've gotten the chance to sit down with Max T...        2  
2                     He is a professor here at MIT.        3  
3   He's a physicist, spent a large part of his c...        4  
4   studying the mysteries of our cosmological un...        5  
meeting_id -> task_idx
./data/episode_001_large.vtt -> 0


{'./data/episode_001_large.vtt': array([1623, 1603, 1552, 1514, 1415, 1385, 1162,  804,  569,  416,  381,
         339,   33,   16])}

In [36]:
from transformers import pipeline

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", max_length=2048, truncation=True)

# Assuming the list of indices is stored in a variable named topic_indices
topic_indices = segments

# Create an empty list to store the topics
topics = []

def convert_seconds_to_time(seconds):
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds % 1) * 1000)
    seconds = int(seconds)
    return f"{hours}:{minutes:02}:{seconds:02}.{milliseconds:03}"

# Iterate over each file and its corresponding indices
for file_path, indices in topic_indices.items():

    # Sort the indices in ascending order
    indices = sorted(indices)

    # Add 0 as the starting index
    indices = [0] + indices

    # Iterate over each pair of indices
    for i in range(len(indices) - 1):
        # Extract the rows of the dataframe between the two indices
        start_index = indices[i]
        end_index = indices[i + 1]
        df_subset = df[(df['meeting_id'] == file_path) & (df.index >= start_index) & (df.index < end_index)]

        start_time = convert_seconds_to_time(df_subset.loc[start_index, 'st'])
        end_time = convert_seconds_to_time(df_subset.loc[end_index - 1, 'en'])
        # Combine the captions into a single string separated by a newline character
        topic_caption = '\n'.join(df_subset['caption'].tolist())
        summary = summarizer(topic_caption, max_length=30, min_length=1, do_sample=False)[0]['summary_text']
        # summary = ""

        # Append the topic to the list of topics
        topics.append({'meeting_id': file_path,
                       'start_index': start_index,
                       'end_index': end_index,
                       'summary': summary,
                       'start_time': start_time,
                       'end_time': end_time,
                       'topic': topic_caption})

# Create a new dataframe with the topics
topics_df = pd.DataFrame(topics)
topics_df.head()

Unnamed: 0,meeting_id,start_index,end_index,summary,start_time,end_time,topic
0,./data/episode_001_large.vtt,0,16,,0:00:00.000,0:00:47.097,"As part of MIT course 6S099, Artificial Gener..."
1,./data/episode_001_large.vtt,16,33,,0:00:47.097,0:01:36.016,please subscribe and also click the little be...
2,./data/episode_001_large.vtt,33,339,,0:01:36.016,0:15:34.056,"radio frequency interference, RFI, look it up..."
3,./data/episode_001_large.vtt,339,381,,0:15:34.056,0:17:19.016,or would you prefer that it's actually\n not ...
4,./data/episode_001_large.vtt,381,416,,0:17:19.016,0:18:47.011,"oh, these animals can't feel pain.\n It's oka..."


In [37]:
topics_df.to_csv('./output_topics.csv')