#### The code is an implemmentation of the paper [Unsupervised Topic Segmentation of Meetings with BERT Embeddings](https://arxiv.org/pdf/2106.12978.pdf) and is taken from the author's original [implemmentation](https://github.com/gdamaskinos/unsupervised_topic_segmentation)
# TODO
- [x] Incorporate bug-fixes to the original code 
- [x] Make sure it works end-to-end
- [x] Change RoBERTa encoding to an up-to-date version 
- [ ] Use faster sentence embeddings from the sentence-transformer package - [Link](https://github.com/UKPLab/sentence-transformers)
- [ ] Make sure the paper's metrics are reproducable 
- [ ] Try multilingual models' embeddings that potentially could work on the Russian and other languages
- [ ] Make the output usable

In [None]:
!pip install transformers bitarray hydra-core omegaconf

In [None]:
# types.py  -> https://github.com/gdamaskinos/unsupervised_topic_segmentation/blob/main/types.py

from enum import Enum
from typing import NamedTuple, Optional

class TopicSegmentationAlgorithm(Enum):
    RANDOM = 0
    EVEN = 1
    BERT = 2
    SBERT = 3

class TextTilingHyperparameters(NamedTuple):
    SENTENCE_COMPARISON_WINDOW: int = 15
    SMOOTHING_PASSES: int = 2
    SMOOTHING_WINDOW: int = 1
    TOPIC_CHANGE_THRESHOLD: float = 0.6

class TopicSegmentationConfig(NamedTuple):
    TEXT_TILING: Optional[TextTilingHyperparameters] = TextTilingHyperparameters()
    MAX_SEGMENTS_CAP: bool = True
    MAX_SEGMENTS_CAP__AVERAGE_SEGMENT_LENGTH: int = 60

In [None]:
# dataset.py  -> https://github.com/gdamaskinos/unsupervised_topic_segmentation/blob/main/dataset.py

## COMPLETELY OVERWRITTEN! 
## SEE HOW THE DATASETS WERE MADE: https://colab.research.google.com/drive/1k3d_8jgj_IBDjixpdV-SRNxm0sfKBDjp?usp=sharing
## (don't believe anything, check for nonsense)

!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1jPKupuybeZz8iuuVi0TE8hckIF8gtheN' -O ami_topic_captions.csv
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1w4OsaOsM__i_vpejz6pZzGSt4Kno3vzv' -O ami_topic_labels.csv 

## No prepocessing was done, should be????

import pandas as pd

input_df = pd.read_csv('ami_topic_captions.csv')
labels_df = pd.read_csv('ami_topic_labels.csv')

In [None]:
input_df.head()

Unnamed: 0,id,text,label,starttime,endtime
0,IS1000c,Right first time this time .,0,12.73,14.09
1,IS1000c,Nu,0,16.66,16.96
2,IS1000c,There we go .,0,20.55,20.93
3,IS1000c,"It's not that complicated ,",0,24.38,25.34
4,IS1000c,but I get it wrong every time .,0,25.34,26.72


In [None]:
# baselines.py  -> https://github.com/gdamaskinos/unsupervised_topic_segmentation/blob/main/baselines.py

## KEPT ONLY EVEN SEGMENTATION (more reproducible)

def topic_segmentation_even(
    df: pd.DataFrame,
    meeting_id_col_name: str,
    start_col_name: str,
    end_col_name: str,
    caption_col_name: str,
):

    # meeting_id -> list of topic change start times
    segments = {}
    task_idx = 0
    print("meeting_id -> task_idx")
    for meeting_id in set(df[meeting_id_col_name]):
        print("%s -> %d" % (meeting_id, task_idx))
        task_idx += 1

        meeting_data = df[df[meeting_id_col_name] == meeting_id]
        meeting_start_times = meeting_data[start_col_name]
        even_segmentation = []
        for i, _ in enumerate(meeting_start_times):
            if i % 30 == 0:
                even_segmentation.append(i)
        print(even_segmentation)
        segments[meeting_id] = even_segmentation
    return segments

In [None]:
# eval.py  -> https://github.com/gdamaskinos/unsupervised_topic_segmentation/blob/main/eval.py

import logging
from bisect import bisect
from typing import Dict
from nltk.metrics.segmentation import pk, windowdiff


def compute_metrics(prediction_segmentations, binary_labels, metric_name_suffix=""):
    print(prediction_segmentations)
    print(binary_labels)
    _pk, _windiff = [], []
    for meeting_id, reference_segmentation in binary_labels.items():

        predicted_segmentation_indexes = prediction_segmentations[meeting_id]
        # we need to convert from topic changes indexes to topic changes binaries
        predicted_segmentation = [0] * len(reference_segmentation)
        for topic_change_index in predicted_segmentation_indexes:
            predicted_segmentation[topic_change_index] = 1

        reference_segmentation = "".join(map(str, reference_segmentation))
        predicted_segmentation = "".join(map(str, predicted_segmentation))

        _pk.append(pk(reference_segmentation, predicted_segmentation))

        # setting k to default value used in CoAP (pk) function for both evaluation functions
        k = int(
            round(
                len(reference_segmentation) / (reference_segmentation.count("1") * 2.0)
            )
        )
        _windiff.append(windowdiff(reference_segmentation, predicted_segmentation, k))

    avg_pk = sum(_pk) / len(binary_labels)
    avg_windiff = sum(_windiff) / len(binary_labels)

    print("Pk on {} meetings: {}".format(len(binary_labels), avg_pk))
    print("WinDiff on {} meetings: {}".format(len(binary_labels), avg_windiff))

    return {
        "average_Pk_" + str(metric_name_suffix): avg_pk,
        "average_windiff_" + str(metric_name_suffix): avg_windiff,
    }


def eval_topic_segmentation(
    input_df, label_df,
    topic_segmentation_algorithm: TopicSegmentationAlgorithm,
    topic_segmentation_config: TopicSegmentationConfig,
    meeting_id_col_name = "id", 
    start_col_name = "starttime", 
    end_col_name = "endtime", 
    caption_col_name="text"
) -> Dict[str, float]:

    prediction_segmentations = topic_segmentation(
        topic_segmentation_algorithm,
        input_df,
        meeting_id_col_name,
        start_col_name,
        end_col_name,
        caption_col_name,
        topic_segmentation_config,
    )

    flattened = binary_labels_flattened(
        input_df,
        label_df,
        meeting_id_col_name,
        start_col_name,
        end_col_name,
        caption_col_name,
    )

    top_level = binary_labels_top_level(
        input_df,
        label_df,
        meeting_id_col_name,
        start_col_name,
        end_col_name,
        caption_col_name,
    )

    flattened_metrics = compute_metrics(
        prediction_segmentations, flattened, metric_name_suffix="flattened"
    )
    top_level_metrics = compute_metrics(
        prediction_segmentations, top_level, metric_name_suffix="top_level"
    )

    def merge_metrics(*metrics):
        res = {}
        for m in metrics:
            for k, v in m.items():
                res[k] = v
        return res

    return merge_metrics(flattened_metrics, top_level_metrics)


def binary_labels_top_level(
    input_df,
    labels_df,
    meeting_id_col_name: str,
    start_col_name: str,
    end_col_name: str,
    caption_col_name: str,
):
    """
    Binary Label [0, 0, 1, 0] for topic changes as ntlk format.
    Hierarchical topic strutcure only top level topics
    see https://www.XXXX.com/intern/anp/view/?id=434543
    """
    labels_top_level = {}
    meeting_ids = list(set(input_df[meeting_id_col_name]))

    for meeting_id in meeting_ids:
        logging.info("\n\nMEETING ID:{}".format(meeting_id))

        if meeting_id not in list(labels_df[meeting_id_col_name]):
            logging.info("{} not found in `labels_df`".format(meeting_id))
            continue

        meeting_data = input_df[
            input_df[meeting_id_col_name] == meeting_id
        ].sort_values(by=[start_col_name])
        meeting_sentences = [*map(lambda s: s.lower(), list(meeting_data[caption_col_name]))]

        caption_start_times = list(meeting_data[start_col_name])
        segment_start_times = list(
            labels_df[labels_df[meeting_id_col_name] == meeting_id][start_col_name]
        )
        segment_end_times = list(
            labels_df[labels_df[meeting_id_col_name] == meeting_id][end_col_name]
        )

        meeting_labels_top_level = [0] * len(caption_start_times)

        high_level_topics_indexes = []
        i = 0
        while i < len(segment_end_times):
            end = segment_end_times[i]
            high_level_topics_indexes.append(i)
            if segment_end_times.count(end) == 2:
                # skip all the subtopics of this high level topic
                i = (
                    segment_end_times.index(end)
                    + segment_end_times[segment_end_times.index(end) + 1 :].index(end)
                    + 2
                )
            else:
                i += 1

        segment_start_times_high_level = [
            segment_start_times[i] for i in high_level_topics_indexes
        ]

        # we skip first and last labaled segment cause they are naive segments
        for sst in segment_start_times_high_level[1:]:
            try:
                topic_change_index = caption_start_times.index(sst)
            except ValueError:
                topic_change_index = bisect(caption_start_times, sst)
                if topic_change_index == len(meeting_labels_top_level):
                    topic_change_index -= 1  # bisect my go out of boundary
            meeting_labels_top_level[topic_change_index] = 1

        labels_top_level[meeting_id] = meeting_labels_top_level

        logging.info("MEETING TRANSCRIPTS")
        for i, sentence in enumerate(meeting_sentences):
            if meeting_labels_top_level[i] == 1:
                logging.warning("\n\n<<------ Topic Change () ------>>\n")
            logging.info(sentence)

    return labels_top_level


def binary_labels_flattened(
    input_df,
    labels_df,
    meeting_id_col_name: str,
    start_col_name: str,
    end_col_name: str,
    caption_col_name: str,
):
    """
    Binary Label [0, 0, 1, 0] for topic changes as ntlk format.
    Hierarchical topic strutcure flattened.
    see https://www.XXXX.com/intern/anp/view/?id=434543
    """
    labels_flattened = {}
    meeting_ids = list(set(input_df[meeting_id_col_name]))

    for meeting_id in meeting_ids:
        logging.info("\n\nMEETING ID:{}".format(meeting_id))

        if meeting_id not in list(labels_df[meeting_id_col_name]):
            logging.info("{} not found in `labels_df`".format(meeting_id))
            continue

        meeting_data = input_df[
            input_df[meeting_id_col_name] == meeting_id
        ].sort_values(by=[start_col_name])
        meeting_sentences = [*map(lambda s: s.lower(), list(meeting_data[caption_col_name]))]

        caption_start_times = list(meeting_data[start_col_name])
        segment_start_times = list(
            labels_df[labels_df[meeting_id_col_name] == meeting_id][start_col_name]
        )

        meeting_labels_flattened = [0] * len(caption_start_times)

        # we skip first and last labaled segment cause they are naive segments
        for sst in segment_start_times[1:]:
            try:
                topic_change_index = caption_start_times.index(sst)
            except ValueError:
                topic_change_index = bisect(caption_start_times, sst)
                if topic_change_index == len(meeting_labels_flattened):
                    topic_change_index -= 1  # bisect my go out of boundary
            meeting_labels_flattened[topic_change_index] = 1

        labels_flattened[meeting_id] = meeting_labels_flattened

        logging.info("MEETING TRANSCRIPTS")
        for i, sentence in enumerate(meeting_sentences):
            if meeting_labels_flattened[i] == 1:
                logging.warning("\n\n<<------ Topic Change () ------>>\n")
            logging.info(sentence)

    return labels_flattened

In [None]:
def topic_segmentation(
    topic_segmentation_algorithm: TopicSegmentationAlgorithm,
    df: pd.DataFrame,
    meeting_id_col_name: str,
    start_col_name: str,
    end_col_name: str,
    caption_col_name: str,
    topic_segmentation_config: TopicSegmentationConfig,
):
    """
    Input:
        df: dataframe with meeting captions
    Output:
        {meeting_id: [list of topic change indexes]}
    """

    if topic_segmentation_algorithm == TopicSegmentationAlgorithm.BERT:
        return topic_segmentation_bert(
            df,
            meeting_id_col_name,
            start_col_name,
            end_col_name,
            caption_col_name,
            topic_segmentation_config,
        )
    elif topic_segmentation_algorithm == TopicSegmentationAlgorithm.RANDOM:
        return topic_segmentation_random(
            df, meeting_id_col_name, start_col_name, end_col_name, caption_col_name
        )
    elif topic_segmentation_algorithm == TopicSegmentationAlgorithm.EVEN:
        return topic_segmentation_even(
            df, meeting_id_col_name, start_col_name, end_col_name, caption_col_name
        )
    else:
        raise NotImplementedError("Algorithm not implemented")

In [None]:
# everything above should be enough to run the EVEN segmentaion

eval_topic_segmentation(input_df,labels_df, topic_segmentation_algorithm = TopicSegmentationAlgorithm.EVEN, topic_segmentation_config = TopicSegmentationConfig(), meeting_id_col_name = "id", start_col_name = "starttime", end_col_name = "endtime", caption_col_name="text")

meeting_id -> task_idx
IS1005b -> 0
[0, 30, 60, 90, 120, 150, 180, 210]
TS3007b -> 1
[0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600]
ES2011a -> 2
[0, 30, 60, 90, 120, 150, 180, 210]
ES2013c -> 3
[0, 30, 60, 90, 120, 150, 180, 210, 240]
ES2003c -> 4
[0, 30, 60, 90, 120, 150, 180, 210]
TS3003d -> 5
[0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 630, 660, 690]
TS3004a -> 6
[0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360]
TS3009a -> 7
[0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330]
TS3005a -> 8
[0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360]
ES2015a -> 9
[0, 30, 60, 90, 120, 150, 180, 210, 240, 270]
IS1007c -> 10
[0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390]
ES2010b -> 11
[0, 30, 60, 90, 120, 150, 180]
ES2014b -> 12
[0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570]
ES2016c -> 13
[0, 30, 

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m

<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Ch

{'IS1005b': [0, 30, 60, 90, 120, 150, 180, 210], 'TS3007b': [0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600], 'ES2011a': [0, 30, 60, 90, 120, 150, 180, 210], 'ES2013c': [0, 30, 60, 90, 120, 150, 180, 210, 240], 'ES2003c': [0, 30, 60, 90, 120, 150, 180, 210], 'TS3003d': [0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 630, 660, 690], 'TS3004a': [0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360], 'TS3009a': [0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330], 'TS3005a': [0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360], 'ES2015a': [0, 30, 60, 90, 120, 150, 180, 210, 240, 270], 'IS1007c': [0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390], 'ES2010b': [0, 30, 60, 90, 120, 150, 180], 'ES2014b': [0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570], 'ES2016c': [0, 30, 60, 90, 120, 150], 'TS3003a': [0, 30, 60,

{'average_Pk_flattened': 0.5838338915864022,
 'average_windiff_flattened': 0.7287174165393657,
 'average_Pk_top_level': 0.5838338915864022,
 'average_windiff_top_level': 0.7287174165393657}

## BERT

In [None]:
from transformers import RobertaModel, RobertaTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Initialize the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Use a GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the model to evaluation mode
model.eval()

def get_features_from_sentence(batch_sentences, layer=-2):
    """
    Extracts the RoBERTa semantic representation from a sentence
    using an averaged value of the `layer`-th layer.
    Returns a 2-dimensional tensor of size (batch_size, feature_size).
    """
    # Tokenize the input sentences
    input_ids = [tokenizer.encode(sentence, return_tensors='pt') for sentence in batch_sentences]

    # Pad the sequences with zeros to the maximum sequence length
    max_length = max([input_id.size(1) for input_id in input_ids])
    input_ids = [F.pad(input_id, pad=(0, max_length - input_id.size(1))) for input_id in input_ids]

    # Concatenate the padded sequences along the batch dimension
    input_ids = torch.cat(input_ids, dim=0)

    input_ids.to(device)

    # Extract the features from the model
    with torch.no_grad():
      all_layers = model.forward(input_ids.to(device), attention_mask=input_ids.to(device).ne(0),output_hidden_states=True)[-1]
        # all_layers = model(input_ids)[0]

    # Average the features of the specified layer across the tokens in each sentence
    layer_output = all_layers[layer]
    pooling = torch.nn.AvgPool2d((max_length, 1))
    batch_features = pooling(layer_output).squeeze() #layer_output.mean(dim=1)
    return batch_features

# Test the function
batch_sentences = ["This is a test input.", "Another test input."]
batch_features = get_features_from_sentence(batch_sentences)
print(batch_features.shape)  # Should be (2, 768)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([2, 768])


In [None]:
# core.py  -> https://github.com/gdamaskinos/unsupervised_topic_segmentation/blob/main/core.py

PARALLEL_INFERENCE_INSTANCES = 100 # num of batches that the meeting will be split into (probably not optimal !TO CHANGE!)

def split_list(a, n):
    """
    a utility function that is used to split a given list into smaller lists of a specified size. 
    :list a:
    :an integer n: 
    :return a generator that yields n smaller lists, each containing a portion of the elements from the input list

    Here is an example of how the split_list function could be used:

    a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    n = 3

    for small_list in split_list(a, n):
        print(small_list)

    # Output:
    # [1, 2, 3, 4]
    # [5, 6, 7]
    # [8, 9, 10]

    """
    k, m = divmod(len(a), n)
    return (
        a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)]
        for i in range(min(len(a), n))
    )


def flatten_features(batches_features):
    res = []
    for batch_features in batches_features:
        res += batch_features
    return res


def get_timeseries(caption_indexes, features):
    timeseries = []
    for caption_index in caption_indexes:
        timeseries.append(features[caption_index])
    return timeseries


def topic_segmentation_bert(
    df: pd.DataFrame,
    meeting_id_col_name: str,
    start_col_name: str,
    end_col_name: str,
    caption_col_name: str,
    topic_segmentation_configs: TopicSegmentationConfig,
):
    textiling_hyperparameters = topic_segmentation_configs.TEXT_TILING

    # parallel inference
    batches_features = []
    for batch_sentences in split_list(
        df[caption_col_name], PARALLEL_INFERENCE_INSTANCES
    ):
        batches_features.append(get_features_from_sentence(batch_sentences))
    features = flatten_features(batches_features)

    # meeting_id -> list of topic change start times
    segments = {}
    task_idx = 0
    print("meeting_id -> task_idx")
    for meeting_id in set(df[meeting_id_col_name]):
        print("%s -> %d" % (meeting_id, task_idx))
        task_idx += 1

        meeting_data = df[df[meeting_id_col_name] == meeting_id]
        caption_indexes = list(meeting_data.index)

        timeseries = get_timeseries(caption_indexes, features)
        block_comparison_score_timeseries = block_comparison_score(
            timeseries, k=textiling_hyperparameters.SENTENCE_COMPARISON_WINDOW
        )

        block_comparison_score_timeseries = smooth(
            block_comparison_score_timeseries,
            n=textiling_hyperparameters.SMOOTHING_PASSES,
            s=textiling_hyperparameters.SMOOTHING_WINDOW,
        )

        depth_score_timeseries = depth_score(block_comparison_score_timeseries)

        meeting_start_time = meeting_data[start_col_name].iloc[0]
        meeting_end_time = meeting_data[end_col_name].iloc[-1]
        meeting_duration = meeting_end_time - meeting_start_time
        segments[meeting_id] = depth_score_to_topic_change_indexes(
            depth_score_timeseries,
            meeting_duration,
            topic_segmentation_configs=topic_segmentation_configs,
        )
        print(segments[meeting_id])

    return segments


## FUNCTIONS THAT ARE USED IN `TOPIC_SEGMENTATION_BERT`

def block_comparison_score(timeseries, k):
    """
    comparison score for a gap (i)
    cfr. docstring of block_comparison_score
    """
    res = []
    for i in range(k, len(timeseries) - k):
        first_window_features = compute_window(timeseries, i - k, i + 1)
        second_window_features = compute_window(timeseries, i + 1, i + k + 2)
        res.append(
            sentences_similarity(first_window_features, second_window_features)
        )

    return res


def compute_window(timeseries, start_index, end_index):
    """given start and end index of embedding, compute pooled window value
    [window_size, 768] -> [1, 768]
    """
    stack = torch.stack([features for features in timeseries[start_index:end_index]])
    stack = stack.unsqueeze(
        0
    )  # https://jbencook.com/adding-a-dimension-to-a-tensor-in-pytorch/
    stack_size = end_index - start_index
    pooling = torch.nn.MaxPool2d((stack_size - 1, 1)) # CHECK FOR SENSE?!
    return pooling(stack).squeeze(0)


def sentences_similarity(first_sentence_features, second_sentence_features) -> float:
    """
    Given two senteneces embedding features compute cosine similarity
    """
    similarity_metric = torch.nn.CosineSimilarity()
    return float(similarity_metric(first_sentence_features, second_sentence_features))


def smooth(timeseries, n, s):
    smoothed_timeseries = timeseries[:]
    for _ in range(n):
        for index in range(len(smoothed_timeseries)):
            neighbours = smoothed_timeseries[
                max(0, index - s) : min(len(timeseries) - 1, index + s)
            ]
            smoothed_timeseries[index] = sum(neighbours) / len(neighbours)
    return smoothed_timeseries


def depth_score(timeseries):
    """
    The depth score corresponds to how strongly the cues for a subtopic changed on both sides of a
    given token-sequence gap and is based on the distance from the peaks on both sides of the valleyto that valley.
    returns depth_scores
    """
    depth_scores = []
    for i in range(1, len(timeseries) - 1):
        left, right = i - 1, i + 1
        while left > 0 and timeseries[left - 1] > timeseries[left]:
            left -= 1
        while (
            right < (len(timeseries) - 1) and timeseries[right + 1] > timeseries[right]
        ):
            right += 1
        depth_scores.append(
            (timeseries[right] - timeseries[i]) + (timeseries[left] - timeseries[i])
        )
    return depth_scores


def depth_score_to_topic_change_indexes(
    depth_score_timeseries,
    meeting_duration,
    topic_segmentation_configs=TopicSegmentationConfig,
):
    """
    capped add a max segment limit so there are not too many segments, used for UI improvements on the Workplace TeamWork product
    """

    capped = topic_segmentation_configs.MAX_SEGMENTS_CAP
    average_segment_length = (
        topic_segmentation_configs.MAX_SEGMENTS_CAP__AVERAGE_SEGMENT_LENGTH
    )
    threshold = topic_segmentation_configs.TEXT_TILING.TOPIC_CHANGE_THRESHOLD * max(
        depth_score_timeseries
    )

    print("DEPTH_SCORE_TIMESERIES:")
    print(list(depth_score_timeseries))

    if depth_score_timeseries == []:
        return []

    local_maxima_indices, local_maxima = get_local_maxima(depth_score_timeseries)

    if local_maxima == []:
        return []

    if capped:  # capped is segmentation used for UI
        # sort based on maxima for pruning
        local_maxima, local_maxima_indices = arsort2(local_maxima, local_maxima_indices)

        # local maxima are sorted by depth_score value and we take only the first K
        # where the K+1th local maxima is lower then the threshold
        for thres in range(len(local_maxima)):
            if local_maxima[thres] <= threshold:
                break

        max_segments = int(meeting_duration / average_segment_length)
        slice_length = min(max_segments, thres)

        local_maxima_indices = local_maxima_indices[:slice_length]
        local_maxima = local_maxima[:slice_length]

        # after pruning, sort again based on indices for chronological ordering
        local_maxima_indices, _ = arsort2(local_maxima_indices, local_maxima)

    else:  # this is the vanilla TextTiling used for Pk optimization
        filtered_local_maxima_indices = []
        filtered_local_maxima = []

        for i, m in enumerate(local_maxima):
            if m > threshold:
                filtered_local_maxima.append(m)
                filtered_local_maxima_indices.append(i)

        local_maxima = filtered_local_maxima
        local_maxima_indices = filtered_local_maxima_indices

    print("LOCAL_MAXIMA_INDICES:")
    print(list(local_maxima_indices))

    return local_maxima_indices

  
def get_local_maxima(array):
    local_maxima_indices = []
    local_maxima_values = []
    for i in range(1, len(array) - 1):
        if array[i - 1] < array[i] and array[i] > array[i + 1]:
            local_maxima_indices.append(i)
            local_maxima_values.append(array[i])
    return local_maxima_indices, local_maxima_values


def arsort2(array1, array2):
    x = np.array(array1)
    y = np.array(array2)

    sorted_idx = x.argsort()[::-1]
    return x[sorted_idx], y[sorted_idx]

In [None]:
## THIS IS A REPETIOTION OF THE ABOVE `get_features_from_sentence` FUNCTION LINE BY LINE JUST TO BE SURE

layer=-2
# Tokenize the input sentences
input_ids = [tokenizer.encode(sentence, return_tensors='pt') for sentence in batch_sentences]

# Pad the sequences with zeros to the maximum sequence length
max_length = max([input_id.size(1) for input_id in input_ids])
input_ids = [F.pad(input_id, pad=(0, max_length - input_id.size(1))) for input_id in input_ids]

# Concatenate the padded sequences along the batch dimension
input_ids = torch.cat(input_ids, dim=0)

# Extract the features from the model
with torch.no_grad():
  all_layers = model.forward(input_ids.to(device), attention_mask=input_ids.to(device).ne(0),output_hidden_states=True)[-1]
    # all_layers = model(input_ids)[0]

# Average the features of the specified layer across the tokens in each sentence
layer_output = all_layers[layer]
pooling = torch.nn.AvgPool2d((max_length, 1))
batch_features = pooling(layer_output).squeeze()
print(batch_features.shape)

torch.Size([2, 768])


In [None]:
 meeting_id_col_name = "id"
 start_col_name = "starttime"
 end_col_name = "endtime"
 caption_col_name="text"
 topic_segmentation_configs=TopicSegmentationConfig()

In [None]:
## THIS IS A REPETIOTION OF THE ABOVE `topic_segmentation_bert` FUNCTION LINE BY LINE JUST TO BE SURE

textiling_hyperparameters = TextTilingHyperparameters() #textiling_hyperparameters = topic_segmentation_configs.TEXT_TILING

# parallel inference
batches_features = []
for batch_sentences in split_list(input_df[caption_col_name], PARALLEL_INFERENCE_INSTANCES):
    batches_features.append(get_features_from_sentence(batch_sentences))
features = flatten_features(batches_features)

In [None]:
# meeting_id -> list of topic change start times
segments = {}
task_idx = 0

In [None]:
print("meeting_id -> task_idx")
for meeting_id in set(input_df[meeting_id_col_name]):
    print("%s -> %d" % (meeting_id, task_idx))
    task_idx += 1

    meeting_data = input_df[input_df[meeting_id_col_name] == meeting_id]
    caption_indexes = list(meeting_data.index)

    timeseries = get_timeseries(caption_indexes, features)
    break

meeting_id -> task_idx
IS1005b -> 0


In [None]:
len(timeseries)

212

In [None]:
stack = torch.stack([features for features in timeseries[0:16]])

In [None]:
stack.shape

torch.Size([16, 768])

In [None]:
stack = stack.unsqueeze(
        0
    )

In [None]:
stack

tensor([[[ 0.0619, -0.2647,  0.2537,  ..., -0.3019,  0.1373, -0.1885],
         [ 0.0893, -0.7606,  0.2438,  ...,  0.1475,  0.2624, -0.3302],
         [ 0.1079, -0.0230,  0.2105,  ..., -0.1893,  0.0111, -0.2747],
         ...,
         [-0.1000, -0.4266,  0.1026,  ...,  0.4359,  0.1632, -0.2909],
         [-0.4250, -0.0019, -0.0444,  ..., -0.1158, -0.0331, -0.0877],
         [ 0.2642,  0.0364, -0.3780,  ...,  0.4504,  0.3100, -0.3753]]],
       device='cuda:0')

In [None]:
stack.shape

torch.Size([1, 16, 768])

In [None]:
stack_size = 16 - 0
stack_size

16

In [None]:
pooling = torch.nn.MaxPool2d((stack_size-1, 1))

In [None]:
first_window_features = pooling(stack).squeeze(0)

In [None]:
second_window_features = compute_window(timeseries, 16, 32)

In [None]:
first_window_features[0].shape, second_window_features[0].shape

(torch.Size([768]), torch.Size([768]))

In [None]:
torch.nn.CosineSimilarity()(first_window_features, second_window_features)

tensor([0.9553], device='cuda:0')

In [None]:
block_comparison_score_timeseries = block_comparison_score(
        timeseries, k=textiling_hyperparameters.SENTENCE_COMPARISON_WINDOW
    )

In [None]:
block_comparison_score_timeseries

[0.9552718997001648,
 0.9618903398513794,
 0.9628734588623047,
 0.9639553427696228,
 0.9650323987007141,
 0.9644960165023804,
 0.9652298092842102,
 0.9642989635467529,
 0.9743444919586182,
 0.9732085466384888,
 0.9734659790992737,
 0.973329484462738,
 0.9714340567588806,
 0.9713731408119202,
 0.9710617065429688,
 0.9647892713546753,
 0.9661908149719238,
 0.9680362939834595,
 0.9665354490280151,
 0.9683114886283875,
 0.9697588682174683,
 0.9693399667739868,
 0.9713379144668579,
 0.968528151512146,
 0.9655241370201111,
 0.9653291702270508,
 0.9462243318557739,
 0.9597635865211487,
 0.9609294533729553,
 0.959946870803833,
 0.9635553956031799,
 0.9690277576446533,
 0.9689504504203796,
 0.9708817601203918,
 0.9684669971466064,
 0.9677728414535522,
 0.9681336879730225,
 0.9710786938667297,
 0.9737392663955688,
 0.9721769094467163,
 0.969790518283844,
 0.9692502021789551,
 0.9468590617179871,
 0.9651015996932983,
 0.9494200944900513,
 0.9490715265274048,
 0.958331286907196,
 0.972060143947601

In [None]:
block_comparison_score_timeseries = smooth(
            block_comparison_score_timeseries,
            n=textiling_hyperparameters.SMOOTHING_PASSES,
            s=textiling_hyperparameters.SMOOTHING_WINDOW,
        )

In [None]:
# block_comparison_score_timeseries

In [None]:
depth_score_timeseries = depth_score(block_comparison_score_timeseries)

In [None]:
# depth_score_timeseries

In [None]:
meeting_start_time = meeting_data[start_col_name].iloc[0]
meeting_end_time = meeting_data[end_col_name].iloc[-1]
meeting_duration = meeting_end_time - meeting_start_time
segments[meeting_id] = depth_score_to_topic_change_indexes(
    depth_score_timeseries,
    meeting_duration,
    topic_segmentation_configs=topic_segmentation_configs,
)
print(segments[meeting_id])

DEPTH_SCORE_TIMESERIES:
[0.013387084021815099, 0.011240914478548802, 0.009626887753256597, 0.00828134642506484, 0.007876766860135831, 0.00730758068675641, 0.007488410468795337, 0.002556061153882183, 0.0006578591564903036, -0.00041995807259809226, -0.0008906193688744679, -0.0002824428702297155, -0.0001224176085088402, -0.001437850415641151, 0.0007378730015261681, 0.0030030525647362083, 0.005263640002084458, 0.006249441434391656, 0.006118173396401971, 0.0006702472634572132, 0.00036181089939191224, -0.0008098332960990717, -0.0009488489052777993, -2.5676489486592402e-05, -0.003548631085420695, 0.005634854066679518, 0.020618522118188043, 0.02107896757552341, 0.0210766328399159, 0.009474729002742155, 0.00577827741609982, 0.003968705234915548, 0.0020982642943174756, 0.0023704253109110285, 0.002853583665734849, 0.0029147395834117074, 0.0014728145953963878, -0.0005784341630306056, -0.0009469263484302992, -0.0003407990431767427, -0.005442727264476699, 0.005265737429176243, 0.004208038889687016, 

In [None]:
# prediction_segmentations = topic_segmentation_bert(input_df, meeting_id_col_name = "id", start_col_name = "starttime", end_col_name = "endtime", caption_col_name="text", topic_segmentation_configs=TopicSegmentationConfig())

In [None]:
eval_topic_segmentation(input_df,labels_df, topic_segmentation_algorithm = TopicSegmentationAlgorithm.BERT, topic_segmentation_config = TopicSegmentationConfig(), meeting_id_col_name = "id", start_col_name = "starttime", end_col_name = "endtime", caption_col_name="text")

meeting_id -> task_idx
IS1005b -> 0
DEPTH_SCORE_TIMESERIES:
[0.013387084021815099, 0.011240914478548802, 0.009626887753256597, 0.00828134642506484, 0.007876766860135831, 0.00730758068675641, 0.007488410468795337, 0.002556061153882183, 0.0006578591564903036, -0.00041995807259809226, -0.0008906193688744679, -0.0002824428702297155, -0.0001224176085088402, -0.001437850415641151, 0.0007378730015261681, 0.0030030525647362083, 0.005263640002084458, 0.006249441434391656, 0.006118173396401971, 0.0006702472634572132, 0.00036181089939191224, -0.0008098332960990717, -0.0009488489052777993, -2.5676489486592402e-05, -0.003548631085420695, 0.005634854066679518, 0.020618522118188043, 0.02107896757552341, 0.0210766328399159, 0.009474729002742155, 0.00577827741609982, 0.003968705234915548, 0.0020982642943174756, 0.0023704253109110285, 0.002853583665734849, 0.0029147395834117074, 0.0014728145953963878, -0.0005784341630306056, -0.0009469263484302992, -0.0003407990431767427, -0.005442727264476699, 0.005265


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () --

DEPTH_SCORE_TIMESERIES:
[-0.00038339197635650635, -0.0004432164132595062, 0.0005260929465293884, 0.0019191568717360497, 0.002661542035639286, 0.0029375695157796144, 0.002355406992137432, 0.003510526556055993, 0.004481632669921964, 0.005198285391088575, 0.0062102242663968354, 0.0036810374549531844, 0.000663892878947081, 0.012573113409288794, 0.013593533750153597, 0.009229430429385843, 0.0015580055800086257, 0.0008717264873752129, -0.0006444622704435687, -0.0007667837058562554, -0.0002761274321159135, -0.00012846443697966947, -0.0017746279282846444, 0.0008300140845163284, 0.002789317211308706, 0.004996879023400291, 0.007359343595167989, 0.009119760405334354, 0.00999679511428253, 0.010051263204927885, 0.009609817829525857, -0.0011143863540971077, 0.0015416675486000786, 0.0030053985348134082, 0.000593708282980776, -0.0004449876344990722, 0.0006734295926623668, 0.001245695936861746, 0.0014650442611970904, 0.004018107606156618, 0.005253847889044816, 0.005564683598968445, 0.007629865612575948

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m

<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Change () ------>>


<<------ Topic Ch

{'IS1005b': array([138,  72,  45]), 'TS3007b': array([23,  7]), 'ES2011a': array([61, 45, 31]), 'ES2013c': array([129,  30,   5]), 'ES2003c': array([100,  79]), 'TS3003d': array([625, 485, 469, 349, 263, 247,  37,  22,   6]), 'TS3004a': array([274, 247, 211,  44,  30,  11]), 'TS3009a': array([300, 278, 156, 109,  86,  29,  13]), 'TS3005a': array([83, 64]), 'ES2015a': array([172, 128]), 'IS1007c': array([357, 314, 137,  53]), 'ES2010b': array([132, 118,  97,  86,  73,  57,  13,   3]), 'ES2014b': array([545, 450, 419, 390, 375, 333, 240, 218, 210, 176, 126, 109,  90,
        32]), 'ES2016c': array([34]), 'TS3003a': array([341, 319, 213, 177]), 'ES2008b': array([549, 479, 460, 352, 325, 274,   8]), 'IS1004b': array([310, 276]), 'IB4003': array([102,  76,  60,   6]), 'IS1006b': array([158,  88,  72]), 'ES2009c': array([196, 180]), 'ES2007d': array([351, 335, 269, 139, 123,  31]), 'ES2009d': array([626, 187, 164, 147]), 'TS3006a': array([602, 586, 545, 428, 393, 358, 334, 260, 204, 187, 156

{'average_Pk_flattened': 0.4733611118758348,
 'average_windiff_flattened': 0.5081729116263476,
 'average_Pk_top_level': 0.4733611118758348,
 'average_windiff_top_level': 0.5081729116263476}