In [1]:
# Uncomment below line if this library is missing
# !python -m spacy download en_core_web_sm

In [2]:
import attr
import numpy as np
import pandas as pd
import spacy
import nltk
from nltk.tokenize.texttiling import TextTilingTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def load_sentence_transformer(model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    return model


def load_spacy():
    return spacy.load('en_core_web_sm')


model = load_sentence_transformer()
nlp = load_spacy()
nltk.download('stopwords')

input_df = pd.read_csv('./data/train_ami.csv')
label_df = pd.read_csv('./data/test_ami.csv')
input_df = input_df[input_df['meeting_id'] < 1]  # for performance test only first meeting
label_df = label_df[label_df['meeting_id'] < 1]  # for performance test only first meeting

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akvelon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:

@attr.s
class SemanticTextSegmentation:
    """
    Segment a call transcript based on topics discussed in the call using
    TextTilling with Sentence Similarity via sentence transformer.

    Paramters
    ---------
    data: pd.Dataframe
        Pass the trascript in the dataframe format

    utterance: str
        pass the column name which represent utterance in transcript dataframe

    """

    data = attr.ib()
    utterance = attr.ib(default='utterance')

    def __attrs_post_init__(self):
        columns = self.data.columns.tolist()

    def get_segments_ind(self, threshold=0.7):
        segments = self._text_tilling()
        merge_index = self._merge_segments(segments, threshold)
        return merge_index

    def get_segments(self, threshold=0.7):
        """
        returns the transcript segments computed with texttiling and sentence-transformer.

        Paramters
        ---------
        threshold: float
            sentence similarity threshold. (used to merge the sentences into coherant segments)

        Return
        ------
        new_segments: list
            list of segments
        """
        segments = self._text_tilling()
        merge_index = self._index_mapping(self._merge_segments(segments, threshold))
        new_segments = []
        for i in merge_index:
            seg = ' '.join([segments[_] for _ in i])
            new_segments.append(seg)
        return new_segments

    def _merge_segments(self, segments, threshold):
        segment_map = [0]
        sims = []
        for index, (text1, text2) in enumerate(zip(segments[:-1], segments[1:])):
            sim = self._get_similarity(text1, text2)
            sims.append(sim)

        # threshold = np.mean(sims) - 3 * np.var(sims)
        for sim in sims:
            if sim >= threshold:
                segment_map.append(0)
            else:
                segment_map.append(1)
        return segment_map

    def _index_mapping(self, segment_map):
        index_list = []
        temp = []
        for index, i in enumerate(segment_map):
            if i == 1:
                index_list.append(temp)
                temp = [index]
            else:
                temp.append(index)
        index_list.append(temp)
        return index_list

    def _get_similarity(self, text1, text2):
        sentence_1 = [i.text.strip()
                      for i in nlp(text1).sents if len(i.text.split(' ')) > 1]
        sentence_2 = [i.text.strip()
                      for i in nlp(text2).sents if len(i.text.split(' ')) > 2]
        embeding_1 = model.encode(sentence_1)
        embeding_2 = model.encode(sentence_2)
        embeding_1 = np.mean(embeding_1, axis=0).reshape(1, -1)
        embeding_2 = np.mean(embeding_2, axis=0).reshape(1, -1)

        if np.any(np.isnan(embeding_1)) or np.any(np.isnan(embeding_2)):
            return 1

        sim = cosine_similarity(embeding_1, embeding_2)
        return sim

    def _text_tilling(self):
        tt = TextTilingTokenizer(w=15, k=10)
        text = '\n\n\t'.join(self.data[self.utterance].tolist())
        segment = tt.tokenize(text)
        segment = [i.replace("\n\n\t", ' ') for i in segment]
        return segment

# segmenter = SemanticTextSegmentation(input_df, 'caption')
# binary = segmenter.get_segments_ind(threshold=0.5)

In [21]:
# segments = segmenter.get_segments(threshold=0.5)
# print('\n\n<-- Topic Change -->\n\n'.join(segments[:5]))

In [82]:
import pandas as pd
import re


def convert_time_to_seconds(time_str):
    time_parts = time_str.split(':')
    hours = 0
    minutes = 0
    seconds = 0
    milliseconds = 0

    if len(time_parts) == 2:
        minutes, seconds = map(float, time_parts)
    elif len(time_parts) == 3:
        hours, minutes, seconds = map(float, time_parts)

    seconds += (hours * 3600) + (minutes * 60)

    return seconds


def vvt_to_df(file_path):
    with open(file_path, 'r') as f:
        data = f.read()
    meeting_id = file_path  # assuming meeting ID is in the file name and just an integer
    utterances = re.findall(r'(\d?\d?:?\d+:\d+.\d+).+?(\d?\d?:?\d+:\d+.\d+)\n(.*?)\n\n', data, re.DOTALL)
    result = []
    utterance_id = 1
    for utterance in utterances:
        st, en, caption = utterance
        st_sec = convert_time_to_seconds(st)
        en_sec = convert_time_to_seconds(en)
        result.append(
            {'meeting_id': meeting_id, 'st': st_sec, 'en': en_sec, 'caption': caption, 'speaker': utterance_id})
        utterance_id += 1
    return pd.DataFrame(result)

In [91]:
file_path = './data/episode_001_large.vtt'
df = vvt_to_df(file_path)
print(df.head())

                     meeting_id     st     en  \
0  ./data/episode_001_large.vtt   0.00   4.20   
1  ./data/episode_001_large.vtt   4.20   6.60   
2  ./data/episode_001_large.vtt   6.60   8.68   
3  ./data/episode_001_large.vtt   8.68  11.92   
4  ./data/episode_001_large.vtt  11.92  16.96   

                                             caption  speaker  
0   As part of MIT course 6S099, Artificial Gener...        1  
1   I've gotten the chance to sit down with Max T...        2  
2                     He is a professor here at MIT.        3  
3   He's a physicist, spent a large part of his c...        4  
4   studying the mysteries of our cosmological un...        5  


In [92]:
df.tail()

Unnamed: 0,meeting_id,st,en,caption,speaker
1775,./data/episode_001_large.vtt,4951.44,4952.56,Thank you for your time today.,1776
1776,./data/episode_001_large.vtt,4952.56,4953.56,It's been awesome.,1777
1777,./data/episode_001_large.vtt,4953.56,4954.4,Thank you so much.,1778
1778,./data/episode_001_large.vtt,4954.4,4955.24,Thanks.,1779
1779,./data/episode_001_large.vtt,4955.24,4960.24,Have a great day.,1780


In [78]:
segmenter = SemanticTextSegmentation(df, 'caption')
segments = segmenter.get_segments(threshold=0.4)
print('\n\n<-- Topic Change -->\n\n'.join(segments[:5]))

 As part of MIT course 6S099, Artificial General Intelligence,  I've gotten the chance to sit down with Max Tegmark.  He is a professor here at MIT.  He's a physicist, spent a large part of his career  studying the mysteries of our cosmological universe.  But he's also studied and delved into the beneficial  possibilities and the existential risks  of artificial intelligence.  Amongst many other things, he is the cofounder  of the Future of Life Institute, author of two books,  both of which I highly recommend.  First, Our Mathematical Universe.  Second is Life 3.0.  He's truly an out of the box thinker and a fun personality,  so I really enjoy talking to him.  If you'd like to see more of these videos in the future,  please subscribe and also click the little bell icon  to make sure you don't miss any videos.  Also, Twitter, LinkedIn, agi.mit.edu  if you wanna watch other lectures  or conversations like this one.  Better yet, go read Max's book, Life 3.0.  Chapter seven on goals is my

In [79]:
len(segments)

25

In [98]:
nltk.download('punkt')

from nltk.tokenize import word_tokenize
from collections import Counter


def find_topic_timestamps(topics, sentences_df):
    topic_timestamps = []

    sentence_index = 0

    for topic in topics:
        topic_tokens = word_tokenize(topic)
        topic_word_counts = Counter(topic_tokens)

        start_time, end_time = None, None

        for i, row in sentences_df[sentences_df.index >= sentence_index].iterrows():
            sentence_tokens = [word for word in word_tokenize(row['caption']) if len(word) >= 2]
            sentence_index += 1

            if (len(sentence_tokens) - sum(topic_word_counts[token] > 0 for token in sentence_tokens)) < 2:
                for token in sentence_tokens:
                    topic_word_counts.subtract({token: 1})
                if start_time is None or row['st'] < start_time:
                    start_time = row['st']
                if end_time is None or row['en'] > end_time:
                    end_time = row['en']
            else:
                sentence_index -= 1
                break

        if start_time is not None and end_time is not None:
            topic_timestamps.append((start_time, end_time))

    return topic_timestamps


ts = find_topic_timestamps(segments, df)

[nltk_data] Downloading package punkt to /Users/akvelon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [106]:
def convert_seconds_to_time(seconds):
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds % 1) * 1000)
    seconds = int(seconds)
    return f"{hours}:{minutes:02}:{seconds:02}.{milliseconds:03}"


ts_formatted = np.array([list(map(convert_seconds_to_time, row)) for row in ts])

In [107]:
out_df = pd.DataFrame({"start_time": ts_formatted[:, 0], "end_time": ts_formatted[:, 1], "topic": segments})
out_df

Unnamed: 0,segment,start_time,end_time
0,"As part of MIT course 6S099, Artificial Gener...",0:00:00.000,0:01:36.159
1,"radio frequency interference, RFI, look it u...",0:01:36.159,0:02:25.039
2,and I hope you're still able to enjoy this c...,0:02:25.039,0:05:05.000
3,that's gotten the point of building advanced...,0:05:05.800,0:12:05.120
4,We're actually studying it a little bit in m...,0:12:05.120,0:12:27.919
5,is actually having locked in syndrome or is...,0:12:27.919,0:14:39.519
6,who say that actually some information proce...,0:14:39.519,0:15:45.039
7,it's like when you're in a relationship and ...,0:15:45.039,0:16:03.120
8,between everything seeming like there's cons...,0:16:03.519,0:17:39.480
9,or women don't have souls or whatever. So I...,0:17:39.480,0:26:47.960


In [109]:
out_df.to_json('converge_output.json', orient='records', indent=4)