In [55]:
# !python -m spacy download en_core_web_sm

In [50]:
import attr
import numpy as np
import pandas as pd
import spacy
import nltk
from nltk.tokenize.texttiling import TextTilingTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def load_sentence_transformer(model_name='all-MiniLM-L6-v2'):
    """
    Build the sentence-embedding model.

    Parameters
    ----------
    model_name: str
        name handed to ``SentenceTransformer``.

    Return
    ------
    SentenceTransformer
        the instantiated model
    """
    return SentenceTransformer(model_name)


def load_spacy(model_name='en_core_web_sm'):
    """
    Load a spaCy pipeline.

    Parameters
    ----------
    model_name: str
        name of the installed spaCy pipeline passed to ``spacy.load``.
        Defaults to 'en_core_web_sm', the pipeline this notebook was written for.

    Return
    ------
    the object returned by ``spacy.load(model_name)``
    """
    return spacy.load(model_name)

# Module-level resources: SemanticTextSegmentation reads `model` and `nlp`
# as globals, so both must exist before any similarity is computed.
model = load_sentence_transformer()
nlp = load_spacy()
nltk.download('stopwords')

# Read each CSV and keep only the rows whose meeting_id is below 1.
input_df = pd.read_csv('./train.csv').loc[lambda frame: frame['meeting_id'] < 1]
label_df = pd.read_csv('./test.csv').loc[lambda frame: frame['meeting_id'] < 1]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akvelon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:

@attr.s
class SemanticTextSegmentation:

    """
    Segment a call transcript based on topics discussed in the call using
    TextTiling with sentence similarity via a sentence transformer.

    Parameters
    ----------
    data: pd.DataFrame
        Pass the transcript in the dataframe format

    utterance: str
        pass the column name which represents the utterance in the transcript dataframe

    Raises
    ------
    ValueError
        if `utterance` is not one of the columns of `data`

    Notes
    -----
    The similarity computation reads the module-level globals `nlp`
    (sentence splitting) and `model` (sentence embeddings); both must be
    defined before `get_segments` / `get_segments_ind` is called.
    """

    data = attr.ib()
    utterance = attr.ib(default='utterance')

    def __attrs_post_init__(self):
        # Fail at construction time with a clear message instead of a late
        # KeyError when the column is first accessed in `_text_tilling`.
        columns = self.data.columns.tolist()
        if self.utterance not in columns:
            raise ValueError(
                "utterance column {!r} not found in data; available columns: {}".format(
                    self.utterance, columns))

    def get_segments_ind(self, threshold=0.7):
        """
        returns one boundary flag per TextTiling segment.

        Parameters
        ----------
        threshold: float
            sentence similarity threshold. (used to merge the sentences into coherent segments)

        Return
        ------
        merge_index: list
            list of 0/1 flags; 1 means the segment starts a new merged
            segment, 0 means it is merged with the previous one. The first
            flag is always 0.
        """
        segments = self._text_tilling()
        return self._merge_segments(segments, threshold)

    def get_segments(self, threshold=0.7):
        """
        returns the transcript segments computed with texttiling and sentence-transformer.

        Parameters
        ----------
        threshold: float
            sentence similarity threshold. (used to merge the sentences into coherent segments)

        Return
        ------
        new_segments: list
            list of segments
        """
        segments = self._text_tilling()
        groups = self._index_mapping(self._merge_segments(segments, threshold))
        return [' '.join(segments[idx] for idx in group) for group in groups]

    def _merge_segments(self, segments, threshold):
        """
        Compare each segment with its successor and flag topic boundaries.

        Returns a list with one entry per segment: 0 when the similarity to
        the previous segment is >= threshold (merge), 1 otherwise (boundary).
        An empty `segments` yields an empty list.
        """
        # No segments -> no flags; returning [0] here would point at a
        # segment that does not exist.
        if len(segments) == 0:
            return []

        segment_map = [0]
        for text1, text2 in zip(segments[:-1], segments[1:]):
            sim = self._get_similarity(text1, text2)
            segment_map.append(0 if sim >= threshold else 1)
        return segment_map

    def _index_mapping(self, segment_map):
        """
        Turn boundary flags into groups of consecutive segment indices.

        Every flag equal to 1 starts a new group; any other value extends the
        current group. Empty groups are never emitted.
        """
        index_list = []
        current = []
        for index, flag in enumerate(segment_map):
            if flag == 1 and current:
                index_list.append(current)
                current = []
            current.append(index)
        if current:
            index_list.append(current)
        return index_list

    def _get_similarity(self, text1, text2):
        """
        Cosine similarity between the mean sentence embeddings of two texts.

        Returns a float. When either text has no usable sentence, or an
        embedding contains NaN, 1.0 is returned so the two texts are treated
        as similar (i.e. merged for any threshold <= 1).
        """
        # NOTE(review): the minimum word count differs between the two texts
        # (> 1 vs > 2), which makes the measure asymmetric. Kept as-is to
        # preserve existing results - confirm whether this is intended.
        sentence_1 = [i.text.strip()
                      for i in nlp(text1).sents if len(i.text.split(' ')) > 1]
        sentence_2 = [i.text.strip()
                      for i in nlp(text2).sents if len(i.text.split(' ')) > 2]

        # Nothing to embed: short-circuit instead of averaging an empty array
        # and relying on the resulting NaN being caught below.
        if not sentence_1 or not sentence_2:
            return 1.0

        embeding_1 = np.mean(model.encode(sentence_1), axis=0).reshape(1, -1)
        embeding_2 = np.mean(model.encode(sentence_2), axis=0).reshape(1, -1)

        if np.any(np.isnan(embeding_1)) or np.any(np.isnan(embeding_2)):
            return 1.0

        # cosine_similarity returns a (1, 1) array for two single-row inputs;
        # unwrap it so every path returns a plain float.
        return float(cosine_similarity(embeding_1, embeding_2)[0, 0])

    def _text_tilling(self):
        """
        Run NLTK's TextTilingTokenizer (w=15, k=10) over the whole transcript.

        Utterances are joined with "\\n\\n\\t" before tokenizing and that
        separator is replaced by a single space in each returned segment.
        """
        tt = TextTilingTokenizer(w=15, k=10)
        text = '\n\n\t'.join(self.data[self.utterance].tolist())
        segment = tt.tokenize(text)
        return [i.replace("\n\n\t", ' ') for i in segment]



# Segment the transcript stored in the 'caption' column and fetch the
# per-segment boundary flags at a similarity threshold of 0.5.
segmenter = SemanticTextSegmentation(data=input_df, utterance='caption')
binary = segmenter.get_segments_ind(0.5)
len(binary)

32

In [52]:
# Boundary flags from get_segments_ind: 1 starts a new merged segment,
# 0 merges the segment with the previous one.
binary

[0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0]

In [53]:
# Merged segment texts for the same 0.5 similarity threshold.
segments = segmenter.get_segments(0.5)
len(segments)

9

In [54]:
# Preview the text of the first merged segment.
segments[0]

"Here we go . Welcome everybody . Um , I'm Abigail Claflin . You can call me Abbie . 'S see . PowerPoint , that's not it . There we go . So this is our kick off meeting . Um and I guess we should all get acquainted {vocalsound} . {vocalsound} Let's {disfmarker} shall we all introduce ourselves ? Hi I'm Chiara , I'm the um Marketing Expert . Um , would you like me to talk about my aims at the moment , or would you like me to just say my name and then we can talk about business later ? I think we'll get around to that , yeah . So this is just introductions yeah . We'll get round to that later .  My name is Chiara and I'm the Marketing Expert . Okay . I forgot to s say I'm the Project Manager but I figured you all knew that already , {vocalsound} {vocalsound} {vocalsound} um so . {vocalsound} I'm Stephanie and I am the User Interface Designer . I'm Krista and I'm the Industrial Designer . Okay . Um so f here's our agenda for today . Um we're gonna do some tool training , project plan and 