In [None]:
from convokit import Corpus, PolitenessStrategies, TextParser
from convokit.transformer import Transformer
from inspect import signature
from collections import deque

In [None]:
# Root folder of the project data: the CANDOR corpus is loaded from inside it and
# the histogram images are saved under its "Outputs" subfolder.
# Change this to wherever the CANDOR corpus is located in the file system.
path_to_folder = "/content/gdrive/My Drive/INFO 4350 Final Project/Data and Analysis"

In [None]:
# Load the CANDOR dataset from the project folder on Drive.
# `corpus` is a notebook-level global that the later cells read directly.
corpus = Corpus(filename=f"{path_to_folder}/CANDOR-corpus-cliffhanger")

In [None]:
# Peek at one random utterance to see its text and the metadata fields available.
utt = corpus.random_utterance()
print("Utterance Text: ", utt.text)
print("Utterance Meta: ", utt.meta)

# Draw a random conversation. NOTE: `convo` is reused by a later cell that prints
# this conversation's ending and scores, so re-running this cell changes that output.
convo = corpus.random_conversation()
print(convo)

Utterance Text:  Yeah, they emailed me about that.
Utterance Meta:  ConvoKitMeta({'turn_id': 300, 'start': 1437.44, 'stop': 1440.96, 'interval': -2.3199999999999363, 'delta': 3.5199999999999823, 'questions': 0, 'end_question': False, 'overlap': True, 'n_words': 6})
Conversation('id': '3231ee9a-483f-464c-b563-da35de30594c', 'utterances': ['229458', '229459', '229460', '229461', '229462', '229463', '229464', '229465', '229466', '229467', '229468', '229469', '229470', '229471', '229472', '229473', '229474', '229475', '229476', '229477', '229478', '229479', '229480', '229481', '229482', '229483', '229484', '229485', '229486', '229487', '229488', '229489', '229490', '229491', '229492', '229493', '229494', '229495', '229496', '229497', '229498', '229499', '229500', '229501', '229502', '229503', '229504', '229505', '229506', '229507', '229508', '229509', '229510', '229511', '229512', '229513', '229514', '229515', '229516', '229517', '229518', '229519', '229520', '229521', '229522', '229523', 

### The transform( ) function

A transformer computes something and adds it to the corpus, mostly as metadata. Here we give a simple example of what a transform function looks like.

In [None]:
def calculate_politeness_score(polite_strat):
    """
    Collapse a politeness-strategy feature dict into one weighted score.

    :param polite_strat: mapping from politeness feature name to its value
        for one utterance; every key must be one of the features listed below.
    :return: the sum of `weight * value` over all entries (0 for an empty mapping).
    :raises KeyError: if `polite_strat` contains a feature with no weight.
    """
    # Per-feature weights; features weighted 0 do not affect the score.
    strategy_weights = {
        'feature_politeness_==Please==': 0.49,
        'feature_politeness_==Please_start==': -0.3,
        'feature_politeness_==HASHEDGE==': 0,
        'feature_politeness_==Indirect_(btw)==': 0.63,
        'feature_politeness_==Hedges==': 0.14,
        'feature_politeness_==Factuality==': -0.38,
        'feature_politeness_==Deference==': 0.78,
        'feature_politeness_==Gratitude==': 0.87,
        'feature_politeness_==Apologizing==': 0.36,
        'feature_politeness_==1st_person_pl.==': 0.08,
        'feature_politeness_==1st_person==': 0.08,
        'feature_politeness_==1st_person_start==': 0.12,
        'feature_politeness_==2nd_person==': 0.05,
        'feature_politeness_==2nd_person_start==': -0.3,
        'feature_politeness_==Indirect_(greeting)==': 0.43,
        'feature_politeness_==Direct_question==': -0.27,
        'feature_politeness_==Direct_start==': -0.43,
        'feature_politeness_==HASPOSITIVE==': 0.12,
        'feature_politeness_==HASNEGATIVE==': -0.13,
        'feature_politeness_==SUBJUNCTIVE==': 0,
        'feature_politeness_==INDICATIVE==': 0,
    }

    score = 0
    # Accumulate in the mapping's own iteration order.
    for feature, count in polite_strat.items():
        score = score + strategy_weights[feature] * count
    return score



In [None]:
from scipy.stats import kendalltau

class ConversationSmoothness(Transformer):
    """
    Conversation-level transformer that scores how smoothly a conversation ends.

    Only the last `end_len` utterances of each conversation are examined. Will only
    work on the Candor corpus: every utterance must carry the 'delta', 'start' and
    'stop' metadata fields. Conversations are expected to contain at least two
    utterances (the 'ratio' and 'tone' metrics divide by the number of pairs).

    Results are written to conversation metadata, depending on `metric`:

    * 'ratio'   -> `<output_field>_ratio`: for each pair of adjacent utterances in the
      ending, the smaller 'delta' divided by the larger one, averaged over all pairs.
    * 'decline' -> `<output_field>_decline`: the ending is split into consecutive,
      non-overlapping pairs (1st & 2nd, 3rd & 4th, ...); the Kendall tau between the
      pairs ordered by decreasing absolute 'delta' difference and their chronological
      order.
    * 'tone'    -> `<output_field>_tone_pos_count1/2`, `..._neg_count1/2` (HASPOSITIVE /
      HASNEGATIVE totals for the first / second utterance of each pair), `..._pos`,
      `..._neg` (absolute difference of the per-pair averages), `..._politeness1/2`
      (summed politeness scores) and `..._politeness_diff`. Relies on the
      notebook-level function `calculate_politeness_score`.

    Every processed conversation additionally gets `<output_field>_last_utts_time`,
    the 'stop' of the last examined utterance minus the 'start' of the first one.

    :param metric: a string that chooses which computation method to use to compute smoothness. It will either be 'ratio', 'decline', or 'tone'. By default, it is 'ratio'
    :param end_len: the number of utterances to take from the conversation end (must be an even number)
    :param output_field: prefix of the conversation metadata fields written. Defaults to 'smoothness'.
    :param input_filter: a boolean function of signature `input_filter(conversation)`. Attributes will only be computed for conversations where `input_filter` returns `True`. By default, will always return `True`, meaning that attributes will be computed for all conversations.
    :param verbosity: frequency at which to print status messages when computing attributes.
    """

    # Metric names accepted by `transform`.
    _METRICS = ('ratio', 'decline', 'tone')

    def __init__(
        self,
        metric='ratio',
        end_len=12,
        output_field='smoothness',
        input_filter=None,
        verbosity=200,
    ):
        # The filter is always invoked with the conversation only; fall back to an
        # accept-everything filter when none is supplied.
        self.input_filter = input_filter if input_filter else (lambda convo: True)
        self.metric = metric
        self.end_len = end_len
        self.output_field = output_field
        self.verbosity = verbosity
        self.ps = PolitenessStrategies(verbose=0)
        self.parser = TextParser(verbosity=0)

    def _print_output(self, i):
        """Return True when a progress message is due for conversation index `i`."""
        return (self.verbosity > 0) and (i > 0) and (i % self.verbosity == 0)

    @staticmethod
    def _length_ratio(first, second):
        """Return the smaller of two 'delta' values divided by the larger one."""
        shorter, longer = (first, second) if first <= second else (second, first)
        if shorter == 0 and longer == 0:
            # Two zero-length turns are identical in length; avoid 0 / 0.
            return 1.0
        return shorter / longer

    def _ratio_score(self, utts):
        """Average the length ratio over every pair of adjacent utterances."""
        total = 0
        for utt, next_utt in zip(utts, utts[1:]):
            total += self._length_ratio(utt.meta['delta'], next_utt.meta['delta'])
        return total / (len(utts) - 1)

    @staticmethod
    def _decline_score(pairs):
        """Kendall tau between pairs sorted by decreasing delta gap and chronological order."""
        canonical_ordering = [(utt.id, paired_utt.id) for utt, paired_utt in pairs]
        by_gap = sorted(
            (
                (abs(utt.meta['delta'] - paired_utt.meta['delta']), (utt.id, paired_utt.id))
                for utt, paired_utt in pairs
            ),
            reverse=True,
        )
        ordering = [pair_ids for _, pair_ids in by_gap]
        # NOTE(review): kendalltau ranks the utterance ids themselves here, so the
        # result depends on how the ids compare with each other -- confirm the ids
        # sort chronologically for this corpus.
        tau, _ = kendalltau(ordering, canonical_ordering)
        return tau

    def _annotate_tone(self, convo, pairs):
        """Compute the tone statistics over `pairs` and write them to `convo` metadata."""
        # Index 0 accumulates the first utterance of each pair, index 1 the second.
        has_pos = [0, 0]
        has_neg = [0, 0]
        politeness = [0, 0]

        for utt, paired_utt in pairs:
            # The politeness annotator needs parsed text, so parse both utterances first.
            self.parser.transform_utterance(utt)
            self.parser.transform_utterance(paired_utt)
            utt_polite = self.ps.transform_utterance(utt, markers=True)
            paired_utt_polite = self.ps.transform_utterance(paired_utt, markers=True)

            for slot, annotated in enumerate((utt_polite, paired_utt_polite)):
                strategies = annotated.meta["politeness_strategies"]
                has_pos[slot] += strategies['feature_politeness_==HASPOSITIVE==']
                has_neg[slot] += strategies['feature_politeness_==HASNEGATIVE==']
                # Accumulate (not overwrite) so the totals can be averaged below.
                politeness[slot] += calculate_politeness_score(strategies)

        n_pairs = len(pairs)
        pos_diff = abs(has_pos[0] / n_pairs - has_pos[1] / n_pairs)
        neg_diff = abs(has_neg[0] / n_pairs - has_neg[1] / n_pairs)
        politeness_diff = abs(politeness[0] / n_pairs - politeness[1] / n_pairs)

        prefix = f'{self.output_field}_{self.metric}'
        convo.add_meta(f'{prefix}_pos_count1', has_pos[0])
        convo.add_meta(f'{prefix}_neg_count1', has_neg[0])
        convo.add_meta(f'{prefix}_pos_count2', has_pos[1])
        convo.add_meta(f'{prefix}_neg_count2', has_neg[1])
        convo.add_meta(f'{prefix}_pos', pos_diff)
        convo.add_meta(f'{prefix}_neg', neg_diff)
        convo.add_meta(f'{prefix}_politeness1', politeness[0])
        convo.add_meta(f'{prefix}_politeness2', politeness[1])
        convo.add_meta(f'{prefix}_politeness_diff', politeness_diff)

    def transform(self, corpus: Corpus) -> Corpus:
        """
        Compute the configured smoothness metric for every conversation that passes
        `input_filter` and annotate it in the conversation metadata.

        :param corpus: Corpus
        :return: the corpus
        :raises KeyError: if `metric` is not 'ratio', 'decline', or 'tone'
        """
        # Fail before touching any conversation rather than midway through the corpus.
        if self.metric not in self._METRICS:
            raise KeyError('metric must be ratio, decline, or tone')

        total = sum(1 for _ in corpus.iter_conversations())

        for idx, convo in enumerate(corpus.iter_conversations()):
            if self._print_output(idx):
                print(f"{idx:03d}/{total:03d} conversations processed")

            if not self.input_filter(convo):
                continue

            last_utts = [
                corpus.get_utterance(utt_id)
                for utt_id in convo.get_utterance_ids()[-self.end_len:]
            ]
            # Consecutive, non-overlapping pairs: (0, 1), (2, 3), ... A trailing
            # unpaired utterance (odd count) is ignored by zip.
            pairs = list(zip(last_utts[0::2], last_utts[1::2]))

            if self.metric == 'tone':
                self._annotate_tone(convo, pairs)
            else:
                if self.metric == 'ratio':
                    score = self._ratio_score(last_utts)
                else:
                    score = self._decline_score(pairs)
                convo.add_meta(f'{self.output_field}_{self.metric}', score)

            last_utt_time_delta = last_utts[-1].meta["stop"] - last_utts[0].meta["start"]
            convo.add_meta(f'{self.output_field}_last_utts_time', last_utt_time_delta)

        return corpus

In [None]:
# Build one transformer per smoothness metric ...
ratio_transformer = ConversationSmoothness(metric="ratio")
decline_transformer = ConversationSmoothness(metric="decline")
tone_transformer = ConversationSmoothness(metric="tone")

# ... then annotate the corpus in place with each of them, in the same order.
ratio_transformer.transform(corpus)
decline_transformer.transform(corpus)
tone_transformer.transform(corpus)

200/1650 conversations processed
400/1650 conversations processed
600/1650 conversations processed
800/1650 conversations processed
1000/1650 conversations processed
1200/1650 conversations processed
1400/1650 conversations processed
1600/1650 conversations processed
200/1650 conversations processed
400/1650 conversations processed
600/1650 conversations processed
800/1650 conversations processed
1000/1650 conversations processed
1200/1650 conversations processed
1400/1650 conversations processed
1600/1650 conversations processed
200/1650 conversations processed
400/1650 conversations processed
600/1650 conversations processed
800/1650 conversations processed
1000/1650 conversations processed
1200/1650 conversations processed
1400/1650 conversations processed
1600/1650 conversations processed


<convokit.model.corpus.Corpus at 0x788e90188bb0>

In [None]:
def print_conversation_ending(convo, end_len=12):
    """
    Print a conversation's id followed by its last `end_len` utterances.

    Speakers are shown as "SPEAKER 1", "SPEAKER 2", ... numbered by order of
    first appearance within the printed ending. A blank line is printed last.

    :param convo: the conversation to print; utterances are looked up on the
        conversation itself, so no global corpus is needed.
    :param end_len: how many trailing utterances to print. Defaults to 12.
    """
    print(convo.id)
    speaker_numbers = {}
    for utt_id in convo.get_utterance_ids()[-end_len:]:
        utt = convo.get_utterance(utt_id)
        # setdefault only stores a number the first time a speaker is seen.
        speaker_number = speaker_numbers.setdefault(utt.speaker.id, len(speaker_numbers) + 1)
        print(f"SPEAKER {speaker_number}: {utt.text}")
    print()

In [None]:
# Show the ending of `convo` (the random conversation drawn in an earlier cell)
# next to the scores the three transformers wrote into its metadata.
print_conversation_ending(convo)
print("Smoothness (ratio, the higher the number, the more smooth it will be): ", convo.meta['smoothness_ratio'])
print("Smoothness (decline, the higher the number, the more smooth it will be): ", convo.meta['smoothness_decline'])
print("Smoothness (pos tone, the lower the number, the more smooth it will be): ", convo.meta['smoothness_tone_pos'])
print("Smoothness (neg tone, the lower the number, the more smooth it will be): ", convo.meta['smoothness_tone_neg'])
print("Smoothness (politeness, the lower the number, the more smooth it will be): ", convo.meta['smoothness_tone_politeness_diff'])

3231ee9a-483f-464c-b563-da35de30594c
SPEAKER 1: I think we made it, we make it.
SPEAKER 2: Okay.
SPEAKER 1: Yeah.
SPEAKER 2: Well you're probably you're probably tired so.
SPEAKER 1: I think we did it. I am after working all day.
SPEAKER 2: Uh huh. Yeah.
SPEAKER 1: Yeah.
SPEAKER 2: Oh my gosh. Okay well have a good rest of the day, Week whatever.
SPEAKER 1: Yeah. Thank you. You too.
SPEAKER 2: All right thank you.
SPEAKER 1: Bye.
SPEAKER 2: Goodbye.

Smoothness (ratio, the higher the number, the more smooth it will be):  0.45962043967974253
Smoothness (decline, the higher the number, the more smooth it will be):  -0.11475409836065575
Smoothness (pos tone, the lower the number, the more smooth it will be):  0.0
Smoothness (neg tone, the lower the number, the more smooth it will be):  0.0
Smoothness (politeness, the lower the number, the more smooth it will be):  0.0


In [None]:
# Draw another random conversation (overwriting `convo`) and show its ending
# next to its smoothness scores.
convo = corpus.random_conversation()
print_conversation_ending(convo)
print("Smoothness (ratio, the higher the number, the more smooth it will be): ", convo.meta['smoothness_ratio'])
print("Smoothness (decline, the higher the number, the more smooth it will be): ", convo.meta['smoothness_decline'])
print("Smoothness (pos tone, the lower the number, the more smooth it will be): ", convo.meta['smoothness_tone_pos'])
print("Smoothness (neg tone, the lower the number, the more smooth it will be): ", convo.meta['smoothness_tone_neg'])
print("Smoothness (politeness, the lower the number, the more smooth it will be): ", convo.meta['smoothness_tone_politeness_diff'])

06b83c0a-7bf1-4cf9-9e72-034615d97050
SPEAKER 1: We're supposed to be oh yeah we're way past.
SPEAKER 2: I think we've passed yeah, we've passed the time. Yeah, I think we're I think we're good, but yeah.
SPEAKER 1: Okay. Yeah. We had 45 minutes probably. We only need 25.
SPEAKER 2: Okay. Well it's so nice talking to you.
SPEAKER 1: That's great talking to you. I don't even It didn't seem like 45 minutes.
SPEAKER 2: Yeah, this is really great. I learned so much and like thank you for all the great advice to.
SPEAKER 1: Thank you. Always say yes. Go for all the adventures. You can nice talking to you too.
SPEAKER 2: Yeah, it was really nice talking to you.
SPEAKER 1: Um Have fun.
SPEAKER 2: Okay, well, goodbye. Thank you.
SPEAKER 1: Right.
SPEAKER 2: Bye.

Smoothness (ratio, the higher the number, the more smooth it will be):  0.49963096484501934
Smoothness (decline, the higher the number, the more smooth it will be):  0.459016393442623
Smoothness (pos tone, the lower the number, the mor

### Metrics

#### Generate Histograms

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def graph_plot(metric, metric_good_name, filename, bins=10, range=None):
  """
  Print summary statistics for one smoothness metric and save its histogram.

  Reads `smoothness_<metric>` from every conversation of the notebook-level
  `corpus`, prints the min, max, mean and standard deviation, and saves a
  density-normalised histogram to `<path_to_folder>/Outputs/<filename>.png`.

  :param metric: suffix of the conversation metadata field, e.g. 'ratio' or 'tone_pos'.
  :param metric_good_name: human-readable name used as the x-axis label.
  :param filename: name of the saved image, without the '.png' extension.
  :param bins: number of histogram bins.
  :param range: optional (low, high) limits of the histogram; values outside are ignored.
  """
  dist = [convo.meta[f"smoothness_{metric}"] for convo in corpus.iter_conversations()]

  print("Min:", np.min(dist))
  print("Max:", np.max(dist))
  print("Mean:", np.mean(dist))
  print("SD:", np.std(dist))

  # Use a fresh figure per call so successive histograms never draw onto each other.
  fig, ax = plt.subplots()
  ax.hist(dist, density=True, bins=bins, range=range if range else None)
  # density=True normalises the bars to a probability density, not raw counts.
  ax.set_ylabel('Density')
  ax.set_xlabel(metric_good_name)

  fig.savefig(f'{path_to_folder}/Outputs/{filename}.png')

In [None]:
# Histogram of the ratio metric (`smoothness_ratio`), saved as ratio_hist.png.
graph_plot("ratio", "Ratio", "ratio_hist", bins=20)

In [None]:
# Histogram of the decline metric (`smoothness_decline`), saved as decline_hist.png.
# The score is a Kendall tau, so it lies in [-1, 1]; the previous range of [0, 40]
# silently dropped every negative score.
graph_plot("decline", "Decline", "decline_hist", bins=5, range=[-1, 1])

In [None]:
# Histogram of the positive-tone difference (`smoothness_tone_pos`), saved as tone_pos_hist.png.
graph_plot("tone_pos", "Positive Tone", "tone_pos_hist", bins=5)

In [None]:
# Histogram of the negative-tone difference (`smoothness_tone_neg`), saved as tone_neg_hist.png.
graph_plot("tone_neg", "Negative Tone", "tone_neg_hist", bins=5)

#### Get the annotated data

In [None]:
# Conversation ids of the subset inspected below (presumably the hand-annotated
# conversations -- confirm against the annotation sheet).
annotated_ids = [
    '1c82d05c-19ce-4d2a-83db-a54021c9196d',
    '0203bb21-da17-416b-8e2f-018b99689616',
    'fffda3e6-7d99-4db8-aa12-16e99fa454c2',
    '826248c3-018b-4b56-8844-ef762f5b60cd',
    '11cb78ed-49fb-4634-8a7a-3c59109563b5',
    '33528414-6a77-4fde-a01a-aebbad5fc3d8',
    '2d7f1113-de9d-4e61-bdbe-38a9bd2a1121',
    '29b7edd5-d78d-4edb-bcce-4f6c9a166455',
    '2608d293-6af3-4f26-959c-e0f6a2597a37',
    '5d895bf7-4efd-4a5d-ad62-57fa820ad746',
]

In [None]:
# Print the ending of every annotated conversation.
# The loop variable is not called `id`, which would clobber the builtin `id()`
# for the rest of the notebook session.
for convo_id in annotated_ids:
  print_conversation_ending(corpus.get_conversation(convo_id))

In [None]:
# Print the smoothness scores of every annotated conversation.
# The loop variable is not called `id`, which would clobber the builtin `id()`.
for convo_id in annotated_ids:
  convo = corpus.get_conversation(convo_id)
  print(convo_id)
  print("Smoothness (ratio, the higher the number, the more smooth it will be): ", convo.meta['smoothness_ratio'])
  # The decline score is a Kendall tau, so higher means smoother, matching the labels used in the earlier cells.
  print("Smoothness (decline, the higher the number, the more smooth it will be): ", convo.meta['smoothness_decline'])
  print("Smoothness (pos tone, the lower the number, the more smooth it will be): ", convo.meta['smoothness_tone_pos'])
  print("Smoothness (neg tone, the lower the number, the more smooth it will be): ", convo.meta['smoothness_tone_neg'])
  print()