# Converting DeliData Dataset to ConvoKit format

Notebook Contributors: Yash Chatha, Laerdon Kim

This notebook is to help people working with the DeliData Corpus to quickly transform it into ConvoKit format.
Details about the construction of the corpus are available here:

In [None]:
!pip install convokit

In [2]:
import pandas as pd
from tqdm import tqdm
from convokit import Corpus, Speaker, Utterance
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import glob

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/datasets/delidata1_rc3/combined

/content/drive/MyDrive/datasets/delidata1_rc3/combined


## Combining files into one Dataframe

In [5]:
files = glob.glob('*')
dfs = []
for file in files:
    df = pd.read_csv(file, sep='\t')
    dfs.append(df)

In [6]:
corpus_raw = pd.concat(dfs)

In [7]:
print(corpus_raw.shape)
count = df['message_id'].str.contains('-1').sum()
print(count)

(17610, 13)
3


In [8]:
corpus_raw = corpus_raw.reset_index(drop=True)

## Processing speakers

Note: there is no significant metadata to include at the speaker level—so, we leave these fields empty.



In [9]:
all_speakers = list(set(corpus_raw['origin'].to_list()))

In [10]:
corpus_speakers = {}
for speaker_id in all_speakers:
    corpus_speakers.update({speaker_id : Speaker(id = speaker_id, meta = {})})

## Processing utterances

In [11]:
import os

In [12]:
desired_metadata_fields = ['annotation_type',
                           'annotation_target',
                           'annotation_additional',
                           'message_type',
                           'original_text']

corpus_utterances = []
for index, row in tqdm(corpus_raw.iterrows()):
    current_id = row['message_id']
    current_meta = {}
    for k, v in row.items():
        if k in desired_metadata_fields:
            current_meta.update({k : v})

    current_speaker = row['origin']
    current_conversation_id = row['group_id']
    current_text = row['clean_text']

    utterance = Utterance(id = current_id,
                          speaker = corpus_speakers[current_speaker],
                          conversation_id = str(current_conversation_id),
                          text = str(current_text),
                          meta = current_meta)

    corpus_utterances.append(utterance)

17610it [00:02, 7505.58it/s] 


In [13]:
corpus_object = Corpus(utterances=corpus_utterances)

No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
default_backend: mem


# Creating Conversation metadata

The following attributes provided in the DeliData dataset feature are conversation-wide.

```
team_performance
performance_change
sol_tracker_message
sol_tracker_all
```



In [14]:
conversation_metadata_headers = ['team_performance',
                                 'performance_change',
                                 'sol_tracker_message',
                                 'sol_tracker_all',]

In [15]:
for convo in corpus_object.iter_conversations():
    convo_id = convo.id
    convo_row = corpus_raw[corpus_raw['group_id'] == convo_id].iloc[0]
    metadata = {}
    for field_name in conversation_metadata_headers:
        field_value = convo_row[field_name]
        metadata.update({field_name : field_value})
    convo.meta = metadata

In [16]:
SAVE_PATH = '/content/drive/MyDrive/datasets/delidata1_rc3/'
corpus_object.dump(f"delidata-corpus", base_path=SAVE_PATH)

from convokit import meta_index
meta_index(filename = f"{SAVE_PATH}/delidata-corpus")

{'utterances-index': {'message_type': ["<class 'str'>"],
  'original_text': ["<class 'str'>", "<class 'float'>"],
  'annotation_type': ["<class 'float'>", "<class 'str'>"],
  'annotation_target': ["<class 'float'>", "<class 'str'>"],
  'annotation_additional': ["<class 'float'>", "<class 'str'>"]},
 'speakers-index': {},
 'conversations-index': {'team_performance': ["<class 'numpy.float64'>"],
  'performance_change': ["<class 'numpy.float64'>"],
  'sol_tracker_message': ["<class 'float'>"],
  'sol_tracker_all': ["<class 'str'>"]},
 'overall-index': {},
 'version': 1,
 'vectors': []}