In [5]:
%pip install datasets convokit ipykernel
# The corpus is parsed with the spaCy model en_core_web_sm; install it first with: python -m spacy download en_core_web_sm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Download original dataset from huggingface

A detailed introduction to the dataset is available at https://huggingface.co/datasets/audreyeleven/MentalManip

There are three separate files in the MentalManip repo: mentalmanip_detailed.csv, mentalmanip_con.csv, and mentalmanip_maj.csv.

Here, we use the processed mentalmanip_con.csv version, which contains the final gold labels that the authors derived from the three annotators' results using the consensus agreement strategy.

In [2]:
from datasets import load_dataset
from tqdm import tqdm
from convokit import Corpus, Speaker, Utterance, Conversation
from collections import defaultdict
import os, re

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


TransformerDecoderModel requires ML dependencies. Run 'pip install convokit[llm]' to install them.
TransformerEncoderModel requires ML dependencies. Run 'pip install convokit[llm]' to install them.
An error occurred: No module named 'torch'




In [None]:
# Load the consensus-label configuration of MentalManip from Hugging Face.
dataset = load_dataset("audreyeleven/MentalManip", "mentalmanip_con") # or "mentalmanip_maj", "mentalmanip_detailed"

Some datasets params were ignored: ['license']. Make sure to use only valid params for the dataset builder and to have a up-to-date version of the `datasets` library.


In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'manipulative', 'technique', 'vulnerability'],
        num_rows: 2915
    })
})

In [12]:
# The DatasetDict shown above has a single "train" split; peek at its first record.
ds = dataset["train"]
row0 = ds[0]
print(row0.keys())
# The dialogue is one raw string with "PersonN:" prefixes marking each turn.
print(row0["dialogue"])

dict_keys(['id', 'dialogue', 'manipulative', 'technique', 'vulnerability'])
Person1: Jesus! Listen to this one: "Do you remember me? Airport shuttle, June 7th. You: striking redhead with yellow dress, pearl necklace, brown shoes. I was the bookish fellow in the green cardigan who helped you find your contact lens. Am I crazy, or did we have a moment?"
Person2: God, that's so pathetic. I bet she didn't even notice him.
Person1: I know. And he's like psychotically obsessing over every little detail.
Person2: We should call him and pretend to be the redhead.
Person1: Oh, we totally have to.


### Dataset pre-analysis

In [66]:
import pandas as pd

# Convert the split to a pandas DataFrame for quick descriptive statistics.
df = ds.to_pandas()

# Distribution of the `manipulative` label, ordered by label value.
print("Manipulative counts:")
print(df["manipulative"].value_counts().sort_index())

# split function
def split_and_flatten(series):
    """Flatten a column of comma-separated labels into one label per row.

    Missing entries are dropped, every remaining entry is converted to a
    string and split on commas, and each fragment is stripped of surrounding
    whitespace. Fragments that are empty after stripping are discarded.

    Returns a new ``pd.Series`` holding the individual labels in encounter
    order.
    """
    stripped_pieces = (
        piece.strip()
        for entry in series.dropna()
        for piece in str(entry).split(",")
    )
    return pd.Series([piece for piece in stripped_pieces if piece])

# Technique frequencies: a row may list several comma-separated techniques,
# so the column is flattened to one label per entry before counting.
print("\nTechnique counts:")
print(split_and_flatten(df["technique"]).value_counts())

# Vulnerability frequencies, flattened the same way.
print("\nVulnerability counts:")
print(split_and_flatten(df["vulnerability"]).value_counts())

Manipulative counts:
manipulative
0     899
1    2016
Name: count, dtype: int64

Technique counts:
Persuasion or Seduction    607
Shaming or Belittlement    384
Accusation                 361
Intimidation               321
Rationalization            213
Brandishing Anger          133
Denial                      87
Evasion                     83
Playing Victim Role         69
Feigning Innocence          58
Playing Servant Role        30
Name: count, dtype: int64

Vulnerability counts:
Dependency                  282
Low self-esteem             155
Naivete                      94
Over-responsibility          93
Over-intellectualization     46
Name: count, dtype: int64


In [67]:
# 1) sanity check: manipulative=1 but technique is null
m1_no_tech = df[(df["manipulative"] == 1) & (df["technique"].isna() | (df["technique"].str.strip() == ""))]
print("manipulative=1 but technique is NULL:", len(m1_no_tech))

# 2) manipulative=1 and technique >1
df["num_techniques"] = df["technique"].fillna("").apply(lambda x: len([t.strip() for t in str(x).split(",") if t.strip()]))
m1_multi = df[(df["manipulative"] == 1) & (df["num_techniques"] > 1)]
print("manipulative=1 with >1 techniques:", len(m1_multi))

manipulative=1 but technique is NULL: 268
manipulative=1 with >1 techniques: 543


### Function to convert each conversation to separate turns for further processing

In [15]:
PERSON_RE = re.compile(r"(Person\d+):\s*")

def parse_dialogue(raw: str):
    """Split a raw dialogue string into an ordered list of (speaker, text) turns.

    The dialogue is expected to mark each turn with a ``PersonN:`` prefix,
    e.g. ``"Person1: Hi.\\nPerson2: Hello."``.

    Args:
        raw: The raw dialogue string. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of ``(speaker_label, text)`` tuples in dialogue order. Text is
        stripped of surrounding whitespace; turns whose text is empty after
        stripping are omitted. Any text that appears before the first
        ``PersonN:`` marker is attributed to ``"Person1"``.
    """
    if not raw:
        return []

    # Because PERSON_RE has exactly one capturing group, re.split returns
    # [leading_text, label_1, text_1, label_2, text_2, ...]. Speaker labels are
    # therefore identified by position (odd indices), not by inspecting the
    # chunk's content: a turn such as "Personally, I disagree" also starts
    # with "Person" and must not be mistaken for a label.
    parts = PERSON_RE.split(raw)
    turns = []

    # Text preceding the first speaker marker has no label; default to Person1.
    leading_text = parts[0].strip()
    if leading_text:
        turns.append(("Person1", leading_text))

    for speaker_label, chunk in zip(parts[1::2], parts[2::2]):
        text = chunk.strip()
        if text:
            turns.append((speaker_label, text))
    return turns

In [16]:
turns = parse_dialogue(row0["dialogue"])
print(turns)

[('Person1', 'Jesus! Listen to this one: "Do you remember me? Airport shuttle, June 7th. You: striking redhead with yellow dress, pearl necklace, brown shoes. I was the bookish fellow in the green cardigan who helped you find your contact lens. Am I crazy, or did we have a moment?"'), ('Person2', "God, that's so pathetic. I bet she didn't even notice him."), ('Person1', "I know. And he's like psychotically obsessing over every little detail."), ('Person2', 'We should call him and pretend to be the redhead.'), ('Person1', 'Oh, we totally have to.')]


### Function to build speakers and utterances

Note: In the dataset, speakers within each conversation are labeled generically as Person1, Person2, etc. To avoid conflating these roles across different conversations, we treat them as distinct entities and prepend the conversation ID to each speaker ID, thereby uniquely associating speakers with their respective conversations.

In [None]:
def build_speakers(row_id, turns):
    """Create one Speaker per distinct speaker label in a single conversation.

    Speaker ids have the form ``"<row_id>__<label>"`` so that the generic
    labels (Person1, Person2, ...) of different conversations never collide.
    The original label is kept in the speaker's ``role_label`` metadata.

    Args:
        row_id: Identifier of the conversation (dataset row).
        turns: Iterable of ``(speaker_label, text)`` pairs.

    Returns:
        Dict mapping speaker id to Speaker, in order of first appearance.
    """
    # dict.fromkeys de-duplicates the labels while keeping first-seen order,
    # so exactly one Speaker is constructed per distinct label.
    distinct_labels = dict.fromkeys(label for label, _ in turns)
    return {
        f"{row_id}__{label}": Speaker(
            id=f"{row_id}__{label}", meta={"role_label": label}
        )
        for label in distinct_labels
    }

In [20]:
def build_utterances(row_id, turns, speakers):
    """Turn the parsed turns of one conversation into a list of Utterances.

    Utterance ids are ``"<row_id>__u<turn index>"``. Every utterance uses the
    first utterance's id as its ``conversation_id``, replies to the utterance
    immediately before it (``None`` for the first), and uses its turn index as
    the timestamp.

    Args:
        row_id: Identifier of the conversation (dataset row).
        turns: Sequence of ``(speaker_label, text)`` pairs in dialogue order.
        speakers: Mapping from ``"<row_id>__<label>"`` to Speaker objects.

    Returns:
        List of Utterance objects in dialogue order.
    """
    def utt_key(position):
        return f"{row_id}__u{position}"

    conversation_root = utt_key(0)
    return [
        Utterance(
            id=utt_key(position),
            text=text,
            speaker=speakers[f"{row_id}__{label}"],
            conversation_id=conversation_root,
            # The first turn has no predecessor; later turns reply to the previous one.
            reply_to=utt_key(position - 1) if position else None,
            timestamp=position,
        )
        for position, (label, text) in enumerate(turns)
    ]

### Build Speakers and Utterances

Number of Speakers: 5830

Number of Utterances: 19232

Number of Conversations: 2915


In [21]:
all_utts, all_speakers = [], {}

for row in ds:
    row_id = str(row["id"])
    turns = parse_dialogue(row["dialogue"])

    # Speakers: merge this conversation's speakers into the global table,
    # keeping any entry that is already registered under the same id.
    local_speakers = build_speakers(row_id, turns)
    for sid, spk in local_speakers.items():
        if sid not in all_speakers:
            all_speakers[sid] = spk

    # Utterances: built against the global table so they share Speaker objects.
    all_utts += build_utterances(row_id, turns, all_speakers)


In [22]:
print("number of speakers in the data = {}".format(len(all_speakers)))

number of speakers in the data = 5830


In [39]:
print("Total number of utterances = {}".format(len(all_utts)))
print(all_utts[0].text)
print(all_utts[0].reply_to)
print(all_utts[0].id)


Total number of utterances = 19232
Jesus! Listen to this one: "Do you remember me? Airport shuttle, June 7th. You: striking redhead with yellow dress, pearl necklace, brown shoes. I was the bookish fellow in the green cardigan who helped you find your contact lens. Am I crazy, or did we have a moment?"
None
85514414__u0


In [41]:
print(all_utts[1].text)
print(all_utts[1].reply_to)
print(all_utts[1].id)

God, that's so pathetic. I bet she didn't even notice him.
85514414__u0
85514414__u1


### Build corpus from utterances list

In [42]:
# Assemble the corpus from the flat utterance list.
# NOTE(review): conversations are presumably derived from each utterance's conversation_id — confirm against ConvoKit docs.
corpus = Corpus(utterances=all_utts)
print("number of conversations in the dataset = {}".format(len(corpus.get_conversation_ids())))

No configuration file found at /Users/byc324/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem
number of conversations in the dataset = 2915


In [45]:
# Show the utterance ids of the first five conversations as a spot check.
convo_ids = corpus.get_conversation_ids()
for convo_idx in convo_ids[:5]:
    print("sample conversation {}:".format(convo_idx))
    print(corpus.get_conversation(convo_idx).get_utterance_ids())

sample conversation 85514414__u0:
['85514414__u0', '85514414__u1', '85514414__u2', '85514414__u3', '85514414__u4']
sample conversation 85514415__u0:
['85514415__u0', '85514415__u1', '85514415__u2', '85514415__u3', '85514415__u4']
sample conversation 85514416__u0:
['85514416__u0', '85514416__u1', '85514416__u2', '85514416__u3', '85514416__u4', '85514416__u5', '85514416__u6']
sample conversation 85514417__u0:
['85514417__u0', '85514417__u1', '85514417__u2', '85514417__u3', '85514417__u4', '85514417__u5', '85514417__u6', '85514417__u7']
sample conversation 85514418__u0:
['85514418__u0', '85514418__u1', '85514418__u2', '85514418__u3', '85514418__u4', '85514418__u5', '85514418__u6']


### Add metadata for the conversations in the corpus

In [48]:
def _split_labels(raw):
    """Split a comma-separated label string into a list of clean labels.

    Missing values (``None`` or an empty string) yield ``[]``. Each label is
    stripped of surrounding whitespace and empty fragments are dropped, which
    matches how ``split_and_flatten`` treats the same columns.
    """
    if not raw:
        return []
    return [label.strip() for label in str(raw).split(",") if label.strip()]


# Attach the gold labels of each dataset row to its conversation.
for row in ds:
    row_id = str(row["id"])
    # Conversation ids are the id of the conversation's first utterance.
    root_id = f"{row_id}__u0"
    conv_meta = {
        "manipulative": int(row["manipulative"]),
        "technique": _split_labels(row.get("technique")),
        "vulnerability": _split_labels(row.get("vulnerability")),
    }
    convo = corpus.get_conversation(root_id)
    convo.meta.update(conv_meta)

In [50]:
corpus.get_conversation("85514447__u0").meta

ConvoKitMeta({'manipulative': 1, 'technique': ['Rationalization', 'Accusation', 'Shaming or Belittlement'], 'vulnerability': ['Low self-esteem']})

In [51]:
corpus.meta['name'] = "MentalManip_con"

### Parse the corpus

In [52]:
from convokit.text_processing import TextParser

In [54]:
# verbosity=10000: the output below shows a progress line at 10000 utterances and at the end.
parser = TextParser(verbosity=10000)
# The result is read back in the next cell through each utterance's 'parsed' metadata.
parsed_corpus = parser.transform(corpus)

10000/19232 utterances processed
19232/19232 utterances processed


In [55]:
parsed_corpus.get_utterance('85514417__u0').retrieve_meta('parsed')

[{'rt': 1,
  'toks': [{'tok': 'All', 'tag': 'RB', 'dep': 'advmod', 'up': 1, 'dn': []},
   {'tok': 'right', 'tag': 'RB', 'dep': 'ROOT', 'dn': [0, 2]},
   {'tok': '.', 'tag': '.', 'dep': 'punct', 'up': 1, 'dn': []}]},
 {'rt': 0,
  'toks': [{'tok': 'Tell', 'tag': 'VB', 'dep': 'ROOT', 'dn': [1, 3, 6]},
   {'tok': "'em", 'tag': 'PRP', 'dep': 'dobj', 'up': 0, 'dn': []},
   {'tok': 'to', 'tag': 'TO', 'dep': 'aux', 'up': 3, 'dn': []},
   {'tok': 'shoot', 'tag': 'VB', 'dep': 'xcomp', 'up': 0, 'dn': [2, 5]},
   {'tok': 'to', 'tag': 'TO', 'dep': 'aux', 'up': 5, 'dn': []},
   {'tok': 'kill', 'tag': 'VB', 'dep': 'xcomp', 'up': 3, 'dn': [4]},
   {'tok': '.', 'tag': '.', 'dep': 'punct', 'up': 0, 'dn': []}]}]

In [56]:
parsed_corpus.get_utterance('85514417__u0').text

"All right. Tell 'em to shoot to kill."

### Save the corpus

In [70]:
# Save the parsed corpus under ./mentalmanip-corpus, the path the load cell reads from.
parsed_corpus.dump("mentalmanip-corpus", base_path=".")

### Load Corpus

In [3]:
# Reload from disk. Note that this rebinds `corpus` to the saved corpus.
corpus = Corpus(filename="./mentalmanip-corpus")
# Conversation count; expected to equal the 2915 rows of the source dataset.
print(len(list(corpus.iter_conversations())))

2915


In [72]:
corpus.print_summary_stats()

Number of Speakers: 5830
Number of Utterances: 19232
Number of Conversations: 2915


In [4]:
# NOTE(review): no random seed is set in the visible cells, so the sampled
# conversation presumably differs between runs — confirm if reproducibility matters.
convo = corpus.random_conversation()
print(convo)

Conversation('id': '85514573__u0', 'utterances': ['85514573__u0', '85514573__u1', '85514573__u2', '85514573__u3', '85514573__u4', '85514573__u5'], 'meta': ConvoKitMeta({'manipulative': 1, 'technique': ['Persuasion or Seduction'], 'vulnerability': ['Low self-esteem']}))


In [76]:
convo.print_conversation_structure()

85516233__Person1
    85516233__Person2


In [74]:
speaker = corpus.random_speaker()
print(speaker)

Speaker(id: '85515593__Person2', vectors: [], meta: ConvoKitMeta({'role_label': 'Person2'}))


In [75]:
# Print only the first utterance yielded by the iterator, then stop.
for utt in corpus.iter_utterances():
    print(utt.text)
    break

Jesus! Listen to this one: "Do you remember me? Airport shuttle, June 7th. You: striking redhead with yellow dress, pearl necklace, brown shoes. I was the bookish fellow in the green cardigan who helped you find your contact lens. Am I crazy, or did we have a moment?"
