# Clean and Wrangle Data

## Create Dataframe of Compiled Transcripts

Output a .csv file with all the text and metadata for each turn under `../output_data/uncooked/`

In [39]:
from glob import glob
import numpy as np
import pandas as pd
import re

# <!-- Compile all input_data --> #
def compile_transcripts():
    """Compile every transcript under ``../input_data/NR/`` into one dataframe.

    Each ``*.txt`` file is parsed twice with the same speaker/timestamp
    pattern: once as a column delimiter (to isolate the spoken text) and once
    with ``re.findall`` (to isolate the speaker labels and timestamps). The
    two results are then joined side by side, by position.

    Returns:
        pandas.DataFrame with the columns ``transcript_number``,
        ``speaker_turn``, ``timestamp`` and ``conversation_text``, one row per
        turn. The index restarts at 0 for every transcript.
    """
    import os  # local import: only needed to take the file name off the path

    # <!-- Speaker label followed by "(MM:SS):", which opens every turn --> #
    turn_marker = r"S\w+[(\s)(\s\d\s)]+\(\d{2}:\d{2}\):"
    column_order = ['transcript_number', 'speaker_turn', 'timestamp', 'conversation_text']
    compiled_parts = []

    # <!-- Loop over all transcripts to extract speaker turn, timestamp, convo text, and transcript number --> #
    for transcript in sorted(glob('../input_data/NR/*.txt')):
        # <!-- Transcript number = first four characters of the file name --> #
        # (str.lstrip() removes a *set of characters*, not a prefix, so it would
        # also eat leading characters of the file name itself.)
        transcript_num = os.path.basename(transcript)[0:4]

        # <!-- Use the turn marker as delimiter to isolate conversation text --> #
        frag_df = pd.read_csv(transcript,
                              sep=turn_marker,
                              engine="python").rename(
            columns={"Unnamed: 0": "conversation_text", "Unnamed: 1": "to_be_dropped"}).drop(
            columns=['to_be_dropped']).dropna()

        # <!-- Find the turn markers themselves and split them into speaker + timestamp --> #
        with open(transcript, 'r', encoding="utf-8") as transcript_file:
            plain_text = transcript_file.read().replace('\n', ' ')
        timestamps = re.findall(turn_marker, plain_text)
        timestamps_df = pd.DataFrame(timestamps).iloc[:, 0].str.split(
            r" \(", expand=True).rename(
            columns={0: "speaker_turn", 1: "timestamp"})
        timestamps_df["timestamp"] = timestamps_df["timestamp"].str.rstrip('):')

        # NOTE(review): the two frames are paired purely by position; this assumes
        # every marker is followed by exactly one non-empty line of text — confirm.
        empathy_compiled_part_df = pd.concat([timestamps_df.reset_index(drop=True),
                                              frag_df.reset_index(drop=True)], axis=1)
        empathy_compiled_part_df['transcript_number'] = transcript_num
        compiled_parts.append(empathy_compiled_part_df)

    if not compiled_parts:
        return pd.DataFrame(columns=column_order)

    # Concatenate once at the end instead of growing the frame inside the loop
    return pd.concat(compiled_parts).reindex(column_order, axis=1)


# <!-- Get speaker and overall turn counts --> #
def compile_transcript_turns(df):
    """Split bracketed annotations off the spoken text, label the condition, write to .csv.

    Args:
        df: dataframe of turns as returned by ``compile_transcripts``. It must
            have ``transcript_number`` and ``conversation_text`` columns and is
            not modified.

    Returns:
        None. The result is written to
        ``../output_data/uncooked/S0_raw_transcripts_compiled.csv``.
    """
    leading_columns = ["speaker_turn", "timestamp", "conversation_text", "transcript_number"]

    # <!-- Stack the turns transcript by transcript, in ascending transcript order --> #
    per_transcript = [df.loc[df['transcript_number'] == transcript]
                      for transcript in sorted(set(df['transcript_number'].to_list()))]
    if per_transcript:
        raw_transcripts_compiled_df = pd.concat(per_transcript)
    else:
        raw_transcripts_compiled_df = pd.DataFrame(columns=leading_columns)

    # <!-- Keep the four core columns first, followed by any extra columns df carried --> #
    trailing_columns = [column for column in raw_transcripts_compiled_df.columns
                        if column not in leading_columns]
    raw_transcripts_compiled_df = raw_transcripts_compiled_df.reindex(
        columns=leading_columns + trailing_columns)

    # <!-- Change all "[laugh-]" segments to "[laugh]" --> #
    raw_transcripts_compiled_df['conversation_text'] = raw_transcripts_compiled_df['conversation_text'].str.replace(
        r'\[([Ll]augh)[A-Za-z]*\]',
        '[laugh]', regex=True)

    # <!-- Add all annotations set off in square brackets to a new column, "other_text" --> #
    raw_transcripts_compiled_df['other_text'] = raw_transcripts_compiled_df['conversation_text'].str.findall(
        r'\[(.*?)\]')

    # <!-- Drop all annotations set off in square brackets from the "conversation_text" column --> #
    raw_transcripts_compiled_df['conversation_text'] = raw_transcripts_compiled_df['conversation_text'].str.replace(
        r'\s*\[(.*?)\]',  # Make sure you catch the space before it—if it's there
        '', regex=True)

    # <!-- Even transcript numbers are labelled "control", odd ones "empathy" --> #
    raw_transcripts_compiled_df['condition'] = np.where(
        raw_transcripts_compiled_df['transcript_number'].astype(float) % 2 == 0, "control", "empathy")

    # <!-- Write to .csv file --> #
    raw_transcripts_compiled_df.to_csv("../output_data/uncooked/S0_raw_transcripts_compiled.csv", index=False)


# <!-- Compile the raw transcripts, then split off annotations and write the uncooked .csv --> #
if __name__ == '__main__':
    empathy_precompiled_df = compile_transcripts()
    compile_transcript_turns(empathy_precompiled_df)

## Backchanneling

There are *two* different versions of the dataset:
1. No laughs: any nonverbal contribution to the conversation (e.g., a laugh) is moved into a separate column, and the turns immediately preceding and following it are spliced together
2. No laughs, no backchanneling: any backchanneling terms are likewise moved into an additional separate column, and the turns immediately preceding and following them are spliced together

## Import Packages + Data

In [40]:
# <!-- Data Wrangling --> #
import ast
import pandas as pd
import numpy as np
from itertools import accumulate
import re

# <!-- NLP --> #
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
nlp = spacy.load("en_core_web_sm")

# <!-- Read in Data --> #
transcripts_compiled_df = pd.read_csv("../output_data/uncooked/S0_raw_transcripts_compiled.csv")

## Clean Up Punctuation

### Delete `"."` Left Behind From Extracting Text Set Off in `"[]"`

In [41]:
# Flag turns with no usable text (missing, empty, or nothing but periods) with a
# placeholder string so that the .str operations below never meet a NaN.
turn_text = transcripts_compiled_df['conversation_text']
is_blank = turn_text.isna() | (turn_text == "")
transcripts_compiled_df.loc[is_blank, 'conversation_text'] = "np.nan_filler"

is_only_periods = transcripts_compiled_df['conversation_text'].str.contains(r"^[.]+$", regex=True)
transcripts_compiled_df.loc[is_only_periods, 'conversation_text'] = "np.nan_filler"

### Strip Text of Punctuation, Half-Spoken Words and Whitespace

In [42]:
# Literal clean-ups, applied strictly in this order (later entries tidy up what
# earlier ones leave behind).
literal_replacements = [
    ("...", " "),
    (", .", "."),
    (" . ", ". "),
    ("..", "."),
    (",,", ","),
    ("  ", " "),
    ('"', ""),
]
cleaned_text = transcripts_compiled_df["conversation_text"]
for old_fragment, new_fragment in literal_replacements:
    cleaned_text = cleaned_text.str.replace(old_fragment, new_fragment, regex=False)

# Drop half-spoken words ("wor- "), and not just the ones at the end of a line.
# Raw string: "\w" and "\s" are invalid escapes in a plain string literal.
cleaned_text = cleaned_text.str.replace(r"\w+-\s", "", regex=True)
cleaned_text = cleaned_text.str.strip("-")
transcripts_compiled_df["conversation_text"] = cleaned_text.str.strip()

Use `.loc` to strip the leading ". " or ", " and the trailing "," from the rows that have them

In [43]:
# Each mask is computed once, after the previous step has been applied.
# Note that lstrip/rstrip remove every leading/trailing character from the given set.
has_leading_period = transcripts_compiled_df["conversation_text"].str.startswith(". ")
transcripts_compiled_df.loc[has_leading_period, 'conversation_text'] = (
    transcripts_compiled_df.loc[has_leading_period, 'conversation_text'].str.lstrip(". "))

has_leading_comma = transcripts_compiled_df["conversation_text"].str.startswith(", ")
transcripts_compiled_df.loc[has_leading_comma, 'conversation_text'] = (
    transcripts_compiled_df.loc[has_leading_comma, 'conversation_text'].str.lstrip(", "))

has_trailing_comma = transcripts_compiled_df["conversation_text"].str.endswith(",")
transcripts_compiled_df.loc[has_trailing_comma, 'conversation_text'] = (
    transcripts_compiled_df.loc[has_trailing_comma, 'conversation_text'].str.rstrip(","))

In [44]:
# Second pass: the clean-up above can leave turns empty, so flag them again
# (missing, empty, a bare pair of quotes, or nothing but periods).
cleaned_turn_text = transcripts_compiled_df['conversation_text']
is_blank_after_cleanup = cleaned_turn_text.isna() | cleaned_turn_text.isin(['', "''"])
transcripts_compiled_df.loc[is_blank_after_cleanup, 'conversation_text'] = "np.nan_filler"

is_only_periods_after_cleanup = transcripts_compiled_df['conversation_text'].str.contains(r"^[.]+$", regex=True)
transcripts_compiled_df.loc[is_only_periods_after_cleanup, 'conversation_text'] = "np.nan_filler"

### Drop Null Responses

In [45]:
# Keep only the turns that still hold text, and renumber the rows from 0
has_text = transcripts_compiled_df['conversation_text'] != "np.nan_filler"
transcripts_compiled_df = transcripts_compiled_df.loc[has_text].reset_index(drop=True)
transcripts_compiled_df

Unnamed: 0,speaker_turn,timestamp,conversation_text,transcript_number,other_text,condition
0,Speaker 1,00:14,"All right. Um, so you just click the next butt...",1003,[],empathy
1,Speaker 2,00:19,"Uh, I guess so, yeah. Let's see. Uh, okay. Yea...",1003,[],empathy
2,Speaker 1,00:33,"I don't, are we allowed to say what our name is?",1003,[],empathy
3,Speaker 2,00:34,"Oh, are we not allowed to? I don't know. Maybe...",1003,[],empathy
4,Speaker 1,00:38,I have no idea. Okay. Let's just do benefit of...,1003,[],empathy
...,...,...,...,...,...,...
6448,Speaker 2,20:27,"But maybe it'll come, like, 10 years down the ...",1049,[],empathy
6449,Speaker 1,20:30,Yeah. And maybe you don't need that moment.,1049,[],empathy
6450,Speaker 2,20:31,Yeah.,1049,[],empathy
6451,Speaker 1,20:32,Because like you're not as aligned in somethin...,1049,[],empathy


In [46]:
# Speaker and transcript of the row directly above each turn
previous_speaker = transcripts_compiled_df["speaker_turn"].shift()
previous_transcript = transcripts_compiled_df["transcript_number"].shift()
transcripts_compiled_df["speaker_turn_shifted"] = previous_speaker
transcripts_compiled_df["transcript_number_shifted"] = previous_transcript

# A turn is a "repeat" when the row above has the same speaker in the same transcript
same_speaker_as_previous = previous_speaker == transcripts_compiled_df["speaker_turn"]
same_transcript_as_previous = previous_transcript == transcripts_compiled_df["transcript_number"]
transcripts_compiled_df["repeat_turn_check"] = np.where(
    same_speaker_as_previous & same_transcript_as_previous, "repeat", "no_repeat")
transcripts_compiled_df

Unnamed: 0,speaker_turn,timestamp,conversation_text,transcript_number,other_text,condition,speaker_turn_shifted,transcript_number_shifted,repeat_turn_check
0,Speaker 1,00:14,"All right. Um, so you just click the next butt...",1003,[],empathy,,,no_repeat
1,Speaker 2,00:19,"Uh, I guess so, yeah. Let's see. Uh, okay. Yea...",1003,[],empathy,Speaker 1,1003.0,no_repeat
2,Speaker 1,00:33,"I don't, are we allowed to say what our name is?",1003,[],empathy,Speaker 2,1003.0,no_repeat
3,Speaker 2,00:34,"Oh, are we not allowed to? I don't know. Maybe...",1003,[],empathy,Speaker 1,1003.0,no_repeat
4,Speaker 1,00:38,I have no idea. Okay. Let's just do benefit of...,1003,[],empathy,Speaker 2,1003.0,no_repeat
...,...,...,...,...,...,...,...,...,...
6448,Speaker 2,20:27,"But maybe it'll come, like, 10 years down the ...",1049,[],empathy,Speaker 1,1049.0,no_repeat
6449,Speaker 1,20:30,Yeah. And maybe you don't need that moment.,1049,[],empathy,Speaker 2,1049.0,no_repeat
6450,Speaker 2,20:31,Yeah.,1049,[],empathy,Speaker 1,1049.0,no_repeat
6451,Speaker 1,20:32,Because like you're not as aligned in somethin...,1049,[],empathy,Speaker 2,1049.0,no_repeat


#### Splice together rows of text from same speaker punctuated by laugh 

In [47]:
is_repeat_turn = transcripts_compiled_df['repeat_turn_check'] == "repeat"
repeat_idx = transcripts_compiled_df.index[is_repeat_turn]

# Every non-repeat row opens a new run; the repeat rows directly below it share its run id.
# Joining per run (rather than pairwise onto the row above) keeps the text of runs with
# three or more consecutive rows, where the row above is itself a repeat row.
turn_run_id = (~is_repeat_turn).cumsum()
text_by_run = transcripts_compiled_df['conversation_text'].groupby(turn_run_id).agg(" ".join)

# Write each run's full text onto the run's first row
is_run_head = ~is_repeat_turn
transcripts_compiled_df.loc[is_run_head, 'conversation_text'] = turn_run_id[is_run_head].map(text_by_run)

transcripts_compiled_df.loc[repeat_idx - 1, 'conversation_text']

13      Uh, sure. Let's see. Uh, um, let's see, wake u...
38      Uh, sure, yeah. Um, let's see. I think this wa...
66      Uh, yeah. I'd like to play baseball. It's kind...
70      And be like, All right, guess I'll go work on ...
161     Yeah. They kinda real deep. I don't know how, ...
                              ...                        
6378    Um, I think I want to eventually do a PhD. Um,...
6388    Because I think I came in with the philosophy ...
6415    So like, um, I don't I feel like there for me,...
6418    So hearing you talk about how, like, for you a...
6440    Yeah. No, uh, what you're saying about guilt i...
Name: conversation_text, Length: 184, dtype: object

#### Drop repeat speaker rows from which you have already extracted text

In [48]:
# Remove the repeat rows by label, in place
transcripts_compiled_df.drop(index=repeat_idx, inplace=True)

In [49]:
# Sanity check: select any rows still flagged "repeat"
transcripts_compiled_df[transcripts_compiled_df["repeat_turn_check"] == "repeat"]  # No more repeats!

Unnamed: 0,speaker_turn,timestamp,conversation_text,transcript_number,other_text,condition,speaker_turn_shifted,transcript_number_shifted,repeat_turn_check


In [50]:
# Remove, in place, the three helper columns that were only used to detect repeat turns
transcripts_compiled_df.drop(columns=['speaker_turn_shifted', 'transcript_number_shifted', 'repeat_turn_check'], inplace=True)  # We also don't need these vestigial columns

In [51]:
# reset_index() returns a new frame instead of modifying in place, so rebind the name;
# otherwise the index keeps the gaps left by the rows dropped above.
transcripts_compiled_df = transcripts_compiled_df.reset_index(drop=True)

# <!-- Get speaker and overall turn counts --> #
transcripts_compiled_df.insert(
    1, 'overall_turn_count', transcripts_compiled_df.groupby(
    'transcript_number').cumcount() + 1)  # +1 to reflect next turn of the entire conversation
transcripts_compiled_df.insert(
    2, 'speaker_turn_count', transcripts_compiled_df.groupby(
    ['speaker_turn', 'transcript_number']).cumcount() + 1)  # +1 to reflect next turn of the specific participant

### Run Through Pre-Trained Pipeline

In [52]:
# Run every turn's text through the loaded pipeline object and keep the result per row
transcripts_compiled_df["conversation_text_nlp_vectors"] = transcripts_compiled_df["conversation_text"].apply(nlp)

#### Create Columns for NLP Attributes

In [53]:
# Output column -> token attribute collected (as a list, one entry per token) for every turn
token_attribute_by_column = {
    "token_lemma": "lemma_",
    "token_pos": "pos_",
    "token_tag": "tag_",
    "token_dep": "dep_",
    "token_shape": "shape_",
    "token_is_alpha": "is_alpha",
    "token_is_stop": "is_stop",
}
for column_name, attribute_name in token_attribute_by_column.items():
    transcripts_compiled_df[column_name] = transcripts_compiled_df["conversation_text_nlp_vectors"].apply(
        lambda doc, attribute_name=attribute_name: [getattr(token, attribute_name) for token in doc])

#### Length of Responses With and Without Punctuation

In [54]:
# Number of tokens per turn, with and without the tokens tagged "PUNCT"
transcripts_compiled_df["token_lemma_len"] = transcripts_compiled_df["token_lemma"].map(len)
transcripts_compiled_df["token_lemma_len_no_punct"] = transcripts_compiled_df["token_pos"].apply(
    lambda pos_tags: sum(1 for pos_tag in pos_tags if pos_tag != "PUNCT"))

Convert timestamps to `datetime` format

In [55]:
transcripts_compiled_df['timestamp'] = pd.to_datetime(transcripts_compiled_df['timestamp'], format='%M:%S')

# Time from each turn to the next turn of the *same* transcript. Shifting within each
# transcript keeps one conversation's last turn from being measured against the first
# turn of the next conversation; those last turns get NaN instead.
next_turn_timestamp = transcripts_compiled_df.groupby('transcript_number')['timestamp'].shift(-1)
time_to_next_turn = next_turn_timestamp - transcripts_compiled_df['timestamp']

# The str[7:] slice drops the leading "0 days " of the timedelta's string form
transcripts_compiled_df.insert(
    4, 'timestamp_delta',
    time_to_next_turn.astype(str).str[7:].where(time_to_next_turn.notna()))

## 1. No Laughs Only

In [56]:
# Write the processed turns to the "cooked" output folder, leaving out the index column
transcripts_compiled_df.to_csv("../output_data/cooked/final_dataset.csv", index=False)

## Convert Notebook to HTML

In [4]:
import os
# Shell out to nbconvert to render this notebook as HTML; the value returned is the command's exit status
os.system('jupyter nbconvert --to html clean-wrangle.ipynb')

[NbConvertApp] Converting notebook clean-wrangle.ipynb to html
[NbConvertApp] Writing 807806 bytes to clean-wrangle.html


0