In [20]:
import json
import os 
import pandas as pd 
import re

label2int = {
    '1-RAPPORT': 0,
    '2-NEGOTIATE': 1,
    '3-EMOTION': 2,
    '4-LOGIC': 3,
    '5-AUTHORITY': 4,
    '6-SOCIAL': 5,
    '7-PRESSURE': 6,
    '8-NO-PERSUASION': 7
}

In [100]:
def jsonl_to_transcript(path):
    jsonl = open_jsonl(path)
    transcripts = []
    for i in jsonl:
        transcript = process_json(i)
        for i in transcript:
            transcripts.append(i)
    return pd.DataFrame(transcripts)

import numpy as np

def dict_to_prob_vector(input_dict, label2idx):
    total_labels = len(label2idx)
    total_counts = sum(input_dict.values())
    prob_vector = np.zeros(total_labels)
    for label, count in input_dict.items():
        if label not in label2idx:
            continue
        label_index = label2idx[label]
        prob_vector[label_index] = count / total_counts
    
    return prob_vector


def open_jsonl(jsl_path):

    with open(jsl_path) as file:
        jsl = file.read().splitlines()
        jsonl = [json.loads(i) for i in jsl]

    return jsonl

def process_json(json):
    text, p_maps = preprocess_text(json['data'].strip())
    labels = gen_label_dic(json['label'])
    new_js = []
    new_js = {'text' : text}
    for key, prob in zip(label2int.keys(), dict_to_prob_vector(labels, label2int)):
        new_js[key] = prob
        
    # for label, count in labels.items():

    #     # can possibly just use a multilabel situation here 
    # new_j = {'text' : text,
    #          'label_prob' : label,
    #          'label_count' : count,
    #          'avg_label' : count/sum(labels.values()),
    #          'persuader_vec': p_maps[0],
    #          'persuadee_vec' : p_maps[1]}
    #     new_js.append(new_j)
    return [new_js]

def gen_pmaps(tokens):

    # add script here that tags persuader's utterances 
    persuader_map = [1 if i == '<persuader>' else 0 for i in tokens]
    persuadee_map = [1 if i == '<persuadee>' else 0 for i in tokens]
    return persuader_map, persuadee_map
    
def gen_label_dic(labels):
    
    label_count = {}
    
    for i in labels:
        i = i[-1]
        if i != '8-NO-PERSUASION':
            i = '-'.join(i.split('-')[:-1])
        if i not in label_count:
            label_count[i] = 1
        else:
            label_count[i] += 1
    return label_count

def preprocess_text(text):
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'Persuader: ', '<persuader> ', text)
    text = re.sub(r'Persuadee: ', '<persuadee> ', text)
    text = text.split(' ')
    persuader, persuadee = gen_pmaps(text)
    text = ' '.join(text)
    return text, (persuader, persuadee)

In [101]:
jsl_folder = '../../data/phase_1_data/5-transcripts/'

In [102]:
def gen_df(jsl_folder):
    df_list = []
    for p in os.listdir(jsl_folder):
        if p[-5:] != 'jsonl':
            continue 
        jsl_path = jsl_folder + p
        df_list.append(jsonl_to_transcript(jsl_path))
    return pd.concat(df_list)

In [103]:
df_5_trans = gen_df('../../data/phase_1_data/5-transcripts/')

In [104]:
int2lbel = {i : label for i, label in enumerate(sorted(df_5_trans.label.unique()[:-1]))}
label2int = {label : int for int, label in int2lbel.items()}

AttributeError: 'DataFrame' object has no attribute 'label'

In [105]:
df_5_trans

Unnamed: 0,text,1-RAPPORT,2-NEGOTIATE,3-EMOTION,4-LOGIC,5-AUTHORITY,6-SOCIAL,7-PRESSURE,8-NO-PERSUASION
0,<persuader> Hello. <persuader> How are you? <p...,0.700000,0.250000,0.000000,0.000000,0.050000,0.000000,0.000000,0.000000
1,"<persuader> Hey there, are you one to donate t...",0.296296,0.111111,0.148148,0.296296,0.037037,0.037037,0.000000,0.074074
2,<persuader> Good Morning! <persuader> How are ...,0.611111,0.166667,0.138889,0.000000,0.027778,0.000000,0.000000,0.055556
3,<persuader> Have you ever heard of the charity...,0.687500,0.187500,0.031250,0.000000,0.031250,0.031250,0.000000,0.031250
4,<persuader> Hello <persuadee> hi! <persuader> ...,0.461538,0.153846,0.076923,0.000000,0.000000,0.000000,0.000000,0.307692
...,...,...,...,...,...,...,...,...,...
0,<persuader> Hello. <persuader> How are you? <p...,0.375000,0.250000,0.208333,0.000000,0.000000,0.000000,0.000000,0.166667
1,"<persuader> Hey there, are you one to donate t...",0.088235,0.323529,0.264706,0.058824,0.029412,0.088235,0.058824,0.088235
2,<persuader> Good Morning! <persuader> How are ...,0.425532,0.319149,0.170213,0.000000,0.000000,0.000000,0.042553,0.042553
3,<persuader> Have you ever heard of the charity...,0.425000,0.100000,0.250000,0.025000,0.050000,0.025000,0.050000,0.075000


In [107]:
df_20_trans = gen_df('../../data/phase_1_data/20_transcripts/')

In [109]:
batch_folder = '../../data/phase_1_data/main_markup/'

df_list = []
for subfolder in os.listdir(batch_folder):
    if subfolder[0] == '.':
        continue
    df = gen_df(batch_folder+subfolder+'/')
    df_list.append(df)
df_main_mark = pd.concat(df_list)

In [119]:
df_main_mark.iloc[0]

text               <persuader> Hi! How're you? \t<persuadee> I am...
1-RAPPORT                                                   0.692308
2-NEGOTIATE                                                 0.076923
3-EMOTION                                                   0.076923
4-LOGIC                                                          0.0
5-AUTHORITY                                                 0.153846
6-SOCIAL                                                         0.0
7-PRESSURE                                                       0.0
8-NO-PERSUASION                                                  0.0
Name: 0, dtype: object

In [111]:
all_stuff = pd.concat([
    df_main_mark,
    df_20_trans,
    df_5_trans
])

In [34]:
labels = [
    '1-RAPPORT',
    '2-NEGOTIATE',
    '3-EMOTION',
    '4-LOGIC',
    '5-AUTHORITY',
    '6-SOCIAL',
    '7-PRESSURE',
    '8-NO-PERSUASION'
]

In [35]:
empty_dic = {
    'text' : [],
    'binary_label' : []
}

In [114]:
df_main_mark.to_csv('../../data/multilabel/multilabel_w_probs.csv', index=False)

In [39]:
def output_binary_dfs(all_stuff, labels, output_path=None):

    for label in labels:
        d = all_stuff[all_stuff['label']==label]
        d['binary_label'] = 1
        not_d = all_stuff[all_stuff['label']!=label]
        not_d['binary_label'] = 0
        label_df = pd.concat([d,not_d])
        label_df.to_csv(output_path+label+'.csv', index=False)
        return 


In [115]:
df = pd.read_csv('../../data/binary_classifier/binary_label_1-RAPPORT.csv')

In [116]:
df

Unnamed: 0,text,label,label_count,avg_label,persuader_vec,persuadee_vec,binary_label
0,<persuader> Hi! How're you? \t<persuadee> I am...,1-RAPPORT,9,0.692308,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
1,"<persuader> Hello, how are you today? Have you...",1-RAPPORT,15,0.483871,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
2,<persuader> Hello how are you today? \t<persua...,1-RAPPORT,9,0.692308,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
3,<persuader> Hello there. What do you think of ...,1-RAPPORT,3,0.214286,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
4,<persuader> Hello \t<persuadee> hello <persuad...,1-RAPPORT,4,0.250000,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
...,...,...,...,...,...,...,...
9809,<persuader> Hello <persuadee> hi! <persuader> ...,5-AUTHORITY,1,0.083333,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0
9810,<persuader> Hello <persuadee> hi! <persuader> ...,3-EMOTION,1,0.083333,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0
9811,<persuader> Hello <persuadee> hi! <persuader> ...,2-NEGOTIATE,4,0.333333,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0
9812,<persuader> Hello <persuadee> hi! <persuader> ...,4-LOGIC,1,0.083333,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0


In [None]:
[1,1,1,1,1,1,1,1,1,1,1,,0,0,0,0,0,0,0,0,0,0,0,0,