In [74]:
import pandas as pd

In [75]:
explore_turn = pd.read_csv("explore_turn_common.csv")

In [76]:
turn_counts = explore_turn['chat_id'].value_counts()

In [77]:
valid_chat_ids = turn_counts[turn_counts == 12].index

In [78]:
explore_turn_12 = explore_turn[explore_turn['chat_id'].isin(valid_chat_ids)].copy()

In [79]:
explore_turn_12.head(5)

Unnamed: 0.1,Unnamed: 0,utterance_id,turn_id,chat_id,datetime,user_utterance,valence,label,sublabel,bot_action,...,valence_new,label_new,sublabel_new,user_utterance_english,bot_utterance_english,Contained technical issue,user_rating_english,user_rating_explanation_english,user_sentiment,bot_sentiment
0,0,1,1,1.0,2023-08-07 15:42:33.034771,start,Follow/Neutral,,,,...,Follow/Neutral,,,Start,"Hi! I'm MIcha, your motivational chatbot. My g...",1.0,,,2.0,1.833333
1,1,2,2,1.0,2023-08-07 15:43:07.304097,Ich möchte weniger prokrastinieren,change,Reason,desire,,...,change,Reason,desire,I want to procrastinate less,So you want to procrastinate less. Can you tel...,1.0,,,2.0,1.5
2,2,3,3,1.0,2023-08-07 15:43:48.284858,Ich könnte schon viel weiter im Leben sein.,change,Reason,General Reason,,...,change,Reason,General Reason,I could be a lot further in my life.,How important is this change on a scale from 0...,1.0,,,3.0,2.0
3,3,4,4,1.0,2023-08-07 15:44:04.602837,,Follow/Neutral,,,,...,Follow/Neutral,,,I don't know.,"On a scale from 0 (not at all) to 10 (very), h...",1.0,,,2.0,1.0
4,4,5,5,1.0,2023-08-07 15:44:20.584669,8,Follow/Neutral,,,,...,Follow/Neutral,,,8,An 8. So you're positive about this change ove...,1.0,,,2.0,2.0


##### Process the structure for LIWC_LSM analysis

In [80]:
import pandas as pd

user_df = explore_turn_12.copy()
user_df['text'] = user_df['user_utterance_english']
user_df['speaker'] = 'user'

bot_df = explore_turn_12.copy()
bot_df['text'] = bot_df['bot_utterance_english']
bot_df['speaker'] = 'bot'

LIWI_LSM_df = pd.concat([user_df, bot_df], ignore_index=True)

LIWI_LSM_df = explore_turn_12.drop(columns=['user_utterance_english', 'bot_utterance_english'])

In [81]:
for turn in range(1, 13):
    turn_df = LIWI_LSM_df[LIWI_LSM_df['turn_id'] == turn]
    filename = f"./turn_csvs/12turn conversation_Turn{turn}.csv"
    turn_df.to_csv(filename, index=False)

In [82]:
import os

lsm_list = []

# Read each Turn file one by one
for turn in range(1, 13):
    filename = f"./turn_csvs/LSM Result/Turn{turn}.csv"
    df = pd.read_csv(filename)
    
     # Assign turn_id corresponding to the current turn number
    df['turn_id'] = turn
    
    # Rename columns: GroupID → chat_id, LSM → LSM_LIWC
    df = df.rename(columns={'GroupID': 'chat_id', 'LSM': 'LSM_LIWC'})
    
    # Ensure the ID type is consistent (as string)
    df['chat_id'] = df['chat_id'].astype(str)
    
    # Add this turn’s data to the list
    lsm_list.append(df[['chat_id', 'turn_id', 'LSM_LIWC']])

# Concatenate all turn-level LSM data
lsm_all_turns = pd.concat(lsm_list, ignore_index=True)

# Ensure main table has consistent data types
explore_turn_12['chat_id'] = explore_turn_12['chat_id'].astype(str)
explore_turn_12['turn_id'] = explore_turn_12['turn_id'].astype(int)

# Merge LSM scores back into the main DataFrame
explore_turn_12 = explore_turn_12.merge(lsm_all_turns, on=['chat_id', 'turn_id'], how='left')

# Display the first few rows
explore_turn_12.head()


Unnamed: 0.1,Unnamed: 0,utterance_id,turn_id,chat_id,datetime,user_utterance,valence,label,sublabel,bot_action,...,label_new,sublabel_new,user_utterance_english,bot_utterance_english,Contained technical issue,user_rating_english,user_rating_explanation_english,user_sentiment,bot_sentiment,LSM_LIWC
0,0,1,1,1.0,2023-08-07 15:42:33.034771,start,Follow/Neutral,,,,...,,,Start,"Hi! I'm MIcha, your motivational chatbot. My g...",1.0,,,2.0,1.833333,0.0
1,1,2,2,1.0,2023-08-07 15:43:07.304097,Ich möchte weniger prokrastinieren,change,Reason,desire,,...,Reason,desire,I want to procrastinate less,So you want to procrastinate less. Can you tel...,1.0,,,2.0,1.5,0.48
2,2,3,3,1.0,2023-08-07 15:43:48.284858,Ich könnte schon viel weiter im Leben sein.,change,Reason,General Reason,,...,Reason,General Reason,I could be a lot further in my life.,How important is this change on a scale from 0...,1.0,,,3.0,2.0,0.35
3,3,4,4,1.0,2023-08-07 15:44:04.602837,,Follow/Neutral,,,,...,,,I don't know.,"On a scale from 0 (not at all) to 10 (very), h...",1.0,,,2.0,1.0,0.24
4,4,5,5,1.0,2023-08-07 15:44:20.584669,8,Follow/Neutral,,,,...,,,8,An 8. So you're positive about this change ove...,1.0,,,2.0,2.0,0.0


In [4]:
from empath import Empath

# Initialize Empath
lexicon = Empath()

# Step 1: Extract all Empath category features (no category limit)
def full_empath_features(text):
    return lexicon.analyze(text, normalize=True)

# Step 2: Compute features separately for user and bot
explore_turn_12['user_features'] = explore_turn_12['user_utterance_english'].apply(full_empath_features)
explore_turn_12['bot_features'] = explore_turn_12['bot_utterance_english'].apply(full_empath_features)

# Step 3: Combine all categories and calculate average frequency
from collections import defaultdict

def average_category_freq(feature_column):
    category_sums = defaultdict(float)
    for features in feature_column:
        for cat, score in features.items():
            category_sums[cat] += score
    return {cat: total / len(feature_column) for cat, total in category_sums.items()}

avg_user = average_category_freq(explore_turn_12['user_features'])
avg_bot = average_category_freq(explore_turn_12['bot_features'])

# Step 4: Merge average values from user and bot, and select frequently used categories
all_categories = set(avg_user.keys()) | set(avg_bot.keys())

selected_categories = []
for cat in all_categories:
    mean_val = (avg_user.get(cat, 0) + avg_bot.get(cat, 0)) / 2
    if mean_val > 0.01:  
        selected_categories.append(cat)

# Step 5: Print final selected categories
print("Selected Empath categories based on your data:")
print(selected_categories)

Selected Empath categories based on your data:
['eating', 'speaking', 'communication']


In [5]:
# Function to extract Empath-style linguistic features for selected categories
def get_features(text):
    return lexicon.analyze(text, categories=selected_categories, normalize=True)

# Create separate feature DataFrames for user and bot utterances
user_features = explore_turn_12['user_utterance_english'].apply(get_features).apply(pd.Series)
user_features.columns = [f'user_{cat}' for cat in user_features.columns]

bot_features = explore_turn_12['bot_utterance_english'].apply(get_features).apply(pd.Series)
bot_features.columns = [f'bot_{cat}' for cat in bot_features.columns]

# Combine both user and bot features into a temporary DataFrame for LSM computation
temp_df = pd.concat([user_features.reset_index(drop=True), bot_features.reset_index(drop=True)], axis=1)


# Function to compute LSM score for each row (i.e., turn)
def compute_lsm(row):
    scores = []
    for cat in selected_categories:
        try:
            u = row.get(f'user_{cat}', 0.0)
            b = row.get(f'bot_{cat}', 0.0)
            score = 1 - abs(u - b) / (u + b + 0.0001)
            scores.append(score)
        except Exception as e:
            print(f"Error for category {cat}: {e}")
    return sum(scores) / len(scores)

# Apply LSM calculation and store the results
lsm_scores = temp_df.apply(compute_lsm, axis=1)

# Store LSM scores in the original table
explore_turn_12['LSM_EMPATH'] = lsm_scores

In [6]:
explore_turn_12.to_csv('explore_turn_12.csv', index=False)