# Extraction of Content Features for the Chat Bot Messages

For our chat bot analysis, we would like to add two content-based features:
- Emotional content of user messages
- Kind of questions users asked (conceptual, procedural, factual, or homework-specific)

In [None]:
import pandas as pd
import ast
import deepl
import time
from tqdm import tqdm
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor

import os

# Read data
df = pd.read_csv("data/cleaned/gymitrainer_40percent.csv")

# Initialize DeepL
# The API key is a secret and must not be stored in the notebook. Export it
# before starting the kernel (e.g. `export DEEPL_AUTH_KEY=...`). A missing
# variable raises KeyError here instead of failing later on the first request.
# NOTE(review): the key that was previously committed in this cell should be revoked.
auth_key = os.environ["DEEPL_AUTH_KEY"]
translator = deepl.Translator(auth_key)

# Step 1: Parse user messages
# "content" holds the string repr of the message list; [1::2] keeps every
# second entry starting at index 1 (the notebook treats those as user messages).
df["messages_user"] = df["content"].apply(lambda x: ast.literal_eval(x)[1::2])

# Step 2: Batched + rate-limited DeepL translation
def translate_batch_safe(messages, sleep=1.1):
    """Translate messages from German to US English, one DeepL request each.

    Args:
        messages: Iterable of message strings.
        sleep: Seconds to pause after every request, successful or not.

    Returns:
        A list aligned with ``messages``. Empty or whitespace-only strings
        map to "" without calling the API, and a failed request maps to
        "[TRANSLATION ERROR]" so a later retry pass can find it.
    """
    translated = []
    for msg in messages:
        # Nothing to translate: skip the request (same rule as the retry pass)
        # instead of spending a call that only yields an error marker.
        if isinstance(msg, str) and not msg.strip():
            translated.append("")
            continue
        try:
            translated.append(translator.translate_text(msg, source_lang="DE", target_lang="EN-US").text)
        except Exception:
            # Best effort: mark the message and keep going with the rest.
            translated.append("[TRANSLATION ERROR]")
        finally:
            # Pause after failures too; otherwise a rate-limit error is followed
            # immediately by the next request (50 req/min for free tier).
            time.sleep(sleep)
    return translated

# Register `progress_apply` on pandas objects so the long-running translation
# shows a labelled progress bar.
tqdm.pandas(desc="Translating user messages")

# One list of English messages per conversation, stored next to the originals.
df["messages_user_en"] = (
    df["messages_user"]
    .progress_apply(translate_batch_safe)
)

# Persist the first-pass translations (the DataFrame index is written as well).
df.to_csv("translated.csv")


In [1]:
import pandas as pd
from tqdm import tqdm
from tqdm.auto import tqdm as auto_tqdm
from transformers import pipeline
import torch

import ast

# Enable tqdm integration with pandas
tqdm.pandas()

# Load data
df = pd.read_csv("translated_retry.csv")

# Clean up messages
# Each cell is the string repr of a list as written by to_csv. It is parsed with
# ast.literal_eval (literals only) instead of eval, so file contents are never
# executed as code; entries that are not strings are then dropped.
df["messages_user_en"] = df["messages_user_en"].apply(
    lambda lst: [m for m in ast.literal_eval(lst) if isinstance(m, str)]
)

# Step 3: Emotion classification
# top_k=None makes the pipeline return a score for every label. The deprecated
# `return_all_scores=True` flag asked for the same thing and is dropped.
# device=0 selects the first accelerator (the recorded run reports "mps:0").
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    device=0
)

# Labels used to build the all-zero fallback row in compute_emotions.
emotion_labels = ['anger','disgust','fear','joy','neutral','sadness','surprise']

def compute_emotions(msg_list):
    """Average the per-label emotion scores over one conversation's messages.

    Args:
        msg_list: List of English message strings.

    Returns:
        Dict mapping ``avg_<label>`` to the mean score of that label across
        the messages. If ``msg_list`` is not a non-empty list, or if
        classification fails, every label in ``emotion_labels`` maps to 0.0.
    """
    import warnings

    if not isinstance(msg_list, list) or not msg_list:
        return {f"avg_{label}": 0.0 for label in emotion_labels}
    try:
        # truncation=True cuts over-long messages down to the model's maximum
        # input length. Without it, the recorded run hit a 5195-token message
        # (limit 512), and the resulting error zeroed the whole conversation.
        scores = emotion_classifier(msg_list, truncation=True)
        # One row per message, one column per emotion label.
        df_scores = pd.DataFrame([{s['label']: s['score'] for s in msg} for msg in scores])
        return df_scores.mean().add_prefix('avg_').to_dict()
    except Exception as exc:
        # Keep the best-effort fallback, but make the failure visible so that
        # all-zero rows are not mistaken for real scores.
        warnings.warn(f"Emotion classification failed, returning zero scores: {exc!r}")
        return {f"avg_{label}": 0.0 for label in emotion_labels}

# Score every conversation; the raw per-row dict is kept in its own column.
df["emotion_results"] = df["messages_user_en"].progress_apply(compute_emotions)

# Expand each dict into one column per `avg_<label>` key and attach the
# columns to the frame side by side.
emotion_df = df["emotion_results"].apply(lambda scores: pd.Series(scores))
df = pd.concat((df, emotion_df), axis="columns")

# Checkpoint the emotion features.
df.to_csv("emotions.csv", index=False)

# Step 4: Zero-shot topic classification
# The zero-shot pipeline scores each message against the candidate labels
# listed below; no task-specific training data is supplied in this notebook.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0,
)
candidate_labels = [
    "conceptual",
    "procedural",
    "factual",
    "homework-specific",
]

def classify_messages(messages):
    """Count, per candidate label, how many messages have it as top label.

    Args:
        messages: List of English message strings.

    Returns:
        Dict mapping ``<label>_count`` to an int for every entry of
        ``candidate_labels``. All counts are 0 when ``messages`` is not a
        non-empty list. If classification fails, the counts tallied so far
        (normally all 0) are returned and a warning is emitted.
    """
    import warnings

    counts = {label + "_count": 0 for label in candidate_labels}
    # Nothing to classify: skip the model call instead of relying on it to raise.
    if not isinstance(messages, list) or not messages:
        return counts
    try:
        results = classifier(messages, candidate_labels)
        # Normalise a single result dict to a list, so a one-message
        # conversation is tallied instead of being iterated key by key.
        if isinstance(results, dict):
            results = [results]
        for result in results:
            # 'labels' is read as best-first; index 0 is taken as the top label.
            top = result['labels'][0]
            counts[top + "_count"] += 1
    except Exception as exc:
        # Stay best-effort, but surface the failure rather than hiding it.
        warnings.warn(f"Zero-shot classification failed, counts may be incomplete: {exc!r}")
    return counts

# Tally the top zero-shot label per conversation; keep the raw dict as a column.
df["classification_counts"] = (
    df["messages_user_en"]
    .progress_apply(classify_messages)
)

# Expand the dicts into one `<label>_count` column each and append them.
count_columns = df["classification_counts"].apply(pd.Series)
df = pd.concat([df, count_columns], axis=1)

# Checkpoint the frame with both emotion and question-type features.
df.to_csv("classifications.csv", index=False)

Device set to use mps:0
  2%|▏         | 58/3500 [00:04<02:43, 21.10it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5195 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 3500/3500 [02:53<00:00, 20.22it/s]
Device set to use mps:0
100%|██████████| 3500/3500 [50:48<00:00,  1.15it/s]  


In [3]:
%pip install torch

Collecting torch
  Downloading torch-2.7.0-cp312-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.7.0-cp312-none-macosx_11_0_arm64.whl (68.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.6/68.6 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached sympy-1.14.0-py3-none-any.whl (6.3 MB)
Using cached networkx-3.4.2-py3-none-any.whl (1.7 MB)
Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Installing collected packages: mpmath, sympy, networkx, torch
Successfully installed mpmath-1.3.0 networkx-3.4.2 sympy-1.14.0 torch-2.7.0
Note: you may need to restart the kernel to use updated packages.


For the content analysis and the topic modelling, we are only interested in the messages sent by the user, so let's create a column that contains only those.
User messages are every second message.

In [3]:
# Work on an independent copy of the feature frame and remove the two
# "Unnamed" columns in a single drop call (KeyError if either is absent).
df2 = df.copy().drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])


In [6]:
# Write the final feature table; the index is omitted from the file.
df2.to_csv(path_or_buf="gogymi_with_content.csv", index=False)

In [None]:
import pandas as pd
import time
import deepl
from tqdm import tqdm

import ast
import os

# Load translated data
# Resume from an earlier retry pass when its output exists; otherwise start
# from the first-pass file. This cell writes "translated_retry.csv" itself, so
# reading only that file would fail on the very first run.
input_path = "translated_retry.csv" if os.path.exists("translated_retry.csv") else "translated.csv"
df = pd.read_csv(input_path)

# Re-initialize DeepL
# The API key is a secret and must not be stored in the notebook. Export it
# before starting the kernel (e.g. `export DEEPL_AUTH_KEY=...`). A missing
# variable raises KeyError here instead of failing on the first request.
# NOTE(review): the key that was previously committed in this cell should be revoked.
auth_key = os.environ["DEEPL_AUTH_KEY"]
translator = deepl.Translator(auth_key)

# Identify rows with translation errors
# Read back from CSV, each cell is the string repr of a list, so `in` is a
# substring test here. Cells that are neither str nor list (e.g. NaN) are
# treated as "no error" instead of raising TypeError.
mask_error = df["messages_user_en"].apply(
    lambda lst: isinstance(lst, (str, list)) and "[TRANSLATION ERROR]" in lst
)
df_errors = df[mask_error].copy()

# Parse messages_user from string to list if needed
# The emptiness guard keeps `.iloc[0]` from raising when nothing needs a retry.
if not df_errors.empty and isinstance(df_errors["messages_user"].iloc[0], str):
    df_errors["messages_user"] = df_errors["messages_user"].apply(ast.literal_eval)

# Retry translation
def retry_translation(messages, retry_delay=30):
    """Translate messages from German to US English, retrying each failure once.

    Args:
        messages: Iterable of source messages.
        retry_delay: Seconds to wait before the single retry of a failed request.

    Returns:
        A list aligned with ``messages``. Empty, whitespace-only or non-string
        entries map to "". Entries equal to "[TRANSLATION ERROR]" are passed
        through unchanged. A message whose request fails twice maps to
        "[TRANSLATION ERROR]".
    """
    results = []
    for msg in messages:
        # Nothing to translate; the isinstance check also keeps NaN/None
        # entries from crashing on `.strip()`.
        if not isinstance(msg, str) or not msg.strip():
            results.append("")  # Skip empty messages
            continue
        if msg == "[TRANSLATION ERROR]":
            results.append("[TRANSLATION ERROR]")  # Already marked as error
            continue
        try:
            translated = translator.translate_text(msg, source_lang="DE", target_lang="EN-US").text
        except Exception as e:
            print(f"⚠️ Translation error occurred. Waiting {retry_delay} seconds before retrying...")
            print(e)
            time.sleep(retry_delay)
            try:
                # Actually issue the request again. Re-using `translated` here
                # would append the previous message's text (or hit a NameError
                # on the first message) instead of retrying.
                translated = translator.translate_text(msg, source_lang="DE", target_lang="EN-US").text
            except Exception:
                translated = "[TRANSLATION ERROR]"
        results.append(translated)
    return results

# Progressively retry and save
output_path = "translated_retry.csv"
save_interval = 50

pbar = tqdm(total=len(df_errors), desc="Retrying failed translations")
for done, (row_label, row) in enumerate(df_errors.iterrows(), start=1):
    # Overwrite this row's translations with the retried ones.
    df_errors.at[row_label, "messages_user_en"] = retry_translation(row["messages_user"])

    # Checkpoint every `save_interval` rows and once more after the last row,
    # so an interrupted run loses at most one interval of work.
    reached_interval = done % save_interval == 0
    is_last_row = done == len(df_errors)
    if reached_interval or is_last_row:
        df.update(df_errors)
        df.to_csv(output_path, index=False)
        pbar.set_postfix(saved_rows=done)

    pbar.update(1)

pbar.close()