# Extraction of Content Features for the Chat Bot Messages

RQ3 : Which features derived from chatbot interactions can help predict whether a student will perform above or below average?

For our chat bot analysis, we would like to add two content based features:
- Emotional Content of User Messages
- Kind of Questions they asked, for example: conceptual, homework-specific, procedural ...

First of all we translate the text from german to english using the Deepl API. If you want to rerun this process you will have to go and get a free account or two which allows you to get a key!!

In [None]:
import pandas as pd
import ast
import deepl
import time
from tqdm import tqdm
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor

# Read data that has already been filtered for the 40 percent confidence threshold
df = pd.read_csv("data/cleaned/gymitrainer_40percent.csv")

# Initialize DeepL
auth_key = "REMOVED" # removed for github upload 
translator = deepl.Translator(auth_key)

# Step 1: Parse user messages
df["messages_user"] = df["content"].apply(lambda x: ast.literal_eval(x)[1::2])

# Step 2: Batched + rate-limited DeepL translation
def translate_batch_safe(messages, sleep=1.1):
    translated = []
    for msg in messages:
        try:
            translated.append(translator.translate_text(msg, source_lang="DE", target_lang="EN-US").text)
            time.sleep(sleep)  # Avoid rate limit (50 req/min for free tier)
        except Exception as e:
            translated.append("[TRANSLATION ERROR]")
    return translated

tqdm.pandas(desc="Translating user messages")
df["messages_user_en"] = df["messages_user"].progress_apply(translate_batch_safe)
#df.to_csv("translated.csv")


We used the RoBERTa-based model trained on a combination of publicly available datasets from domains such as Twitter, Reddit, and scripted dialogues. We got this model from HuggingFace

In [1]:
import pandas as pd
from tqdm import tqdm
from tqdm.auto import tqdm as auto_tqdm
from transformers import pipeline
import torch

# Enable tqdm integration with pandas
tqdm.pandas()

# Load data
df = pd.read_csv("translated_retry.csv")

# Clean up messages
df["messages_user_en"] = df["messages_user_en"].apply(lambda lst: [m for m in eval(lst) if isinstance(m, str)])

# Step 3: Emotion classification
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    return_all_scores=True,
    top_k=None,
    device=0
)

emotion_labels = ['anger','disgust','fear','joy','neutral','sadness','surprise']

def compute_emotions(msg_list):
    if not isinstance(msg_list, list) or not msg_list:
        return {f"avg_{label}": 0.0 for label in emotion_labels}
    try:
        scores = emotion_classifier(msg_list)
        df_scores = pd.DataFrame([{s['label']: s['score'] for s in msg} for msg in scores])
        return df_scores.mean().add_prefix('avg_').to_dict()
    except Exception:
        return {f"avg_{label}": 0.0 for label in emotion_labels}

df["emotion_results"] = df["messages_user_en"].progress_apply(compute_emotions)
emotion_df = df["emotion_results"].apply(pd.Series)
df = pd.concat([df, emotion_df], axis=1)
df.to_csv("emotions.csv", index=False)

# Step 4: Zero-shot topic classification
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0)
candidate_labels = ["conceptual", "procedural", "factual", "homework-specific"]

def classify_messages(messages):
    if not isinstance(messages, list):
        return {label + "_count": 0 for label in candidate_labels}
    counts = {label + "_count": 0 for label in candidate_labels}
    try:
        results = classifier(messages, candidate_labels)
        for result in results:
            top = result['labels'][0]
            counts[top + "_count"] += 1
    except Exception:
        pass
    return counts

df["classification_counts"] = df["messages_user_en"].progress_apply(classify_messages)
df = pd.concat([df, df["classification_counts"].apply(pd.Series)], axis=1)
df.to_csv("classifications.csv", index=False)

Device set to use mps:0
  2%|▏         | 58/3500 [00:04<02:43, 21.10it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5195 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 3500/3500 [02:53<00:00, 20.22it/s]
Device set to use mps:0
100%|██████████| 3500/3500 [50:48<00:00,  1.15it/s]  
