# Nutzerbasierte Evaluation

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.data.database.mongodb import MongoDb

plt.rcParams['font.size'] = 10
plt.style.use('default')

from scipy import stats
from scipy.stats import mannwhitneyu
from scipy.stats import shapiro, kruskal, wilcoxon

## Retrieve Evaluation Questionnaire Data

In [2]:
try:
    mongo_client = MongoDb()

    result = mongo_client.get_all('feedback')
    feedback_data = result.get('data', [])

    print(feedback_data)

except Exception as e:
    print(f"Error connecting to MongoDB: {e}")

[{'_id': '68c8349bcd07e1b6a3665a13', 'uid': 'UID001', 'freeText': '', 'prognoseBewertung': [{'modell': '', 'prognose': '', 'echt': '', 'bemerkung': ''}], 'bewertungsskala': [4, 4, 4, 5, 5, 5, 4, 5, 3, 5], 'explanationFeedback': {}, 'timestamp': '2025-09-15T15:45:31.111053Z'}, {'_id': '68c834ea4c8877fd37754728', 'uid': 'UID001', 'freeText': '', 'prognoseBewertung': [{'modell': '', 'prognose': '', 'echt': '', 'bemerkung': ''}], 'bewertungsskala': [3, 4, 4, 3, 5, 5, 4, 4, 3, 5], 'explanationFeedback': {}, 'timestamp': '2025-09-15T15:46:50.816364Z'}, {'_id': '68c8363e4c8877fd3775472a', 'uid': 'UID001', 'freeText': '', 'prognoseBewertung': [{'modell': '', 'prognose': '', 'echt': '', 'bemerkung': ''}], 'bewertungsskala': [4, 5, 4, 4, 5, 5, 4, 3, 5, 5], 'explanationFeedback': {}, 'timestamp': '2025-09-15T15:52:29.992306Z'}, {'_id': '68c8373dcd07e1b6a3665a15', 'uid': 'UID001', 'freeText': '', 'prognoseBewertung': [{'modell': '', 'prognose': '', 'echt': '', 'bemerkung': ''}], 'bewertungsskala':

In [3]:
import os
os.environ["PYMONGO_DISABLE_IPV6"] = "1"     # IPv6 erstmal ausschalten
os.environ["DNSPYTHON_IPV6"] = "false"

import dns.resolver
r = dns.resolver.Resolver(configure=True)
r.nameservers = ["1.1.1.1", "8.8.8.8"]       # Router-DNS umgehen
r.timeout = 2.0
r.lifetime = 3.0
dns.resolver.default_resolver = r            # global setzen

# jetzt erst Deinen Code:
mongo_client = MongoDb()
result = mongo_client.get_all("feedback")
print(result)

{'statusCode': 200, 'data': [{'_id': '68c8349bcd07e1b6a3665a13', 'uid': 'UID001', 'freeText': '', 'prognoseBewertung': [{'modell': '', 'prognose': '', 'echt': '', 'bemerkung': ''}], 'bewertungsskala': [4, 4, 4, 5, 5, 5, 4, 5, 3, 5], 'explanationFeedback': {}, 'timestamp': '2025-09-15T15:45:31.111053Z'}, {'_id': '68c834ea4c8877fd37754728', 'uid': 'UID001', 'freeText': '', 'prognoseBewertung': [{'modell': '', 'prognose': '', 'echt': '', 'bemerkung': ''}], 'bewertungsskala': [3, 4, 4, 3, 5, 5, 4, 4, 3, 5], 'explanationFeedback': {}, 'timestamp': '2025-09-15T15:46:50.816364Z'}, {'_id': '68c8363e4c8877fd3775472a', 'uid': 'UID001', 'freeText': '', 'prognoseBewertung': [{'modell': '', 'prognose': '', 'echt': '', 'bemerkung': ''}], 'bewertungsskala': [4, 5, 4, 4, 5, 5, 4, 3, 5, 5], 'explanationFeedback': {}, 'timestamp': '2025-09-15T15:52:29.992306Z'}, {'_id': '68c8373dcd07e1b6a3665a15', 'uid': 'UID001', 'freeText': '', 'prognoseBewertung': [{'modell': '', 'prognose': '', 'echt': '', 'bemerkun

##  Overview

In [6]:
'''
Error handling getting feedback data.
'''
if result and 'data' in result:
    feedback_data = result['data']
    print(f"{len(feedback_data)} Feedback entries loaded")
else:
    print("No data available")
    feedback_data = []

'''
Create a dataframe from the feedback data and overview.
'''
if len(feedback_data) > 0:
    df = pd.DataFrame(feedback_data)

    df['rating_length'] = df['bewertungsskala'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    print(f"\nBewertungsskala-Längen:")
    print(df['rating_length'].value_counts().sort_index())

    '''
    Infer the group of the feedback data.
    '''
    def infer_group(row):
        uid = str(row.get('uid', '')).lower()
        rl = row['rating_length']
        if uid.startswith('uid001'):
            return 'Experimental'
        if uid.startswith('uid002') or uid.startswith('uui2'):
            return 'Control'

        return 'Unknown'

    df['group'] = df.apply(infer_group, axis=1)

    MAX_ITEMS = 10
    for i in range(MAX_ITEMS):
        df[f'rating_{i+1}'] = df.apply(
            lambda row: row['bewertungsskala'][i]
            if isinstance(row['bewertungsskala'], list) and len(row['bewertungsskala']) > i
            else np.nan,
            axis=1
        )

    # Teilmengen
    control_df = df[df['group'] == 'Control'].copy()
    experimental_df = df[df['group'] == 'Experimental'].copy()
    unknown_df = df[df['group'] == 'Unknown'].copy()

    # Grundlegende Statistiken
    print("\nFEEDBACK-DATA OVERVIEW")
    print("=" * 50)
    print(f"Total number of feedback: {len(df)}")
    print(f"Control Group (10 Fragen): {len(control_df)} Teilnehmer")
    print(f"Experimental Group (10 Fragen): {len(experimental_df)} Teilnehmer")

    if len(unknown_df) > 0:
        print("\nNote: There are entries with unknown group (missing/ambiguous UID). Examples:")
        cols_show = ['uid', 'rating_length']
        print(unknown_df[cols_show].head(5).to_string(index=False))

    # Erste Zeilen
    print(f"\nFIRST 3 ROWS:")
    cols_preview = ['uid', 'group', 'rating_length']
    print(df[cols_preview].head(3).to_string(index=False))

else:
    print("No data available for processing")

20 Feedback entries loaded

Bewertungsskala-Längen:
rating_length
10    20
Name: count, dtype: int64

FEEDBACK-DATA OVERVIEW
Total number of feedback: 20
Control Group (10 Fragen): 10 Teilnehmer
Experimental Group (10 Fragen): 10 Teilnehmer

FIRST 3 ROWS:
   uid        group  rating_length
UID001 Experimental             10
UID001 Experimental             10
UID001 Experimental             10


## Statistical Comparisons of Both Groups

In [None]:
'''
Define the questions for the control and experimental group.
'''

# Group A Questions
control_group_questions = [
    "The system's predictions about candidate job-switching readiness seemed realistic.",
    "The predictions were relevant for prioritizing candidates in Active Sourcing.",
    "I trusted the system's predictions when deciding which candidates to approach.",
    "The recommendations gave me enough confidence to base sourcing decisions on them.",
    "The system's predictions were easy to interpret without further explanation.",
    "The predictions helped me to structure the candidate selection process more efficiently.",
    "The system's predictions supported me in combining them with my own recruiting expertise.",
    "The system turned out to be a valuable complement to my own judgment.",
    "I can imagine using such a prediction system in my daily recruiting activities.",
    "The system would help me to improve the effectiveness of my sourcing decisions."
]

# Group B Questions
experimental_group_questions = [
    "The explanations made it clear why a candidate was predicted as more or less likely to switch jobs.",
    "The explanations increased my understanding of how the system generated its predictions.",
    "The explanations strengthened my confidence in the reliability of the predictions.",
    "The presence of explanations made me more willing to act on the system's recommendations.",
    "The combination of predictions and explanations was straightforward and clear to understand.",
    "The explanations improved my ability to identify which candidates should be prioritized in Active Sourcing.",
    "The explanations supported me in combining the system's predictions with my own recruiting expertise.",
    "The system turned out to be a valuable complement to my own judgment.",
    "I could imagine integrating such a system with explanations into my daily recruiting workflow.",
    "The explanations provided added value compared to predictions alone."
]

'''
Define comparable questions
'''
comparable_questions = {
    "Comprehensibility & Interpretability": {
        "control": [1, 2],     # Q1–Q2
        "experimental": [1, 2] # Q1–Q2
    },
    "Confidence in Predictions": {
        "control": [3, 4],     # Q3–Q4
        "experimental": [3, 4] # Q3–Q4
    },
    "Usability for Recruiting": {
        "control": [5, 6],     # Q5–Q6
        "experimental": [5, 6] # Q5–Q6
    },
    "Integration of Human Expertise and AI Support": {
        "control": [7, 8],     # Q7–Q8
        "experimental": [7, 8] # Q7–Q8
    },
    "Perceived Value & Intention to Use": {
        "control": [9, 10],    # Q9–Q10
        "experimental": [9, 10]# Q9–Q10
    }
}

'''
Compare the control and experimental group.
'''
comparison_results = []

for category, question_indices in comparable_questions.items():
    print(f"\n{category.upper()}:")
    print("-" * 80)

    control_ratings = []
    for idx in question_indices['control']:
        col = f'rating_{idx}'
        if col in control_df.columns:
            ratings = control_df[col].dropna()
            control_ratings.extend(ratings.tolist())

    experimental_ratings = []
    for idx in question_indices['experimental']:
        col = f'rating_{idx}'
        if col in experimental_df.columns:
            ratings = experimental_df[col].dropna()
            experimental_ratings.extend(ratings.tolist())

    '''
    Perform the Mann-Whitney-U test.
    '''
    if len(control_ratings) > 0 and len(experimental_ratings) > 0:

        stat, p_value = mannwhitneyu(control_ratings, experimental_ratings, alternative='two-sided')

        # Effect size (Cohen's d)
        n1, n2 = len(control_ratings), len(experimental_ratings)
        pooled_std = np.sqrt(((n1-1)*np.var(control_ratings) + (n2-1)*np.var(experimental_ratings)) / (n1+n2-2))
        cohens_d = (np.mean(control_ratings) - np.mean(experimental_ratings)) / pooled_std if pooled_std > 0 else 0

        result = {
            'category': category,
            'control_mean': np.mean(control_ratings),
            'experimental_mean': np.mean(experimental_ratings),
            'control_n': len(control_ratings),
            'experimental_n': len(experimental_ratings),
            'u_statistic': stat,
            'p_value': p_value,
            'cohens_d': cohens_d,
            'significant': p_value < 0.05
        }
        comparison_results.append(result)

        print(f"  Control Group:    M = {result['control_mean']:.2f}, n = {result['control_n']}")
        print(f"  Experimental Group: M = {result['experimental_mean']:.2f}, n = {result['experimental_n']}")
        print(f"  Mann-Whitney-U:   U = {stat:.3f}, p = {p_value:.3f}")
        print(f"  Cohen's d:        d = {cohens_d:.3f}")
    
        if p_value < 0.05:
            direction = "higher" if result['control_mean'] > result['experimental_mean'] else "lower"
            print(f"  → SIGNIFICANT: Control Group is {direction} (p < 0.05)")
        else:
            print(f"  → NOT SIGNIFICANT: No differences (p ≥ 0.05)")
    else:
        print(f"  → NOT ENOUGH DATA FOR COMPARISON")

'''
Summarize the comparison results.
'''
print("\n" + "=" * 80)
print(f"SUMMARY OF ALL COMPARISONS:")
print("=" * 80)
significant_count = sum(1 for r in comparison_results if r['significant'])
print(f"Significant Differences: {significant_count}/{len(comparison_results)} Categories")

if significant_count > 0:
    print(f"\nSignificant Categories:")
    for result in comparison_results:
        if result['significant']:
            direction = "higher" if result['control_mean'] > result['experimental_mean'] else "lower"
            effect_size = "large" if abs(result['cohens_d']) > 0.8 else "medium" if abs(result['cohens_d']) > 0.5 else "small"
            print(f"  • {result['category']}: Control Group is {direction} (p = {result['p_value']:.3f}, d = {result['cohens_d']:.3f}, {effect_size})")



COMPREHENSIBILITY & INTERPRETABILITY:
--------------------------------------------------------------------------------
  Control Group:    M = 3.55, n = 20
  Experimental Group: M = 4.10, n = 20
  Mann-Whitney-U:   U = 140.000, p = 0.072
  Cohen's d:        d = -0.673
  → NOT SIGNIFICANT: No differences (p ≥ 0.05)

CONFIDENCE IN PREDICTIONS:
--------------------------------------------------------------------------------
  Control Group:    M = 2.50, n = 20
  Experimental Group: M = 4.05, n = 20
  Mann-Whitney-U:   U = 69.000, p = 0.000
  Cohen's d:        d = -1.551
  → SIGNIFICANT: Control Group is lower (p < 0.05)

USABILITY FOR RECRUITING:
--------------------------------------------------------------------------------
  Control Group:    M = 2.65, n = 20
  Experimental Group: M = 4.75, n = 20
  Mann-Whitney-U:   U = 36.000, p = 0.000
  Cohen's d:        d = -2.092
  → SIGNIFICANT: Control Group is lower (p < 0.05)

INTEGRATION OF HUMAN EXPERTISE AND AI SUPPORT:
------------------

## Statistical tests

In [17]:
print("\nCONTROL GROUP NORMALITY TESTS:")
print("-" * 80)
control_normality_results = []
for i, question in enumerate(control_group_questions):
    col = f'rating_{i+1}'
    if col in control_df.columns:
        ratings = control_df[col].dropna()
        if len(ratings) >= 3:
            stat, p_value = shapiro(ratings)
            control_normality_results.append({
                'Question': f"Q{i+1}",
                'N': len(ratings),
                'Shapiro-W': round(stat, 4),
                'p-Value': round(p_value, 4),
                'Normal Distributed': 'Yes' if p_value > 0.05 else 'No'
            })

if control_normality_results:
    control_normality_df = pd.DataFrame(control_normality_results)
    print(control_normality_df.to_string(index=False))
else:
    print("No Control Group Data available")

print("\nEXPERIMENTAL GROUP NORMALITÄTSTESTS:")
print("-" * 80)
experimental_normality_results = []
for i, question in enumerate(experimental_group_questions):
    col = f'rating_{i+1}'
    if col in experimental_df.columns:
        ratings = experimental_df[col].dropna()
        if len(ratings) >= 3:
            stat, p_value = shapiro(ratings)
            experimental_normality_results.append({
                'Question': f"Q{i+1}",
                'N': len(ratings),
                'Shapiro-W': round(stat, 4),
                'p-Value': round(p_value, 4),
                'Normal Distributed': 'Yes' if p_value > 0.05 else 'No'
            })

if experimental_normality_results:
    experimental_normality_df = pd.DataFrame(experimental_normality_results)
    print(experimental_normality_df.to_string(index=False))
else:
    print("No Experimental Group Data available")

'''
Summarize the normality results.
'''
all_normality_results = control_normality_results + experimental_normality_results
non_normal_count = sum(1 for result in all_normality_results if result['Normal Distributed'] == 'No')
print(f"\nTOTAL INTERPRETATION: {non_normal_count}/{len(all_normality_results)} Questions are not normally distributed")
print("→ Non-parametric tests (like the Kruskal-Wallis) are appropriate")



CONTROL GROUP NORMALITY TESTS:
--------------------------------------------------------------------------------
Question  N  Shapiro-W  p-Value Normal Distributed
      Q1 10     0.7516   0.0037                 No
      Q2 10     0.7404   0.0027                 No
      Q3 10     0.8526   0.0623                Yes
      Q4 10     0.8917   0.1770                Yes
      Q5 10     0.7957   0.0129                 No
      Q6 10     0.8250   0.0291                 No
      Q7 10     0.8858   0.1520                Yes
      Q8 10     0.9165   0.3283                Yes
      Q9 10     0.9073   0.2632                Yes
     Q10 10     0.7432   0.0029                 No

EXPERIMENTAL GROUP NORMALITÄTSTESTS:
--------------------------------------------------------------------------------
Question  N  Shapiro-W  p-Value Normal Distributed
      Q1 10     0.8022   0.0154                 No
      Q2 10     0.6553   0.0003                 No
      Q3 10     0.7516   0.0037                 No
   

In [19]:
'''
Build the scores for the control and experimental group.
'''
def build_scores(df, question_texts, comparable_questions, side_key, agg="mean"):
    q_to_col = {f"Q{i+1}": f"rating_{i+1}" for i in range(len(question_texts))}
    out = {}
    for cat, sides in comparable_questions.items():
        idxs = sides.get(side_key, [])
        cols = [q_to_col.get(f"Q{i}") for i in idxs]
        cols = [c for c in cols if c in df.columns]
        if not cols:
            continue
        block = df[cols]
        out[cat] = block.mean(axis=1) if agg == "mean" else block.median(axis=1)
    return pd.DataFrame(out)

'''
Summarize the means and standard deviations of the control and experimental group.
'''
def summarize_means_sd(control_df, experimental_df, control_qs, experimental_qs, comparable_questions, agg="mean"):
    ctrl = build_scores(control_df, control_qs, comparable_questions, "control", agg=agg)
    exp  = build_scores(experimental_df, experimental_qs, comparable_questions, "experimental", agg=agg)

    dims = [c for c in comparable_questions.keys() if c in ctrl.columns and c in exp.columns]
    rows = []
    for d in dims:
        x, y = ctrl[d].values, exp[d].values
        rows.append({
            "Construct": d,
            "Control_Mean": np.nanmean(x),
            "Control_SD":   np.nanstd(x, ddof=1),
            "Experimental_Mean": np.nanmean(y),
            "Experimental_SD":   np.nanstd(y, ddof=1),
        })
    df_sum = pd.DataFrame(rows)

    cols = pd.MultiIndex.from_tuples(
        [("Construct",""),
         ("Control","Mean"), ("Control","SD"),
         ("Experimental","Mean"), ("Experimental","SD")],
        names=["", ""]
    )
    data = []
    for _, r in df_sum.iterrows():
        data.append([
            r["Construct"],
            r["Control_Mean"], r["Control_SD"],
            r["Experimental_Mean"], r["Experimental_SD"]
        ])
    table = pd.DataFrame(data, columns=cols)
    table[("Control","Mean")] = table[("Control","Mean")].map(lambda v: f"{v:.3f}")
    table[("Control","SD")]   = table[("Control","SD")].map(lambda v: f"{v:.3f}")
    table[("Experimental","Mean")] = table[("Experimental","Mean")].map(lambda v: f"{v:.3f}")
    table[("Experimental","SD")]   = table[("Experimental","SD")].map(lambda v: f"{v:.3f}")
    return table

'''
Build the table for the control and experimental group.
'''
paper_table = summarize_means_sd(
    control_df, experimental_df,
    control_group_questions, experimental_group_questions,
    comparable_questions, agg="mean"
)

display(paper_table)

Unnamed: 0_level_0,Construct,Control,Control,Experimental,Experimental
Unnamed: 0_level_1,Unnamed: 1_level_1,Mean,SD,Mean,SD
0,Comprehensibility & Interpretability,3.55,0.798,4.1,0.459
1,Confidence in Predictions,2.5,1.08,4.05,0.55
2,Usability for Recruiting,2.65,1.081,4.75,0.486
3,Integration of Human Expertise and AI Support,2.8,0.949,3.85,0.669
4,Perceived Value & Intention to Use,3.15,1.081,3.8,0.753


## Mann-Whitney-U-Test

In [22]:
'''
Calculate the cliffs delta (Δ)
'''
def cliffs_delta(x, y):
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    if len(x) == 0 or len(y) == 0:
        return np.nan
    greater, less = 0, 0
    for xi in x:
        greater += np.sum(xi > y)
        less    += np.sum(xi < y)
    n_pairs = len(x) * len(y)
    if n_pairs == 0:
        return np.nan
    return (greater - less) / n_pairs

'''
Build the dimension scores for the control and experimental group
'''
mapping = {
    "Comprehensibility & Interpretability": {"Control": [1, 2], "Experimental": [1, 2]},
    "Confidence in Predictions":            {"Control": [3, 4], "Experimental": [3, 4]},
    "Usability for Recruiting":             {"Control": [5, 6], "Experimental": [5, 6]},
    "Integration of Human Expertise and AI Support": {"Control": [7, 8], "Experimental": [7, 8]},
    "Perceived Value & Intention to Use":   {"Control": [9, 10], "Experimental": [9, 10]},
}

'''
Build the dimension scores for the control and experimental grou
'''
def build_dimension_scores(df, mapping, group_col="group"):
    df = df.copy()
    if "participant_id" not in df.columns:
        df["participant_id"] = np.arange(1, len(df) + 1)

    rows = []
    for _, row in df.iterrows():
        g = row[group_col]
        pid = row["participant_id"]
        for dim, m in mapping.items():
            idxs = m.get(g, [])
            cols = [f"rating_{i}" for i in idxs if f"rating_{i}" in df.columns]
            vals = [row[c] for c in cols if pd.notnull(row[c])]
            score = np.mean(vals) if len(vals) == 2 else np.nan
            rows.append({"participant_id": pid, "group": g, "dimension": dim, "score": score})
    return pd.DataFrame(rows)

'''
Perform the Mann-Whitney-U-Test
'''
def mwu_test(df, mapping, alpha=0.05):
    long_df = build_dimension_scores(df, mapping, group_col="group")

    rows = []
    for dim in mapping.keys():
        a = long_df[(long_df["dimension"] == dim) & (long_df["group"] == "Control")]["score"].dropna()
        b = long_df[(long_df["dimension"] == dim) & (long_df["group"] == "Experimental")]["score"].dropna()

        if len(a) == 0 or len(b) == 0:
            rows.append({
                "Dimension": dim, "n_Control": len(a), "n_Experimental": len(b),
                "Median (Control)": np.nan, "Median (Experimental)": np.nan,
                "U": np.nan, "p": np.nan, "Cliff's Δ": np.nan,
                "Richtung": "", "Signifikant (α=0.05)": "Nein"
            })
            continue

        U, p = mannwhitneyu(a, b, alternative="two-sided", method="auto")
        delta = cliffs_delta(b, a)  # Δ > 0 ⇒ Experimental > Control

        rows.append({
            "Dimension": dim,
            "Median (Control)": float(np.median(a)),
            "Median (Experimental)": float(np.median(b)),
            "U": float(U),
            "p": float(p),
            "Cliff's Δ": float(delta),
        })

    tests = pd.DataFrame(rows)
    for col in ["Median (Control)", "Median (Experimental)", "U", "p", "Cliff's Δ"]:
        if col in tests.columns:
            tests[col] = tests[col].round(4)
    return tests

'''
Display the Mann-Whitney-U-Test results
'''
table_mwu = mwu_test(df, mapping, alpha=0.05)
display(table_mwu)

Unnamed: 0,Dimension,Median (Control),Median (Experimental),U,p,Cliff's Δ
0,Comprehensibility & Interpretability,4.0,4.25,29.0,0.1069,0.42
1,Confidence in Predictions,2.25,4.0,11.5,0.0033,0.77
2,Usability for Recruiting,2.5,5.0,3.5,0.0004,0.93
3,Integration of Human Expertise and AI Support,2.75,3.75,17.0,0.0126,0.66
4,Perceived Value & Intention to Use,3.25,3.75,32.0,0.1799,0.36
