In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import math
import itertools
from scipy.stats import (
    ttest_rel,      # Paired t-test
    ttest_ind,      # Independent two-sample t-test
    f_oneway,       # One-way ANOVA
    mannwhitneyu,   # Non-parametric two-sample
    wilcoxon,       # Non-parametric paired
    kruskal,        # Non-parametric version of one-way ANOVA
    ttest_1samp
)


In [None]:
"""
Load and prepare sentiment analysis results.

Reads raw sentiment results from an Excel file, computes the Euclidean distance
between Arabic and English sentiment vectors, calculates per-dimension differences,
and prints a quick preview of the processed DataFrame and its column names.
"""

# Read the raw sentiment results workbook into the working DataFrame.
df = pd.read_excel("sentiment_analysis_full_results raw.xlsx")

# Score families; each one has a "<Name>Score_ar" and a "<Name>Score_en" column.
score_names = ("Positive", "Negative", "Neutral", "Mixed")

# Euclidean (L2) distance: add up the squared AR-EN gap of every family,
# then take the square root of the total.
squared_gap_total = sum(
    (df[f"{name}Score_ar"] - df[f"{name}Score_en"]) ** 2
    for name in score_names
)
df["sentiment_diff"] = np.sqrt(squared_gap_total)

# Signed per-dimension gaps, computed as English minus Arabic.
for score_name in score_names:
    english_scores = df[f"{score_name}Score_en"]
    arabic_scores = df[f"{score_name}Score_ar"]
    df[f"diff_{score_name.lower()}"] = english_scores - arabic_scores

# Quick preview of the processed frame and of its column index.
print("Data sample:\n", df.head())
print("\nColumns:\n", df.columns)


In [None]:
"""
Compute and display mean Euclidean sentiment distances across different
groupings: by model & question, by question alone, by model & sensitivity flag,
by sensitivity flag alone, and by model alone.
"""

def _report_mean_distance(keys, heading):
    """Print the per-group mean of 'sentiment_diff' under `heading` and return it."""
    group_means = df.groupby(keys)["sentiment_diff"].mean()
    print(f"\n=== ITEM 1: {heading} ===")
    print(group_means)
    return group_means


# Each call prints one table and keeps the resulting Series under its own name.
mean_scores = _report_mean_distance(
    ["model", "question_id"], "Mean scores per LLM & question"
)
mean_scores_questions = _report_mean_distance(
    "question_id", "Mean scores per question"
)
mean_scores_sens_llm = _report_mean_distance(
    ["model", "is_sensitive"], "Mean scores by LLM & Sensitive"
)
mean_scores_sens = _report_mean_distance(
    "is_sensitive", "Mean scores by Sensitive"
)
mean_scores_model = _report_mean_distance(
    "model", "Mean scores by model"
)


In [None]:
"""
Configure plotting and generate descriptive visualizations of AR–EN sentiment differences.

This script performs the following steps:
1. Sets up Matplotlib aesthetics using the Aptos font.
2. Defines `compute_stats` to aggregate sentiment difference statistics.
3. Computes statistics grouped by various dimensions.
4. Produces four plots:
   - Mean sentiment difference by model & question (with overall mean background).
   - Mean sentiment difference by model & topic sensitivity.
   - Mean sentiment difference by topic sensitivity.
   - Mean sentiment difference by model.
"""

# Global Matplotlib look-and-feel applied to every figure drawn afterwards.
mpl.rcParams.update({
    "font.family": "sans-serif",
    # Aptos is preferred; DejaVu Sans is the fallback when it is not installed.
    "font.sans-serif": ["Aptos", "DejaVu Sans"],
    "axes.titleweight": "bold",
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
})

def compute_stats(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Summarise 'sentiment_diff' within each group defined by `group_cols`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a 'sentiment_diff' column plus every grouping column.
    group_cols : list of str
        Names of the columns whose value combinations define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by the grouping columns, with columns
        (in this order):
        - mean : mean of sentiment_diff
        - std  : standard deviation of sentiment_diff
        - count: number of non-null observations
        - sem  : standard error of the mean, i.e. std / sqrt(count)
    """
    per_group = df.groupby(group_cols)["sentiment_diff"]
    summary = per_group.agg(["mean", "std", "count"])
    # The standard error is derived from the two columns just aggregated.
    summary["sem"] = summary["std"].div(np.sqrt(summary["count"]))
    return summary


# Descriptive statistics of sentiment_diff for every grouping plotted in this cell.
stats_model_q = compute_stats(df, ["model", "question_id"])
stats_model_sens = compute_stats(df, ["model", "is_sensitive"])
stats_sens = compute_stats(df, ["is_sensitive"])
stats_model = compute_stats(df, ["model"])


# Wide tables for the first plot: one row per question, one column per model.
# `pivot_mean` holds the group means and `pivot_sem` the matching error bars.
pivot_mean = stats_model_q["mean"].reset_index().pivot(
    index="question_id",
    columns="model",
    values="mean"
)
pivot_sem = stats_model_q["sem"].reset_index().pivot(
    index="question_id",
    columns="model",
    values="sem"
)

# Per-question average of the per-model means; drawn as a wide translucent
# bar behind the grouped per-model bars.
overall_mean = pivot_mean.mean(axis=1)
x_positions = np.arange(len(pivot_mean.index))

plt.figure(figsize=(10, 5))
ax = plt.gca()

# Background bar first (zorder=0) so the per-model bars are painted on top.
ax.bar(
    x_positions,
    overall_mean,
    width=0.8,
    alpha=0.3,
    label="Overall Mean",
    zorder=0
)

# Grouped bars, one colour per model, with SEM error bars.
pivot_mean.plot(
    kind="bar",
    yerr=pivot_sem,
    edgecolor="black",
    capsize=4,
    ax=ax,
    zorder=1
)

plt.title("AR–EN Sentiment Difference by LLM and Question")
plt.xlabel("Question ID")
plt.ylabel("Sentiment Difference")
plt.xticks(ticks=x_positions, labels=pivot_mean.index, rotation=45, ha="right")
# Legend is anchored outside the axes so it does not cover the bars.
plt.legend(title="LLM / Overall Mean", bbox_to_anchor=(1.04, 1), loc="upper left")
plt.grid(axis="y", alpha=0.4)
plt.tight_layout()
plt.show()


# Wide tables (rows: model, columns: is_sensitive) for the mean and its SEM.
pivot_mean_ms, pivot_sem_ms = (
    stats_model_sens[stat_name]
    .reset_index()
    .pivot(index="model", columns="is_sensitive", values=stat_name)
    for stat_name in ("mean", "sem")
)

plt.figure(figsize=(9, 5))
ax = plt.gca()
# Grouped bars per model, one bar per sensitivity flag, with SEM error bars.
pivot_mean_ms.plot.bar(
    yerr=pivot_sem_ms,
    edgecolor="black",
    capsize=4,
    ax=ax,
)

ax.set_title("Mean AR–EN Sentiment Difference by LLM and Topic Sensitivity")
ax.set_xlabel("LLM (Model)")
ax.set_ylabel("Mean Sentiment Difference ± SEM")
# Labels are positional: they assume the pivot columns are ordered 0 then 1.
ax.legend(["Non-sensitive", "Sensitive"], title="Topic")
plt.xticks(rotation=45, ha="right")
ax.grid(axis="y", alpha=0.4)
plt.tight_layout()
plt.show()


df_sens_plot = stats_sens.reset_index()

# Tick labels are positional: they assume rows ordered is_sensitive 0 then 1.
topic_labels = ["Non-sensitive", "Sensitive"]

fig_sens = plt.figure(figsize=(5, 4))
axis_sens = fig_sens.gca()
axis_sens.bar(
    topic_labels,
    df_sens_plot["mean"],
    yerr=df_sens_plot["sem"],
    edgecolor="black",
    capsize=4,
)

axis_sens.set_title("Mean AR–EN Sentiment Difference by Topic Sensitivity")
axis_sens.set_xlabel("Topic Sensitivity")
axis_sens.set_ylabel("Mean Sentiment Difference ± SEM")
axis_sens.grid(axis="y", alpha=0.4)
fig_sens.tight_layout()
plt.show()


# Per-model statistics, ordered by descending mean distance.
df_model_plot = stats_model.reset_index().sort_values("mean", ascending=False)

fig_model = plt.figure(figsize=(7, 4))
axis_model = fig_model.gca()
# Horizontal bars: one per model, with the SEM as horizontal error bars.
axis_model.barh(
    df_model_plot["model"],
    df_model_plot["mean"],
    xerr=df_model_plot["sem"],
    edgecolor="black",
    capsize=4,
)

axis_model.set_title("Mean AR–EN Sentiment Difference by LLM")
axis_model.set_xlabel("Mean Sentiment Difference ± SEM")
axis_model.set_ylabel("LLM (Model)")
axis_model.grid(axis="x", alpha=0.4)
fig_model.tight_layout()
plt.show()


In [None]:
"""
This compares the distributions of 'sentiment_diff' for sensitive
(is_sensitive == 1) versus non-sensitive (is_sensitive == 0) questions
across all models. It uses a two-sided Mann–Whitney U test to determine
if the difference between the two groups is statistically significant.
If either group has fewer than two samples, it reports insufficient data.
"""

print("\n=== Compare Sensitive vs Non-Sensitive across ALL LLMs ===")

# Missing distances are dropped up front so that the size guard counts only
# usable observations and no NaN is handed to the rank test (the ANOVA cell
# in this notebook drops NaNs the same way).
sens = df.loc[df["is_sensitive"] == 1, "sentiment_diff"].dropna()
nonsens = df.loc[df["is_sensitive"] == 0, "sentiment_diff"].dropna()

if len(sens) < 2 or len(nonsens) < 2:
    print("Not enough data to test sensitive vs non-sensitive across all LLMs.")
else:
    # Two-sided test: no direction is assumed for the sensitive/non-sensitive gap.
    stat, pval = mannwhitneyu(sens, nonsens, alternative='two-sided')
    print(f"Mann-Whitney (All LLMs, Sens vs. Non-Sens), p={pval:.4f}")


In [None]:
"""
For each language model (LLM), this section compares the distributions of
'sentiment_diff' for sensitive (is_sensitive == 1) versus non-sensitive
(is_sensitive == 0) questions using a two-sided Mann–Whitney U test.
If either subgroup has fewer than two observations, it reports insufficient data.
"""

print("\n=== ITEM 2: Compare Sensitive vs Non-Sensitive (sentiment_diff) per LLM ===")
llms = df["model"].unique()

for llm in llms:
    subset = df[df["model"] == llm]
    # Missing distances are dropped first so the size guard below counts only
    # usable observations and no NaN is handed to the rank test.
    sens = subset.loc[subset["is_sensitive"] == 1, "sentiment_diff"].dropna()
    nonsens = subset.loc[subset["is_sensitive"] == 0, "sentiment_diff"].dropna()

    # Ensure at least two data points in each group for the test
    if len(sens) < 2 or len(nonsens) < 2:
        print(f"LLM={llm}: Not enough data to test sensitive vs non-sensitive.")
        continue

    # Perform two-sided Mann–Whitney U test
    stat, pval = mannwhitneyu(sens, nonsens, alternative='two-sided')
    print(f"LLM={llm}, Mann-Whitney U (sentiment_diff, Sens vs Non-sens): p={pval:.4f}")


In [None]:
"""
Compare sentiment differences across language models.

This code:
- Calculates summary stats (mean, std, count) of 'sentiment_diff' by model.
- Runs one-way ANOVA and Kruskal–Wallis tests across models that have >1 sample.
- Runs a one-sample t-test against zero on each per-dimension difference column.
"""

# Per-model descriptive statistics of the Euclidean sentiment distance.
summary = df.groupby("model")["sentiment_diff"].agg(["mean", "std", "count"])
print(summary)

print("\n=== Compare sentiment_diff across LLMs (ANOVA & Kruskal-Wallis) ===")
# Recomputed here so the cell does not depend on a variable left behind by
# another cell.
llms = df["model"].unique()

# One sample of non-null distances per model; models with fewer than two
# observations are left out of the omnibus tests.
groups = []
models_for_test = []
for llm in llms:
    scores = df.loc[df["model"] == llm, "sentiment_diff"].dropna()
    if len(scores) > 1:
        groups.append(scores)
        models_for_test.append(llm)

if len(groups) >= 2:
    f_stat, p_val = f_oneway(*groups)
    print(f"One-way ANOVA (sentiment_diff) across LLMs: F={f_stat:.4f}, p={p_val:.4f}")
    k_stat, k_p = kruskal(*groups)
    print(f"Kruskal-Wallis (sentiment_diff) across LLMs: H={k_stat:.4f}, p={k_p:.4f}")
else:
    print("Not enough models with >1 observation for statistical tests.")

# One-sample t-test of every signed (EN - AR) difference column against zero,
# pooled over all models.
dims = ["diff_positive", "diff_negative", "diff_mixed", "diff_neutral"]
for col in dims:
    data = df[col].dropna()
    # Same minimum-size guard as the other t-test cells in this notebook.
    if len(data) < 2:
        print(f"{col}: not enough data for a one-sample t-test.")
        continue
    t_stat, p_val = ttest_1samp(data, 0)
    print(f"{col}: t={t_stat:.4f}, p={p_val:.4f} (one-sample t-test against zero)")



In [None]:
"""
Analyze per-dimension sentiment differences and perform one-sample t-tests.

`df` contains:
- PositiveScore_ar, PositiveScore_en
- NegativeScore_ar, NegativeScore_en
- MixedScore_ar,    MixedScore_en
- NeutralScore_ar,  NeutralScore_en
- is_sensitive (0 or 1)

For each sentiment dimension, this script:
1. Prints mean Arabic vs. English scores and their overall mean difference.
2. Runs a one-sample t-test on (EN – AR) for all rows.
3. Runs the same t-test separately for non-sensitive (0) and sensitive (1) subsets.
"""

base_dims = ["PositiveScore", "NegativeScore", "MixedScore", "NeutralScore"]

for dim in base_dims:
    col_ar, col_en = f"{dim}_ar", f"{dim}_en"
    # Signed gap for every row: English score minus Arabic score.
    diff_col = df[col_en] - df[col_ar]

    print(f"\n=== {dim} ===")
    print(
        f"Mean AR: {df[col_ar].mean():.4f}, "
        f"Mean EN: {df[col_en].mean():.4f}, "
        f"Overall mean diff: {diff_col.mean():.4f}"
    )

    # Test over all rows; at least two non-null gaps are required.
    diffs = diff_col.dropna()
    if len(diffs) <= 1:
        print("Overall t-test: not enough data.")
    else:
        t_stat, p_val = ttest_1samp(diffs, 0)
        print(f"Overall t-test: t={t_stat:.4f}, p={p_val:.4f}")

    # Same test repeated within the non-sensitive (0) and sensitive (1) rows.
    for cat in (0, 1):
        rows_in_cat = df[df["is_sensitive"] == cat]
        cat_diffs = (rows_in_cat[col_en] - rows_in_cat[col_ar]).dropna()
        if len(cat_diffs) <= 1:
            print(f"  is_sensitive={cat}: not enough data.")
            continue
        t_stat_cat, p_val_cat = ttest_1samp(cat_diffs, 0)
        print(
            f"  is_sensitive={cat}: mean diff={cat_diffs.mean():.4f}, "
            f"t={t_stat_cat:.4f}, p={p_val_cat:.4f}"
        )


In [None]:
"""
Plot mean Arabic vs. English sentiment scores by dimension and topic sensitivity.

For each sentiment dimension (Positive, Negative, Mixed, Neutral), this script:
1. Computes mean scores for Arabic and English in non-sensitive and sensitive subsets.
2. Performs a paired t-test (EN vs. AR) within each subset when possible.
3. Stores the means and p-values in a data structure.
4. Creates a 2×2 grid of bar plots showing the mean scores for each dimension,
   with separate bars for non-sensitive and sensitive topics.
"""

dimensions = ["PositiveScore", "NegativeScore", "MixedScore", "NeutralScore"]
dim_labels = ["Positive", "Negative", "Mixed", "Neutral"]

ar_color = "#1f77b4"
en_color = "#ff7f0e"


def _subset_means_and_p(flag, col_ar, col_en):
    """
    Summarise one sensitivity subset for one score dimension.

    Rows with `is_sensitive == flag` that have both scores present are used.
    Returns (mean AR, mean EN, p-value of the paired EN-vs-AR t-test).
    An empty subset yields (0.0, 0.0, nan); a single row yields its means
    and a nan p-value.
    """
    rows = df[df["is_sensitive"] == flag].dropna(subset=[col_ar, col_en])
    if rows.empty:
        return 0.0, 0.0, math.nan

    mean_ar = rows[col_ar].mean()
    mean_en = rows[col_en].mean()
    p_value = math.nan
    if len(rows) > 1:
        _, p_value = ttest_rel(rows[col_en], rows[col_ar])
    return mean_ar, mean_en, p_value


# dimension -> (mean AR, mean EN, p) for non-sensitive rows, followed by the
# same triple for sensitive rows.
data_store = {}

for dim in dimensions:
    col_ar, col_en = f"{dim}_ar", f"{dim}_en"
    data_store[dim] = (
        *_subset_means_and_p(0, col_ar, col_en),
        *_subset_means_and_p(1, col_ar, col_en),
    )

def label_bar(rects, value):
    """
    Write `value` (two decimals) centred inside the first bar of `rects`.

    The text is drawn on the Axes that owns the bar (`rect.axes`) rather than
    on pyplot's current Axes, so each label lands in the subplot its bar
    belongs to even when several subplots share one figure.

    Parameters
    ----------
    rects : sequence
        Bar container (or any sequence) whose first element is a bar patch
        exposing `get_x()`, `get_width()` and an `axes` attribute.
    value : float
        Bar height; printed as the label and used to place the text at half
        the bar's height.
    """
    rect = rects[0]
    x_center = rect.get_x() + rect.get_width() / 2
    y_center = value * 0.5
    rect.axes.text(x_center, y_center, f"{value:.2f}",
                   ha="center", va="center", fontsize=8)

fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharey=False)
fig.suptitle("Mean Arabic vs. English Sentiment Scores by Dimension", fontsize=12, y=0.98)

# Layout shared by every panel: two tick positions (non-sensitive, sensitive),
# each carrying an Arabic bar on the left and an English bar on the right.
positions = [0, 1]
width = 0.35
bar_centres = [tick + shift for tick in positions for shift in (-width / 2, width / 2)]
bar_colours = (ar_color, en_color) * 2

# Panels are filled row by row, one score dimension per panel.
for ax, dim, panel_title in zip(axes.flat, dimensions, dim_labels):
    (mean_ar_non, mean_en_non, p_non,
     mean_ar_sens, mean_en_sens, p_sens) = data_store[dim]
    values = [mean_ar_non, mean_en_non, mean_ar_sens, mean_en_sens]

    # Draw all four bars first, then annotate each with its value.
    drawn = [
        ax.bar(centre, height, width, color=colour, edgecolor="black")
        for centre, height, colour in zip(bar_centres, values, bar_colours)
    ]
    for container, height in zip(drawn, values):
        label_bar(container, height)

    # Leave 20% headroom above the tallest bar; fall back to 1.0 when every
    # mean is zero.
    max_val = max(values) if any(values) else 1.0
    ax.set_ylim(0, max_val * 1.2)

    ax.set_xticks(positions)
    ax.set_xticklabels(["Non-Sensitive", "Sensitive"])
    ax.set_title(panel_title)
    ax.set_ylabel("Mean Score")

# Proxy patches give the figure-level legend one entry per language.
handles = [
    plt.Rectangle((0, 0), 1, 1, color=colour, label=language)
    for colour, language in ((ar_color, "Arabic"), (en_color, "English"))
]
fig.legend(handles=handles, loc="upper right", fontsize=9)

# Reserve the top 5% of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
"""
Run one-sample t-tests on per-model difference metrics.

For each language model and each difference column
('diff_positive', 'diff_negative', 'diff_mixed', 'diff_neutral'), this code:
1. Extracts the non-null values for that model and dimension.
2. If there are at least two observations, performs a one-sample t-test
   comparing the mean difference against zero.
3. Prints the model name, dimension, observed mean, and p-value.
"""

import itertools
from scipy.stats import ttest_1samp

llms = df["model"].unique()
dims = ["diff_positive", "diff_negative", "diff_mixed", "diff_neutral"]

# Every (model, difference column) pair, model-major.
for llm, dim in itertools.product(llms, dims):
    values = df.loc[df["model"] == llm, dim].dropna()
    # Pairs with fewer than two observations are skipped silently.
    if len(values) >= 2:
        outcome = ttest_1samp(values, 0)
        print(f"LLM={llm}, dimension={dim}: mean={values.mean():.4f}, p={outcome.pvalue:.4f}")
