# Exercise: Statistics and Plotting in Python

# Load packages

In [None]:
import numpy as np
import pandas as pd
import pingouin as pg

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence UserWarning and FutureWarning messages for the rest of the session
# (presumably to keep the notebook output uncluttered).
# NOTE(review): this also hides deprecation notices raised by the plotting and
# statistics libraries -- confirm that is intended.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Dataset

Let's create another fictional dataset:

- **subject**: Unique subject ID consisting of the group label and a running number that continues across groups (e.g., `MDD_1`, `BD_41`, `HC_81`).
- **group**: Diagnostic group — MDD (Major Depressive Disorder), BD (Bipolar Disorder), or HC (Healthy Control).
- **sex**: Biological sex, coded as `m` (male) or `w` (female).
- **age**: Age in years, drawn from a normal distribution with group-specific means (MDD: 45, BD: 40, HC: 35) and a standard deviation of 7.
- **diet**: Reported diet, either mediterranean or western.
- **season**: Within-subject variable — whether the data point was collected in summer or winter.
- **SleepScore**: Subjective sleep quality on a scale from 1 (poor) to 10 (excellent), group-specific.
- **Fatigue**: Daytime fatigue on a scale from 1 (none) to 10 (severe), inversely related to SleepScore plus noise.
- **KL**: Concentration performance score (from the d2 test), realistic range (60–130), group-specific.
- **HAMD**: Depression severity (Hamilton Depression Rating Scale, 0–35), highest in MDD, then BD, then HC.

**Run the code below to generate the dataset:**

In [None]:
import numpy as np
import pandas as pd

# Fix the seed so the simulated dataset is identical on every run
np.random.seed(42)

# Number of participants per diagnostic group
n_mdd = 40
n_bd = 40
n_hc = 40
n_total = n_mdd + n_bd + n_hc

# --- Between-subject variables: one value per participant ---
# The order of the np.random calls below determines the generated values,
# so it must not be changed.
groups = ['MDD'] * n_mdd + ['BD'] * n_bd + ['HC'] * n_hc
sexes = np.random.choice(['m', 'w'], size=n_total)

ages_mdd = np.random.normal(45, 7, n_mdd)
ages_bd = np.random.normal(40, 7, n_bd)
ages_hc = np.random.normal(35, 7, n_hc)
ages = np.round(np.concatenate([ages_mdd, ages_bd, ages_hc]), 1)

diets = np.random.choice(['mediterranean', 'western'], size=n_total)

# IDs combine the group label with a running number over the whole sample
subjects = [f"{label}_{number}" for number, label in enumerate(groups, start=1)]

# --- Long format: one row per participant and season ---
season = ['summer', 'winter']


def _repeat_per_season(values):
    """Return the entries of *values*, each repeated once per season."""
    return [value for value in values for _ in season]


subject_list = _repeat_per_season(subjects)
group_list = _repeat_per_season(groups)
sex_list = _repeat_per_season(sexes)
age_list = _repeat_per_season(ages)
diet_list = _repeat_per_season(diets)
# Seasons cycle summer, winter, summer, winter, ... alongside the repeated rows
season_list = season * len(subjects)

df = pd.DataFrame({
    'subject': subject_list,
    'group': group_list,
    'sex': sex_list,
    'age': age_list,
    'diet': diet_list,
    'season': season_list,
})

# Simulate subjective sleep quality with a group effect and a season effect
def simulate_sleepscore(row):
    """Draw one SleepScore for a single measurement.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['group']`` ('MDD', 'BD' or 'HC') and
        ``row['season']``.

    Returns
    -------
    float
        Group mean plus season shift plus standard-normal noise, limited to
        the 1-10 range of the scale.

    Raises
    ------
    KeyError
        If ``row['group']`` is not one of the three known groups.
    """
    group_means = {'MDD': 4, 'BD': 5, 'HC': 7}
    group_mean = group_means[row['group']]

    # Sleep is slightly better in summer; every other season value counts as winter
    if row['season'] == 'summer':
        seasonal_shift = 0.5
    else:
        seasonal_shift = -0.5

    # Exactly one random draw per call, using the global NumPy random state
    score = group_mean + seasonal_shift + np.random.normal(0, 1)
    return np.clip(score, 1, 10)

# SleepScore is simulated first and the fatigue noise second; this order fixes
# which random draws each variable receives.
df['SleepScore'] = df.apply(simulate_sleepscore, axis=1)

# Fatigue mirrors SleepScore (11 - score) plus standard-normal noise and is
# limited to the 1-10 range of the scale.
df['Fatigue'] = (
    11 - df['SleepScore'] + np.random.normal(0, 1, size=len(df))
).clip(1, 10)

# KL (concentration performance)
def simulate_kl(row):
    """Draw one KL score for a single measurement.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['group']`` ('MDD', 'BD' or 'HC') and
        ``row['season']``.

    Returns
    -------
    float
        Group mean plus season shift plus normal noise (SD 8), limited to
        the range 60-130.

    Raises
    ------
    KeyError
        If ``row['group']`` is not one of the three known groups.
    """
    group_means = {'MDD': 75, 'BD': 90, 'HC': 110}
    group_mean = group_means[row['group']]

    # Two points higher in summer, two points lower for any other season value
    if row['season'] == 'summer':
        seasonal_shift = 2
    else:
        seasonal_shift = -2

    # Exactly one random draw per call, using the global NumPy random state
    score = group_mean + seasonal_shift + np.random.normal(0, 8)
    return np.clip(score, 60, 130)

# Apply row by row so that every measurement (subject x season) gets its own
# random draw from simulate_kl.
df['KL'] = df.apply(simulate_kl, axis=1)

# HAMD (depression severity)
def simulate_hamd(row):
    """Draw one HAMD score for a single measurement.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['group']`` ('MDD', 'BD' or 'HC'). The season is
        not used, so HAMD carries no season effect.

    Returns
    -------
    float
        Group mean plus normal noise (SD 4), limited to the range 0-35.

    Raises
    ------
    KeyError
        If ``row['group']`` is not one of the three known groups.
    """
    group_means = {'MDD': 22, 'BD': 15, 'HC': 5}
    # Exactly one random draw per call, using the global NumPy random state
    score = group_means[row['group']] + np.random.normal(0, 4)
    return np.clip(score, 0, 35)

# HAMD is drawn separately for every row, so a subject's summer and winter
# values differ even though simulate_hamd has no season effect.
df['HAMD'] = df.apply(simulate_hamd, axis=1)

# Make sure the rows are numbered 0..n-1
df.reset_index(drop=True, inplace=True)
# `data` is a second name for the same DataFrame object, not a copy
data=df
# Preview the first ten rows (the first five subjects, two seasons each)
data.head(10)

# Descriptive Statistics

### ❓ Question: Age Distribution

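Print an overview (including mean and standard deviation) of the age distribution of the sample. Keep in mind that every subject appears twice in the dataset (once per season):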

<details>
<summary>💡 Show solution</summary>

```
# Every subject has two rows (summer and winter); keep one row per subject
# so that each age enters the summary only once.
data_unique = data.drop_duplicates(subset='subject')
# describe() reports count, mean, std, min, quartiles and max
data_unique['age'].describe()
```
</details>

### ❓ Question: Sex Ratio
Print the counts of each sex per group:

<details>
<summary>💡 Show solution</summary>

```
# One row per participant; otherwise every subject would be counted twice
# (once per season).
unique_subjects = data.drop_duplicates(subset='subject')

# sort=False keeps the groups in their order of first appearance in the data
for group, members in unique_subjects.groupby('group', sort=False):
    print(f"Sex counts for group {group}:")
    print(members['sex'].value_counts())
</details>

### ❓ Question: HAMD Scores

Calculate and print the mean HAMD scores for each group:

<details>
<summary>💡 Show solution</summary>

```
# Keep one row per subject. drop_duplicates keeps the first occurrence, which
# is the summer measurement because summer rows precede winter rows.
unique_subjects = data.drop_duplicates(subset='subject')
# Mean HAMD score within each diagnostic group
group_means = unique_subjects.groupby('group')['HAMD'].mean()
print(group_means)
```
</details>

# Statistical Tests

### ❓ Question: Group Differences in HAMD

Test whether the groups (MDD, BD, HC) differ significantly in their HAMD scores.

**Hint:** Use a one-way ANOVA.

<details>
<summary>💡 Show solution</summary>

```
# A one-way ANOVA needs independent observations, so use one row per subject.
# Defined here as well so this solution also runs on its own.
unique_subjects = data.drop_duplicates(subset='subject')

# One-way between-subjects ANOVA: HAMD as outcome, diagnostic group as factor.
# detailed=True asks pingouin for the full ANOVA table.
anova = pg.anova(data=unique_subjects, dv='HAMD', between='group', detailed=True)
print(anova)
```
</details>

Use post-hoc tests for pairwise comparisons.

<details>
<summary>💡 Show solution</summary>

```
# Tukey HSD post-hoc test: compares every pair of groups on HAMD.
# Uses the one-row-per-subject table `unique_subjects` created earlier.
posthoc = pg.pairwise_tukey(dv='HAMD', between='group', data=unique_subjects)
print(posthoc)
```
</details>

Visualize the data (HAMD scores per group):

<details>
<summary>💡 Show solution</summary>

```
plt.figure(figsize=(8,6))
# Box plot of the HAMD distribution per group.
# NOTE(review): recent seaborn versions (0.13+) emit a FutureWarning when
# `palette` is passed without `hue`; the suggested replacement there is
# hue='group', legend=False -- confirm against the installed version.
sns.boxplot(data=unique_subjects, x='group', y='HAMD', palette='Set2')
# Overlay the individual subjects as semi-transparent, jittered points
sns.stripplot(data=unique_subjects, x='group', y='HAMD', color='black', alpha=0.5, jitter=True)

plt.title('HAMD Scores by Group')
plt.xlabel('Group')
plt.ylabel('HAMD Score')
plt.show()
```
</details>

### ❓ Question: Sleep Quality by Group and Season

Is sleep quality influenced by both group membership (MDD, BD, HC) and season (winter, summer)?

**Hint:** Use a mixed ANOVA.

<details>
<summary>💡 Show solution</summary>

```
# Mixed ANOVA on the full long-format data (two rows per subject):
#   within  = season  (repeated measure: summer vs. winter)
#   between = group   (MDD, BD, HC)
#   subject = column that links the two rows of each participant
# correction=False: no sphericity correction is requested (season has only
# two levels). effsize='np2' reports partial eta-squared.
aov = pg.mixed_anova(data=data, 
                     dv='SleepScore', 
                     within='season', 
                     between='group', 
                     subject='subject', 
                     correction=False, 
                     effsize='np2')

# Pretty-print the ANOVA table
pg.print_table(aov)
```
</details>

Visualize the results:

<details>
<summary>💡 Show solution</summary>

```
# Set visual style
sns.set(style="whitegrid", context="talk")

# Create the plot: one line per group showing the mean SleepScore in each
# season with error bars; dodge=True offsets the groups horizontally so the
# error bars do not overlap.
# NOTE(review): `errwidth` is deprecated in recent seaborn versions (0.13+) in
# favour of err_kws={'linewidth': 1.5} -- confirm against the installed version.
plt.figure(figsize=(8, 6))
sns.pointplot(data=data, 
              x='season', 
              y='SleepScore', 
              hue='group', 
              dodge=True, 
              markers=["o", "s", "D"], 
              capsize=.1, 
              errwidth=1.5, 
              palette='Set2')

# Customize labels and title
plt.title("Subjective Sleep Quality by Season and Group")
plt.xlabel("Season")
plt.ylabel("Sleep Score")
plt.legend(title="Group")
plt.tight_layout()
plt.show()

```
</details>

### ❓ Question: Sleep Quality and Fatigue

Investigate whether subjective sleep quality (SleepScore) and daytime fatigue (Fatigue) are associated.

**Hint:** Use a Pearson correlation.

<details>
<summary>💡 Show solution</summary>

```
# pg.corr computes a Pearson correlation unless another `method` is passed.
# NOTE(review): `data` holds two rows per subject (summer and winter), so the
# observations are not independent -- consider correlating within one season
# or using a repeated-measures correlation.
correlation = pg.corr(data['SleepScore'], data['Fatigue'])
print(correlation)
```
</details>

Visualize the results:

<details>
<summary>💡 Show solution</summary>

```
sns.set(style="whitegrid", context="talk")

# Create jointplot with regression line: scatter plot of the two variables
# plus their marginal distributions
plot = sns.jointplot(
    data=data,
    x='SleepScore',
    y='Fatigue',
    kind='reg',
    height=6,
    scatter_kws={'alpha': 0.6},
    line_kws={'color': 'red'}
)

# jointplot creates its own figure, so the title is set on that figure
plot.fig.suptitle('Sleep Score vs. Fatigue', fontsize=16)
plot.fig.tight_layout()
# Leave room at the top so the title does not overlap the marginal plot
plot.fig.subplots_adjust(top=0.95)
# Show the figure explicitly, as the other plotting solutions do; without
# this it does not appear outside an inline notebook backend
plt.show()
```
</details>