# In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind_from_stats, t
import math

# Input directories, one per dataset. Each is listed with os.listdir and its
# entries are read as CSV files by the loading loop below.
police_path = 'F:/FacialExpression/Police/TimeSeriesAnalysisResults/MicroExpression'
stim_path = 'F:/FacialExpression/3rdYear_stim/TimeSeriesAnalysisResults/EC_control'
mock_path = 'F:/FacialExpression/3rdYear/TimeSeriesAnalysisResults/EC_control'

# Output root directories, one per dataset.
police_output_path = 'F:/FacialExpression/Police/TimeSeriesAnalysisResults/StatisticalAnalysis'
stim_output_path = 'F:/FacialExpression/3rdYear_stim/TimeSeriesAnalysisResults/StatisticalAnalysis'
mock_output_path = 'F:/FacialExpression/3rdYear/TimeSeriesAnalysisResults/StatisticalAnalysis'

# Active run configuration: which dataset to read and where/how to name the result.
output_file = 'police_ME_AU06_11frame_statistics.csv'
feature_type = 'ME'
path = police_path
output_path = police_output_path

# Zero-based column (after the first CSV column is dropped) and row to sample
# from every file.
# NOTE(review): presumably column 4 is AU06 and row 10 is the 11th frame, going
# by the output file name -- confirm against the CSV layout.
feat_idx = 4
frame_idx = 10

# One sampled value per file, grouped by the file's label.
T_total_result = []
F_total_result = []

# Load data and collect relevant values.
for f in sorted(os.listdir(path)):
    # The label is the last underscore-separated token of the name before the
    # first dot, e.g. "subject01_T.csv" -> "T".
    label = f.split('.')[0].split('_')[-1]
    if label not in ('T', 'F'):
        # Skip unrelated entries *before* reading them: parsing them is wasted
        # work, and a sub-directory or non-CSV file would abort the whole run.
        continue

    # Drop the first column, then pick the configured (row, column) cell.
    values = pd.read_csv(os.path.join(path, f)).to_numpy()[:, 1:]
    sample = values[frame_idx, feat_idx]

    if label == 'T':
        T_total_result.append(sample)
    else:
        F_total_result.append(sample)

# Top-level result container and the confidence level (95%).
statistics = {}
confidence_interval = 0.95

# Group A holds the values collected from "_T" files, group B those from "_F" files.
observation_A = T_total_result
observation_B = F_total_result
observation_statistics = {}

# 1. Independent samples t-test between groups A and B, computed from summary
#    statistics.
#    ttest_ind_from_stats expects the *corrected* sample standard deviation
#    (ddof=1). np.std defaults to the population form (ddof=0), which
#    under-estimates the spread and inflates the t statistic, so ddof=1 is
#    passed explicitly.
truth_mean = np.mean(T_total_result)
truth_std = np.std(T_total_result, ddof=1)
truth_sample_size = len(T_total_result)

deception_mean = np.mean(F_total_result)
deception_std = np.std(F_total_result, ddof=1)
deception_sample_size = len(F_total_result)

# NOTE(review): equal_var is left at its default (True), i.e. Student's pooled
# test; confirm this is intended rather than Welch's test (equal_var=False).
t_statistic, p_value = ttest_ind_from_stats(
    truth_mean, truth_std, truth_sample_size,
    deception_mean, deception_std, deception_sample_size
)
observation_statistics['t-test'] = {'t-statistic': t_statistic, 'p-value': p_value}

# 2. Calculate Cohen's d effect size
def cohen_d(a, b):
    """Return the absolute Cohen's d effect size between samples *a* and *b*.

    The denominator is the root of the unweighted average of the two sample
    variances (ddof=1), so both groups contribute equally regardless of size.
    The result is always non-negative because the mean gap is taken in
    absolute value.
    """
    spread_a = np.std(a, ddof=1)
    spread_b = np.std(b, ddof=1)
    pooled_std = math.sqrt((spread_a ** 2 + spread_b ** 2) / 2)

    gap = np.mean(a) - np.mean(b)
    return abs(gap) / pooled_std

# Effect size: use cohen_d() for equally sized groups, otherwise a pooled
# standard deviation weighted by each group's degrees of freedom.
n1 = len(observation_A)
n2 = len(observation_B)
if n1 == n2:
    effect_size = cohen_d(observation_A, observation_B)
    observation_statistics["Cohen's d"] = effect_size
else:
    # The (n - 1) weights only yield the pooled variance when applied to
    # *sample* variances (ddof=1). With np.var's default ddof=0 the pooled SD
    # is under-estimated and the effect size overstated.
    var_A = np.var(observation_A, ddof=1)
    var_B = np.var(observation_B, ddof=1)
    pooled_std = np.sqrt(((n1 - 1) * var_A + (n2 - 1) * var_B) / (n1 + n2 - 2))
    # NOTE(review): this branch keeps the sign of (mean A - mean B), whereas
    # cohen_d() returns an absolute value -- confirm which convention is wanted.
    effect_size = (np.mean(observation_A) - np.mean(observation_B)) / pooled_std
    observation_statistics["Cohen's d (Adjusted)"] = effect_size

# Steps 3-5: descriptive statistics for the two groups.
# Gather sizes, means and sample standard deviations (ddof=1) first, then
# derive the mean difference and the standard error of that difference.
n1 = len(observation_A)
n2 = len(observation_B)
mean_A = np.mean(observation_A)
mean_B = np.mean(observation_B)
s1 = np.std(observation_A, ddof=1)
s2 = np.std(observation_B, ddof=1)

# Difference of group means (A minus B).
mean_diff = mean_A - mean_B

# Unpooled standard error: root of the summed per-group variance of the mean.
sem = np.sqrt((s1**2 / n1) + (s2**2 / n2))

# Keys are added in this order on purpose: it fixes the row order of the
# exported table.
observation_statistics.update({
    'Mean Difference': mean_diff,
    'Standard Error of the Mean': sem,
    'Mean': {'A_group': mean_A, 'B_group': mean_B},
})

# A two-sided interval at level `confidence_interval` puts half of the
# remaining probability in each tail, so the critical value is taken at
# 1 - (1 - level) / 2 (0.975 for 95%). Passing the level itself to t.ppf gives
# a one-sided critical value, which would really be a 90% two-sided interval.
two_sided_prob = 1 - (1 - confidence_interval) / 2

# 6. Welch-Satterthwaite degrees of freedom and the matching margin of error
#    (only when the sample sizes differ).
if len(observation_A) != len(observation_B):
    var1 = np.var(observation_A, ddof=1)
    var2 = np.var(observation_B, ddof=1)
    n1 = len(observation_A)
    n2 = len(observation_B)
    dof = ((var1 / n1 + var2 / n2) ** 2) / (((var1 / n1) ** 2) / (n1 - 1) + ((var2 / n2) ** 2) / (n2 - 1))

    t_critical = t.ppf(two_sided_prob, dof)
    std_error_diff = np.sqrt((var1 / n1) + (var2 / n2))
    mean_diff_interval = t_critical * std_error_diff
    # NOTE(review): the value stored under 't-value' is the interval half-width
    # (critical value x standard error), not a t statistic. The key is kept
    # unchanged so existing consumers of the CSV keep working.
    observation_statistics['t-value'] = mean_diff_interval
    observation_statistics['Degrees of Freedom'] = dof

# 7. Confidence interval for (mean A - mean B), computed for every run.
#    Degrees of freedom use the conservative choice min(n_A, n_B) - 1.
t_critical = t.ppf(two_sided_prob, min(len(observation_A), len(observation_B)) - 1)
std_err_A = np.std(observation_A, ddof=1) / np.sqrt(len(observation_A))
std_err_B = np.std(observation_B, ddof=1) / np.sqrt(len(observation_B))
mean_diff_interval = t_critical * np.sqrt(std_err_A**2 + std_err_B**2)
mean_A_minus_mean_B = np.mean(observation_A) - np.mean(observation_B)
observation_statistics['Confidence Interval'] = {'lower': mean_A_minus_mean_B - mean_diff_interval,
                                                 'upper': mean_A_minus_mean_B + mean_diff_interval}

# Add observation statistics to the main statistics dictionary.
statistics['Observation'] = observation_statistics

# Convert statistics to a DataFrame (one column per top-level key, one row per
# statistic) and save it as a CSV file under <output_path>/<feature_type>/.
# NOTE(review): entries that are themselves dicts ('t-test', 'Mean',
# 'Confidence Interval') are presumably written as their string form in a
# single cell -- confirm this is the desired CSV layout.
df = pd.DataFrame.from_dict(statistics, orient='columns')
output_dir = os.path.join(output_path, feature_type)
# exist_ok=True creates the directory only when missing, without a separate
# existence check that could race with another process.
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, output_file), index=True)
