In [9]:
import numpy as np
import pandas as pd
import random
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ================================
# Cognitive Sandstorm Task Data Generator
# Flexible Human Collective Wisdom replication (equal-strength condition)
# ================================

# Participant-specific parameters from the paper
np.random.seed(42)
random.seed(42)

In [2]:
def compute_mu_signal(mu_noise, sigma_0, SNReff, n=3):
    return mu_noise + (sigma_0 * (SNReff / math.sqrt(n)))

def generate_participant_parameters():
    params = {}
    n = 3
    bounds = {1: (260, 290), 2: (350, 380), 3: (440, 470)}
    SNReff_weak, SNReff_medium, SNReff_strong = 0.5, 1.35, 6.0

    for pid in [1, 2, 3]:
        mu_noise = random.uniform(*bounds[pid])
        mu_medium = mu_noise * 1.15
        sigma_0 = (mu_medium - mu_noise) / (SNReff_medium / math.sqrt(n))
        mu_weak = compute_mu_signal(mu_noise, sigma_0, SNReff_weak, n)
        mu_strong = compute_mu_signal(mu_noise, sigma_0, SNReff_strong, n)

        params[pid] = {
            'sigma': round(sigma_0, 2),
            'mu_noise': round(mu_noise, 2),
            'mu_weak': round(mu_weak, 2),
            'mu_medium': round(mu_medium, 2),
            'mu_strong': round(mu_strong, 2)
        }

    return params

participant_params = generate_participant_parameters()
print("Generated participant parameters:")
for pid, p in participant_params.items():
    print(f"Participant {pid}: {p}")

Generated participant parameters:
Participant 1: {'sigma': 53.73, 'mu_noise': 279.18, 'mu_weak': 294.69, 'mu_medium': 321.06, 'mu_strong': 465.3}
Participant 2: {'sigma': 67.5, 'mu_noise': 350.75, 'mu_weak': 370.24, 'mu_medium': 403.36, 'mu_strong': 584.58}
Participant 3: {'sigma': 86.27, 'mu_noise': 448.25, 'mu_weak': 473.15, 'mu_medium': 515.49, 'mu_strong': 747.08}


In [3]:
# Trial generation functions

def generate_trial(class_label, participant_id, strength='medium', n_measurements=3):
    params = participant_params[participant_id]
    sigma = params['sigma']
    
    if class_label == 0:
        mu = params['mu_noise']
    elif class_label == 1:
        mu = params[f'mu_{strength}']
    else:
        raise ValueError("class_label must be 0 or 1")
    
    measurements = np.random.normal(mu, sigma, n_measurements)
    return np.round(measurements).astype(int)

def generate_correlated_trials(class_label, strength='medium', n_measurements=3, rho=0.2):
    """
    Generate correlated measurements across all participants.
    """
    mus = [participant_params[pid]['mu_noise'] if class_label==0 else participant_params[pid][f'mu_{strength}'] for pid in [1,2,3]]
    sigma = participant_params[1]['sigma']  # Assuming same sigma across participants for simplicity

    cov = rho * sigma**2
    cov_matrix = np.full((3,3), cov)
    np.fill_diagonal(cov_matrix, sigma**2)

    samples = np.random.multivariate_normal(mus, cov_matrix, n_measurements)
    return np.round(samples).astype(int).T  # Shape: 3 x n_measurements

# -----------------------------
# Dataset generation

def generate_dataset(num_trials_per_class=5000, condition='equal', correlated=False, rho=0.2):
    """
    Generate datasets for each participant.
    """
    data = {1: [], 2: [], 3: []}
    
    for _ in range(num_trials_per_class):
        for class_label in [0,1]:
            if correlated:
                samples = generate_correlated_trials(class_label, strength='medium', n_measurements=3, rho=rho)
                for pid, meas in zip([1,2,3], samples):
                    data[pid].append([class_label] + meas.tolist())
            else:
                for pid in [1,2,3]:
                    if condition == 'mixture' and class_label == 1:
                        strength = random.choices(['weak','medium','strong'], weights=[1,1,1])[0]
                    else:
                        strength = 'medium'
                    meas = generate_trial(class_label, pid, strength=strength)
                    data[pid].append([class_label] + meas.tolist())

    for pid in [1,2,3]:
        df = pd.DataFrame(data[pid], columns=['label','feat1','feat2','feat3'])
        np.random.shuffle(df.values)
        df.to_csv(f"participant_{pid}_data_{condition}_condition.csv", index=False)
        print(f"Saved participant_{pid}_data_{condition}_condition.csv with shape {df.shape}")


In [7]:
if __name__ == "__main__":
    # Equal-strength condition
    generate_dataset(num_trials_per_class=5000, condition='equal', correlated=False)
    
    # Mixture condition
    generate_dataset(num_trials_per_class=5000, condition='mixture', correlated=False)
    
    # Correlated group decision phase
    generate_dataset(num_trials_per_class=5000, condition='equal', correlated=True, rho=0.2)

Saved participant_1_data_equal_condition.csv with shape (10000, 4)
Saved participant_2_data_equal_condition.csv with shape (10000, 4)
Saved participant_3_data_equal_condition.csv with shape (10000, 4)
Saved participant_1_data_mixture_condition.csv with shape (10000, 4)
Saved participant_2_data_mixture_condition.csv with shape (10000, 4)
Saved participant_3_data_mixture_condition.csv with shape (10000, 4)
Saved participant_1_data_equal_condition.csv with shape (10000, 4)
Saved participant_2_data_equal_condition.csv with shape (10000, 4)
Saved participant_3_data_equal_condition.csv with shape (10000, 4)


In [13]:
def load_data(filepath):
    df = pd.read_csv(filepath)
    X = df[['feat1', 'feat2', 'feat3']]
    y = df['label']
    return X, y

def train_logistic_regression(X_train, y_train):
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_train, y_train, X_test, y_test, participant_id):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    report = classification_report(y_test, y_test_pred)

    print(f"Participant {participant_id} Training Accuracy: {train_acc:.4f}")
    print(f"Participant {participant_id} Test Accuracy: {test_acc:.4f}")
    print(f"Participant {participant_id} Classification Report:\n{report}")

def save_model(model, filename):
    joblib.dump(model, filename)
    print(f"Model saved to {filename}")

In [14]:
if __name__ == "__main__":
    # Generate datasets if not already generated
    generate_dataset(num_trials_per_class=5000, condition='equal')
    
    # Train 3 independent logistic regression models with different seeds
    seeds = [42, 123, 999]
    for idx, seed in enumerate(seeds, start=1):
        X, y = load_data(f"participant_{idx}_data_equal_condition.csv")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        model = train_logistic_regression(X_train, y_train)
        evaluate_model(model, X_train, y_train, X_test, y_test, participant_id=idx)
        save_model(model, f"logistic_regression_participant{idx}.pkl")

Saved participant_1_data_equal_condition.csv with shape (10000, 4)
Saved participant_2_data_equal_condition.csv with shape (10000, 4)
Saved participant_3_data_equal_condition.csv with shape (10000, 4)
Participant 1 Training Accuracy: 0.7470
Participant 1 Test Accuracy: 0.7445
Participant 1 Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.75      0.74       990
           1       0.75      0.74      0.75      1010

    accuracy                           0.74      2000
   macro avg       0.74      0.74      0.74      2000
weighted avg       0.74      0.74      0.74      2000

Model saved to logistic_regression_participant1.pkl
Participant 2 Training Accuracy: 0.7480
Participant 2 Test Accuracy: 0.7525
Participant 2 Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.74      0.75      1017
           1       0.74      0.76      0.75       983

    accuracy                  