In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# Path to your HYPERAKTIV dataset
BASE = "/Users/callyminner/StThomas/SEIS631/ChooseYourOwnHypothesis/Repo/Data/hyperaktiv/"  

def read_semicolon_csv(path):
    """
    Reads semicolon-separated CSVs where delimiters may include extra spaces.
    Uses regex to handle both ';' and ';   ' and prevents parsing errors.
    """
    return pd.read_csv(path, sep=r';\s*', engine='python')

# -------------------------
# LOAD patient_info.csv
# -------------------------
patient_info = read_semicolon_csv(BASE + "patient_info.csv")

In [2]:
patient_info = patient_info.rename(columns={
    "ID": "patient_id",
    "SEX": "sex",
    "AGE": "age_group"
})

# 2. Convert sex from 0/1 to categorical labels
sex_map = {0: "Female", 1: "Male"}
patient_info["sex"] = patient_info["sex"].map(sex_map)

# 3. Convert age_group codes to real age ranges
age_map = {
    1: "17-29",
    2: "30-39",
    3: "40-49",
    4: "50-67"
}
patient_info["age_group_label"] = patient_info["age_group"].map(age_map)

# 4. Convert diagnosis fields (0/1) to boolean integers
diagnosis_cols = [
    "ADHD", "ADD", "BIPOLAR", "UNIPOLAR",
    "ANXIETY", "SUBSTANCE", "OTHER"
]

for col in diagnosis_cols:
    patient_info[col] = patient_info[col].astype(int)

# 5. Ensure clinical scales are numeric
clinical_cols = [
    "WURS", "ASRS", "MADRS", "HADS_A", "HADS_D"
]

for col in clinical_cols:
    patient_info[col] = pd.to_numeric(patient_info[col], errors="coerce")

# 6. Convert medication flags to booleans
med_cols = [
    "MED", "MED_Antidepr", "MED_Moodstab", "MED_Antipsych",
    "MED_Anxiety_Benzo", "MED_Sleep",
    "MED_Analgesics_Opioids", "MED_Stimulants"
]

for col in med_cols:
    patient_info[col] = patient_info[col].fillna(0).astype(int)

# 7. OPTIONAL: Remove the odd filter column unless needed
if "filter_$" in patient_info.columns:
    patient_info = patient_info.drop(columns=["filter_$"])

In [3]:
# ----------------------------------------------------
# STEP 3 — BUILD MASTER TABLE
# ----------------------------------------------------

# Load the remaining top-level files
features = read_semicolon_csv(BASE + "features.csv")
cpt = read_semicolon_csv(BASE + "CPT_II_ConnersContinuousPerformanceTest.csv")

# Rename ID column if present
if "ID" in features.columns:
    features = features.rename(columns={"ID": "patient_id"})
if "ID" in cpt.columns:
    cpt = cpt.rename(columns={"ID": "patient_id"})

# Ensure patient_id is string for safe merging
patient_info["patient_id"] = patient_info["patient_id"].astype(str)
features["patient_id"] = features["patient_id"].astype(str)
cpt["patient_id"] = cpt["patient_id"].astype(str)

# Perform incremental merges
master = patient_info.merge(features, on="patient_id", how="left")
master = master.merge(cpt, on="patient_id", how="left")

In [4]:
master[['ADHD', 'BIPOLAR']]   #do a permutation test by scrambling one of these columns

Unnamed: 0,ADHD,BIPOLAR
0,1,1
1,1,0
2,1,1
3,1,1
4,1,1
...,...,...
98,0,0
99,0,1
100,0,0
101,0,1


In [5]:
pd.crosstab(master['ADHD'], master['BIPOLAR'])  #then make this from the scrambled results? chi-square test of significance

BIPOLAR,0,1
ADHD,Unnamed: 1_level_1,Unnamed: 2_level_1
0,28,24
1,33,18


In [6]:
master[['ADHD', 'UNIPOLAR']]

Unnamed: 0,ADHD,UNIPOLAR
0,1,0
1,1,1
2,1,0
3,1,0
4,1,0
...,...,...
98,0,1
99,0,0
100,0,1
101,0,0


In [7]:
pd.crosstab(master['ADHD'], master['UNIPOLAR'])

UNIPOLAR,0,1
ADHD,Unnamed: 1_level_1,Unnamed: 2_level_1
0,38,14
1,34,17


In [8]:
master[['ADHD', 'ANXIETY']]

Unnamed: 0,ADHD,ANXIETY
0,1,1
1,1,0
2,1,0
3,1,1
4,1,1
...,...,...
98,0,1
99,0,1
100,0,1
101,0,1


In [9]:
pd.crosstab(master['ADHD'], master['ANXIETY'])

ANXIETY,0,1,9
ADHD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16,35,1
1,29,22,0


In [10]:
master[['ADHD', 'SUBSTANCE']]

Unnamed: 0,ADHD,SUBSTANCE
0,1,0
1,1,0
2,1,1
3,1,1
4,1,0
...,...,...
98,0,0
99,0,0
100,0,1
101,0,0


In [11]:
pd.crosstab(master['ADHD'], master['SUBSTANCE'])

SUBSTANCE,0,1,9
ADHD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,43,8,1
1,35,16,0
