## Introduction 

In [1]:
import pandas as pd 
import numpy as np 
import zipfile  
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline

from sklearn.preprocessing import StandardScaler  
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


## Load Datasets 

* Train Dataset

In [2]:
import zipfile

zip_path = r"C:\Users\wanji\Desktop\WIDS Challenge\widsdatathon2025.zip"

# List files inside the ZIP
with zipfile.ZipFile(zip_path, 'r') as z:
    print(z.namelist())  # This will list all files inside the ZIP


['Data Dictionary.xlsx', 'SAMPLE_SUBMISSION.xlsx', 'TEST/TEST_CATEGORICAL.xlsx', 'TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv', 'TEST/TEST_QUANTITATIVE_METADATA.xlsx', 'TRAIN/TRAINING_SOLUTIONS.xlsx', 'TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx', 'TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv', 'TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx']


In [3]:
# File paths
train_fc_path = r"C:\Users\wanji\Desktop\WIDS Challenge\widsdatathon2025\TRAIN\TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv"
train_cat_path = r"C:\Users\wanji\Desktop\WIDS Challenge\widsdatathon2025\TRAIN\TRAIN_CATEGORICAL_METADATA.xlsx"
train_quant_path = r"C:\Users\wanji\Desktop\WIDS Challenge\widsdatathon2025\TRAIN\TRAIN_QUANTITATIVE_METADATA.xlsx"

# Load datasets
train_fc = pd.read_csv(train_fc_path)
train_cat = pd.read_excel(train_cat_path)
train_quant = pd.read_excel(train_quant_path)

# Inspect data
print("Functional Connectome Matrices Shape:", train_fc.shape)
print("Categorical Metadata Shape:", train_cat.shape)
print("Quantitative Metadata Shape:", train_quant.shape)

train_cat.head()

Functional Connectome Matrices Shape: (1213, 19901)
Categorical Metadata Shape: (1213, 10)
Quantitative Metadata Shape: (1213, 19)


Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0


* Test Dataset 

In [4]:
# File paths
test_fc_path = r"C:\Users\wanji\Desktop\WIDS Challenge\widsdatathon2025\TRAIN\TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv"
test_cat_path = r"C:\Users\wanji\Desktop\WIDS Challenge\widsdatathon2025\TRAIN\TRAIN_CATEGORICAL_METADATA.xlsx"
test_quant_path = r"C:\Users\wanji\Desktop\WIDS Challenge\widsdatathon2025\TRAIN\TRAIN_QUANTITATIVE_METADATA.xlsx"

# Load datasets
test_fc = pd.read_csv(train_fc_path)
test_cat = pd.read_excel(train_cat_path)
test_quant = pd.read_excel(train_quant_path)

# Inspect data
print("Functional Connectome Matrices Shape:", test_fc.shape)
print("Categorical Metadata Shape:", test_cat.shape)
print("Quantitative Metadata Shape:", test_quant.shape)

train_cat.head()  

Functional Connectome Matrices Shape: (1213, 19901)
Categorical Metadata Shape: (1213, 10)
Quantitative Metadata Shape: (1213, 19)


Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0


In [5]:
##Merge the datasets: Training and testing 
train_data = train_fc.merge(train_quant, on = "participant_id").merge(train_cat, on = "participant_id")
train_data.shape
train_data.head()

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,MRI_Track_Age_at_Scan,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,70z8Q2xdTXM3,0.093473,0.146902,0.067893,0.015141,0.070221,0.063997,0.055382,-0.035335,0.068583,...,11.889002,2018,1,0.0,1,2,21,45,21,45
1,WHWymJu6zNZi,0.02958,0.179323,0.112933,0.038291,0.104899,0.06425,0.008488,0.077505,-0.00475,...,7.670088,2015,1,1.0,8,1,6,5,0,15
2,4PAQp1M6EyAo,-0.05158,0.139734,0.068295,0.046991,0.111085,0.026978,0.151377,0.021198,0.083721,...,7.743896,2019,1,0.0,0,2,18,35,9,20
3,obEacy4Of68I,0.016273,0.204702,0.11598,0.043103,0.056431,0.057615,0.055773,0.07503,0.001033,...,,2017,1,0.0,0,2,21,40,21,40
4,s7WzzDcmDOhF,0.065771,0.098714,0.097604,0.112988,0.071139,0.085607,0.019392,-0.036403,-0.020375,...,,2019,1,2.0,8,2,9,35,0,0


In [6]:
##Merge the datasets: Training and testing 
test_data = test_fc.merge(test_quant, on = "participant_id").merge(test_cat, on = "participant_id")
test_data.shape
test_data.head()

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,MRI_Track_Age_at_Scan,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,70z8Q2xdTXM3,0.093473,0.146902,0.067893,0.015141,0.070221,0.063997,0.055382,-0.035335,0.068583,...,11.889002,2018,1,0.0,1,2,21,45,21,45
1,WHWymJu6zNZi,0.02958,0.179323,0.112933,0.038291,0.104899,0.06425,0.008488,0.077505,-0.00475,...,7.670088,2015,1,1.0,8,1,6,5,0,15
2,4PAQp1M6EyAo,-0.05158,0.139734,0.068295,0.046991,0.111085,0.026978,0.151377,0.021198,0.083721,...,7.743896,2019,1,0.0,0,2,18,35,9,20
3,obEacy4Of68I,0.016273,0.204702,0.11598,0.043103,0.056431,0.057615,0.055773,0.07503,0.001033,...,,2017,1,0.0,0,2,21,40,21,40
4,s7WzzDcmDOhF,0.065771,0.098714,0.097604,0.112988,0.071139,0.085607,0.019392,-0.036403,-0.020375,...,,2019,1,2.0,8,2,9,35,0,0


## Data Exploration


In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1213 entries, 0 to 1212
Columns: 19928 entries, participant_id to Barratt_Barratt_P2_Occ
dtypes: float64(19903), int64(24), object(1)
memory usage: 184.4+ MB


In [8]:
train_data.describe()

Unnamed: 0,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,0throw_10thcolumn,...,MRI_Track_Age_at_Scan,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
count,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,...,853.0,1213.0,1213.0,1202.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0
mean,0.060553,0.122315,0.060268,0.041287,0.069722,0.091007,0.066852,0.000252,0.014128,-0.002914,...,11.245678,2017.652102,2.014839,0.424293,2.080791,2.288541,17.641385,24.892828,14.122012,24.719703
std,0.064178,0.054026,0.057495,0.043491,0.044222,0.049189,0.046864,0.049046,0.038205,0.042462,...,3.234372,1.122522,1.135147,0.68747,3.164636,0.758348,4.004639,17.025899,7.201023,17.171113
min,-0.183279,-0.059932,-0.145566,-0.127827,-0.072043,-0.079184,-0.105722,-0.164297,-0.137728,-0.14849,...,0.0,2015.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.018482,0.086102,0.026548,0.014457,0.042462,0.057614,0.036934,-0.031358,-0.010635,-0.030538,...,8.803901,2017.0,1.0,0.0,0.0,2.0,15.0,0.0,12.0,5.0
50%,0.058276,0.12322,0.061339,0.043246,0.067066,0.086494,0.067247,0.002549,0.01613,-0.002604,...,10.739219,2018.0,1.0,0.0,0.0,2.0,18.0,30.0,18.0,30.0
75%,0.100103,0.154518,0.099056,0.068408,0.096504,0.119404,0.095117,0.031053,0.03877,0.024507,...,13.460871,2019.0,3.0,1.0,2.0,3.0,21.0,40.0,21.0,40.0
max,0.321522,0.390895,0.278429,0.189825,0.3175,0.316811,0.270018,0.168196,0.145364,0.128301,...,21.564453,2020.0,4.0,3.0,11.0,4.0,21.0,45.0,21.0,45.0


In [9]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1213 entries, 0 to 1212
Columns: 19928 entries, participant_id to Barratt_Barratt_P2_Occ
dtypes: float64(19903), int64(24), object(1)
memory usage: 184.4+ MB


In [10]:
test_data.describe()

Unnamed: 0,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,0throw_10thcolumn,...,MRI_Track_Age_at_Scan,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
count,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,...,853.0,1213.0,1213.0,1202.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0
mean,0.060553,0.122315,0.060268,0.041287,0.069722,0.091007,0.066852,0.000252,0.014128,-0.002914,...,11.245678,2017.652102,2.014839,0.424293,2.080791,2.288541,17.641385,24.892828,14.122012,24.719703
std,0.064178,0.054026,0.057495,0.043491,0.044222,0.049189,0.046864,0.049046,0.038205,0.042462,...,3.234372,1.122522,1.135147,0.68747,3.164636,0.758348,4.004639,17.025899,7.201023,17.171113
min,-0.183279,-0.059932,-0.145566,-0.127827,-0.072043,-0.079184,-0.105722,-0.164297,-0.137728,-0.14849,...,0.0,2015.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.018482,0.086102,0.026548,0.014457,0.042462,0.057614,0.036934,-0.031358,-0.010635,-0.030538,...,8.803901,2017.0,1.0,0.0,0.0,2.0,15.0,0.0,12.0,5.0
50%,0.058276,0.12322,0.061339,0.043246,0.067066,0.086494,0.067247,0.002549,0.01613,-0.002604,...,10.739219,2018.0,1.0,0.0,0.0,2.0,18.0,30.0,18.0,30.0
75%,0.100103,0.154518,0.099056,0.068408,0.096504,0.119404,0.095117,0.031053,0.03877,0.024507,...,13.460871,2019.0,3.0,1.0,2.0,3.0,21.0,40.0,21.0,40.0
max,0.321522,0.390895,0.278429,0.189825,0.3175,0.316811,0.270018,0.168196,0.145364,0.128301,...,21.564453,2020.0,4.0,3.0,11.0,4.0,21.0,45.0,21.0,45.0


In [11]:
# check for duplicates in both training and testing data 
train_data.duplicated()


0       False
1       False
2       False
3       False
4       False
        ...  
1208    False
1209    False
1210    False
1211    False
1212    False
Length: 1213, dtype: bool

In [12]:
test_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1208    False
1209    False
1210    False
1211    False
1212    False
Length: 1213, dtype: bool

In [13]:
# numeric_data = train_data.select_dtypes(include=['float64, int64'])

In [14]:
train_data.isnull().sum()

participant_id             0
0throw_1thcolumn           0
0throw_2thcolumn           0
0throw_3thcolumn           0
0throw_4thcolumn           0
                          ..
MRI_Track_Scan_Location    0
Barratt_Barratt_P1_Edu     0
Barratt_Barratt_P1_Occ     0
Barratt_Barratt_P2_Edu     0
Barratt_Barratt_P2_Occ     0
Length: 19928, dtype: int64

In [22]:
print(train_data.columns )

Index(['participant_id', '0throw_1thcolumn', '0throw_2thcolumn',
       '0throw_3thcolumn', '0throw_4thcolumn', '0throw_5thcolumn',
       '0throw_6thcolumn', '0throw_7thcolumn', '0throw_8thcolumn',
       '0throw_9thcolumn',
       ...
       'MRI_Track_Age_at_Scan', 'Basic_Demos_Enroll_Year',
       'Basic_Demos_Study_Site', 'PreInt_Demos_Fam_Child_Ethnicity',
       'PreInt_Demos_Fam_Child_Race', 'MRI_Track_Scan_Location',
       'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ',
       'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ'],
      dtype='object', length=19928)


In [16]:
test_data.columns

Index(['participant_id', '0throw_1thcolumn', '0throw_2thcolumn',
       '0throw_3thcolumn', '0throw_4thcolumn', '0throw_5thcolumn',
       '0throw_6thcolumn', '0throw_7thcolumn', '0throw_8thcolumn',
       '0throw_9thcolumn',
       ...
       'MRI_Track_Age_at_Scan', 'Basic_Demos_Enroll_Year',
       'Basic_Demos_Study_Site', 'PreInt_Demos_Fam_Child_Ethnicity',
       'PreInt_Demos_Fam_Child_Race', 'MRI_Track_Scan_Location',
       'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ',
       'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ'],
      dtype='object', length=19928)

In [17]:
# Check the data types of each column
non_numeric_columns = train_data.select_dtypes(exclude=[np.number]).columns
print("Non-numeric columns:", non_numeric_columns)


Non-numeric columns: Index(['participant_id'], dtype='object')


In [23]:
# Check the data types of each column
non_numeric_columns = test_data.select_dtypes(exclude=[np.number]).columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['participant_id'], dtype='object')


#### Correlation

In [19]:
# # Exclude participant_id for correlation but keep the original dataset unchanged
# corr_matrix = train_data.drop(columns=['participant_id']).corr(method='spearman')

# plt.figure(figsize = (12,6))
# sns.heatmap(corr_matrix, annot = True, cmap = 'coolwarm', fmt = '2f')
# plt.title("Training Correlation")
# plt.show()

In [20]:
# # Exclude participant_id for correlation but keep the original dataset unchanged
# corr_matrix = train_data.drop(columns=['participant_id']).corr()

# plt.figure(figsize = (12,6))
# sns.heatmap(corr_matrix, annot = True, cmap = 'coolwarm', fmt = '2f', vmin = -1, vmax= 1)
# plt.title("Training Correlation")
# plt.show()

### Feature Analysis 

In [21]:
## Isolate the relevant columns
brain_data = train_data.loc[:,'0throw_1thcolumn' : '0throw_199thcolumn']
# Standardization
scaler = StandardScaler()

