## Loading Data

Loading the connectome, quantitative, categorical, and target datasets and merging them into one table

In [1]:
"""
ADHD Prediction Dataset - Data Merging Module

This notebook merges multiple ADHD-related datasets into a single comprehensive dataset
for machine learning analysis. The merged dataset combines neuroimaging connectome data,
behavioral questionnaires, demographic information, and target labels.


Project: ADHD Sex Prediction
Input Files: 4 separate datasets (connectome, quantitative, categorical, targets)
Output: Unified raw dataset for preprocessing pipeline
"""

import json
import os
from pathlib import Path

import numpy as np
import pandas as pd

# =============================================================================
# CONFIGURATION AND FILE PATHS
# =============================================================================

# Data file paths
# The input directory defaults to the original author's download folder, but can
# be redirected without editing the notebook by setting the ADHD_DATA_DIR
# environment variable before the kernel starts.
DATA_DIR = Path(os.environ.get("ADHD_DATA_DIR", "C:/Users/04ama/Downloads"))
# Outputs are written relative to the notebook's working directory.
OUTPUT_DIR = Path(".")

# Input file paths (one file per source table, all located in DATA_DIR)
CONNECTOME_FILE = DATA_DIR / "TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson (1).csv"
QUANTITATIVE_FILE = DATA_DIR / "TRAIN_QUANTITATIVE_METADATA_new.xlsx"
CATEGORICAL_FILE = DATA_DIR / "TRAIN_CATEGORICAL_METADATA_new.xlsx"
TARGET_FILE = DATA_DIR / "TRAINING_SOLUTIONS (1).xlsx"

# Output file (merged table written by the export step)
OUTPUT_FILE = OUTPUT_DIR / "raw_dataset.csv"



In [2]:
# =============================================================================
# DATA LOADING FUNCTIONS
# =============================================================================

def load_dataset(file_path, dataset_name, file_type='csv'):
    """
    Load a dataset from disk, logging progress and any failure.

    Parameters:
    -----------
    file_path : str or Path
        Path to the data file
    dataset_name : str
        Human-readable name used in the log messages
    file_type : str
        File format, 'csv' or 'excel' (matched case-insensitively,
        surrounding whitespace ignored)

    Returns:
    --------
    pd.DataFrame
        Loaded dataset

    Raises:
    -------
    FileNotFoundError
        If the file doesn't exist
    ValueError
        If `file_type` is not one of the supported formats
    pd.errors.EmptyDataError
        If a CSV file is empty (propagated from pandas)

    Notes:
    ------
    Every exception is reported with a printed message and then re-raised
    unchanged, so callers still see the original error.
    """
    # Map each supported format name to the pandas reader that handles it.
    readers = {'csv': pd.read_csv, 'excel': pd.read_excel}

    try:
        print(f"Loading {dataset_name} data")

        reader = readers.get(str(file_type).strip().lower())
        if reader is None:
            raise ValueError(f"Unsupported file type: {file_type}")

        df = reader(file_path)

        print(f"Successfully loaded: {df.shape[0]:,} rows × {df.shape[1]:,} columns")
        return df

    except FileNotFoundError:
        print(f" Error: File not found - {file_path}")
        raise
    except Exception as e:
        # Report and re-raise: the caller decides whether the failure is fatal.
        print(f"Error loading {dataset_name}: {e}")
        raise



In [3]:
# =============================================================================
# DATASET LOADING 
# =============================================================================

# Read each of the four source tables through load_dataset; the CSV holds the
# connectome matrix, the three Excel workbooks hold metadata and labels.
conn_data = load_dataset(CONNECTOME_FILE, "Connectome", file_type="csv")      # functional brain connectivity
quant_data = load_dataset(QUANTITATIVE_FILE, "Quantitative", file_type="excel")  # questionnaires / assessments
cat_data = load_dataset(CATEGORICAL_FILE, "Categorical", file_type="excel")     # demographics / categorical variables
target_data = load_dataset(TARGET_FILE, "Target", file_type="excel")            # outcome labels

# Report rows and columns per table, one aligned line each.
print(f"\n Dataset Summary:")
print("\n".join(
    f"{table_label:<15} {table.shape[0]:<8} {table.shape[1]:<8} "
    for table_label, table in (
        ("Connectome", conn_data),
        ("Quantitative", quant_data),
        ("Categorical", cat_data),
        ("Target", target_data),
    )
))

Loading Connectome data
Successfully loaded: 1,213 rows × 19,901 columns
Loading Quantitative data
Successfully loaded: 1,213 rows × 19 columns
Loading Categorical data
Successfully loaded: 1,213 rows × 10 columns
Loading Target data
Successfully loaded: 1,213 rows × 3 columns

 Dataset Summary:
Connectome      1213     19901    
Quantitative    1213     19       
Categorical     1213     10       
Target          1213     3        


In [15]:
# Preview the first rows of the target table.
# NOTE(review): this cell's execution count (In [15]) is out of sequence with the
# surrounding cells; confirm the notebook still runs top-to-bottom on a fresh kernel.
target_data.head()

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0
3,p4vPhVu91o4b,1,1
4,M09PXs7arQ5E,1,1


In [4]:
# =============================================================================
# DATA MERGING PROCESS
# =============================================================================

def merge_datasets_sequentially(datasets, dataset_names, key='participant_id'):
    """
    Sequentially inner-join multiple datasets on a common key column.

    Parameters:
    -----------
    datasets : list of pd.DataFrame
        Datasets to merge, in merge order. Must contain at least one entry.
    dataset_names : list of str
        One name per dataset, in the same order, used for logging
    key : str
        Column name to merge on (default 'participant_id')

    Returns:
    --------
    pd.DataFrame
        Final merged dataset. The first input is copied, so none of the
        input frames is modified.

    Raises:
    -------
    ValueError
        If `datasets` is empty, or if `datasets` and `dataset_names` have
        different lengths.

    Notes:
    ------
    Uses inner joins, so only rows whose key value appears in ALL datasets
    are kept. The per-step log prints the signed row and column change, which
    makes dropped (or duplicated) rows visible.
    """
    if len(datasets) == 0:
        raise ValueError("At least one dataset is required for merging")
    # zip() would silently stop at the shorter list and skip datasets, so a
    # length mismatch is rejected up front.
    if len(datasets) != len(dataset_names):
        raise ValueError(
            f"Got {len(datasets)} datasets but {len(dataset_names)} dataset names"
        )

    # Start from a copy of the first dataset so the caller's frame is untouched.
    merged_df = datasets[0].copy()

    # Merge the remaining datasets one at a time, logging the shape change.
    for step, (dataset, name) in enumerate(zip(datasets[1:], dataset_names[1:]), 1):
        rows_before, cols_before = merged_df.shape

        # Inner join keeps only rows whose key exists on both sides.
        merged_df = pd.merge(merged_df, dataset, on=key, how='inner')
        rows_after, cols_after = merged_df.shape

        row_delta = rows_after - rows_before
        col_delta = cols_after - cols_before

        print(f"   Step {step}: Added {name}")
        print(f"            • Rows: {rows_before:,} → {rows_after:,} ({row_delta:+,})")
        print(f"            • Cols: {cols_before:,} → {cols_after:,} ({col_delta:+,})")

    return merged_df

# Perform sequential merging: the connectome table is the base and the other
# three tables are inner-joined onto it in the order listed.
datasets = [conn_data, quant_data, cat_data, target_data]
dataset_names = ["Connectome", "Quantitative", "Categorical", "Target"]

merged_df = merge_datasets_sequentially(datasets, dataset_names)

print(f"Final dataset shape: {merged_df.shape[0]:,} rows × {merged_df.shape[1]:,} columns")

   Step 1: Added Quantitative
            • Rows: 1,213 → 1,213 (+0)
            • Cols: 19,901 → 19,919 (+18)
   Step 2: Added Categorical
            • Rows: 1,213 → 1,213 (+0)
            • Cols: 19,919 → 19,928 (+9)
   Step 3: Added Target
            • Rows: 1,213 → 1,213 (+0)
            • Cols: 19,928 → 19,930 (+2)
Final dataset shape: 1,213 rows × 19,930 columns


In [5]:
# =============================================================================
# DATA EXPORT AND SUMMARY
# =============================================================================

def save_merged_dataset(df, output_path):
    """
    Save the merged dataset as CSV and write a JSON metadata file beside it.

    Parameters:
    -----------
    df : pd.DataFrame
        Merged dataset to save
    output_path : str or Path
        Output CSV path. The metadata file is written to the same path with
        its suffix replaced by '.json'.

    Raises:
    -------
    Exception
        Any error raised while writing either file is reported with a
        printed message and then re-raised unchanged.

    Notes:
    ------
    The metadata records the creation time, the merge method and key, and the
    source paths taken from the module-level CONNECTOME_FILE,
    QUANTITATIVE_FILE, CATEGORICAL_FILE and TARGET_FILE constants.
    """
    # Normalise once so a plain str path supports .stat() and .with_suffix().
    csv_path = Path(output_path)

    try:
        # Save main dataset (the index is not written)
        df.to_csv(csv_path, index=False)
        file_size_mb = csv_path.stat().st_size / (1024 * 1024)

        print(f"   ✅ Dataset saved: {output_path}")
        print(f"   📁 File size: {file_size_mb:.1f} MB")

        # Provenance for the merged file
        metadata = {
            'creation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
            'source_files': {
                'connectome': str(CONNECTOME_FILE),
                'quantitative': str(QUANTITATIVE_FILE),
                'categorical': str(CATEGORICAL_FILE),
                'target': str(TARGET_FILE)
            },
            'merge_method': 'inner_join',
            'merge_key': 'participant_id'
        }

        # Save metadata beside the CSV, e.g. raw_dataset.csv -> raw_dataset.json
        metadata_path = csv_path.with_suffix('.json')
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, default=str)

    except Exception as e:
        # Report and re-raise so a failed export is never mistaken for success.
        print(f"   ❌ Error saving dataset: {e}")
        raise

# Save the merged dataset (CSV plus its JSON metadata file)
save_merged_dataset(merged_df, OUTPUT_FILE)

# Closing summary: one row per participant, one column per feature/label
print(f"📊 Final dataset: {merged_df.shape[0]:,} participants × {merged_df.shape[1]:,} features")
print(f"📁 Output file: {OUTPUT_FILE}")


   ✅ Dataset saved: raw_dataset.csv
   📁 File size: 446.5 MB
📊 Final dataset: 1,213 participants × 19,930 features
📁 Output file: raw_dataset.csv


In [8]:
# Preview the first rows of the merged table.
# NOTE(review): this cell's execution count (In [8]) follows In [5], so it was run
# out of order; confirm the notebook still runs top-to-bottom on a fresh kernel.
merged_df.head()

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,ADHD_Outcome,Sex_F
0,70z8Q2xdTXM3,0.22293,0.527903,0.429966,0.060457,0.566489,0.315342,0.508408,-0.07829,0.525692,...,1,0.0,1.0,2.0,21.0,45.0,21.0,45.0,1,0
1,WHWymJu6zNZi,0.614765,0.577255,0.496127,0.496606,0.404686,0.439724,0.12259,-0.085452,0.120673,...,1,1.0,8.0,1.0,6.0,5.0,,15.0,1,1
2,4PAQp1M6EyAo,-0.116833,0.458408,0.260703,0.639031,0.769337,0.442528,0.63711,0.19201,0.520379,...,1,0.0,0.0,2.0,18.0,35.0,9.0,20.0,1,1
3,obEacy4Of68I,0.199688,0.752714,0.658283,0.575096,0.692867,0.645789,0.52275,0.412188,0.530843,...,1,0.0,0.0,2.0,21.0,40.0,21.0,40.0,1,1
4,s7WzzDcmDOhF,0.227321,0.613268,0.621447,0.562673,0.736709,0.589813,0.266676,0.359668,0.300771,...,1,2.0,8.0,2.0,9.0,35.0,,,1,1


DONE