In [7]:
import zipfile
import zipfile
import os
from pathlib import Path
import glob
import pandas as pd

def extract_zip(zip_path, extract_to):
    """
    Unpack a zip archive into a directory and report what it contained.

    The destination directory (and any missing parents) is created first.
    The archive's member names are printed, then every member is extracted.

    Args:
        zip_path: Path to the zip archive to read.
        extract_to: Directory that receives the extracted contents.

    Returns:
        The list of member names as reported by the archive.
    """
    # Make sure the destination exists before anything is written into it.
    destination = Path(extract_to)
    destination.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as archive:
        members = archive.namelist()
        # Emit the header and one "  - <name>" line per member in one call.
        print(
            "Files in the archive:",
            *(f"  - {member}" for member in members),
            sep="\n",
        )
        # Unpack every member into the destination directory.
        archive.extractall(extract_to)

    print(f"\nExtraction complete. Files are extracted to: {extract_to}")
    return members

def process_csv_files(extract_to):
    """
    Preview every CSV file found under the extracted folder.

    Recursively searches ``extract_to`` for ``*.csv`` files and prints the
    first rows (at most five) of each one. A file that cannot be read is
    reported and skipped so the remaining files are still previewed.

    Args:
        extract_to: Folder to search; matched literally, even if its name
            contains glob metacharacters such as ``[`` or ``*``.

    Returns:
        None. All results are printed.
    """
    # Escape the base folder so characters like '[', '*' or '?' in its name
    # are not interpreted as wildcards; only the '**/*.csv' suffix is a pattern.
    base = glob.escape(os.fspath(extract_to))
    pattern = os.path.join(base, '**', '*.csv')

    # glob returns matches in arbitrary order; sort for reproducible output.
    csv_files = sorted(glob.glob(pattern, recursive=True))
    if not csv_files:
        print("No CSV files found in the extracted contents.")
        return

    for csv_file in csv_files:
        print(f"\n-----\nProcessing file: {csv_file}")
        try:
            # Only the first five rows are parsed, keeping the preview cheap.
            df = pd.read_csv(csv_file, nrows=5)
        except Exception as e:
            # Deliberately broad: this is a best-effort preview, and one
            # malformed or unreadable file must not stop the others.
            print(f"Error reading {csv_file}: {e}")
        else:
            print(df.head())

if __name__ == "__main__":
    # Source archive and the destination folder for its unpacked contents.
    zip_file_path = '/Users/arsalonamini/Desktop/Research/data/adni_all_csv.zip'
    output_folder = '/Users/arsalonamini/Desktop/Research/data/extracted_adni_all_csv'

    # Unpack the archive; extract_zip returns the list of member names.
    extracted_files = extract_zip(zip_file_path, output_folder)

    # Print a short preview of each extracted CSV to check its fields.
    process_csv_files(output_folder)


Files in the archive:
  - ITEM.csv
  - ADNI_CBBRESULTS_01Feb2024.csv
  - ADNI_Cogstate_Description_and_Description_29October2015.pdf
  - ADNI_EMBICDCB_01Feb2024.csv
  - CDR_01Feb2024.csv
  - MMSE_01Feb2024.csv
  - MOCA_01Feb2024.csv
  - ECOGPT_01Feb2024.csv
  - ECOGSP_01Feb2024.csv
  - NEUROBAT_01Feb2024.csv
  - ADAS_ADNIGO23_01Feb2024.csv
  - ADAS_ADNI1_01Feb2024.csv
  - FAQ_01Feb2024.csv
  - DXSUM_PDXCONV_ADNIALL_01Feb2024.csv
  - ADSXLIST_01Feb2024.csv
  - CCI_01Feb2024.csv
  - BLCHANGE_01Feb2024.csv
  - CBBCOMP_01Feb2024.csv
  - FCI_01Feb2024.csv
  - GDSCALE_01Feb2024.csv
  - MODHACH_01Feb2024.csv
  - NPI_01Feb2024.csv
  - NPIQ_01Feb2024.csv
  - BHR_BASELINE_QUESTIONNAIRE_01Feb2024.csv
  - BHR_EVERYDAY_COGNITION_01Feb2024.csv
  - BHR_LONGITUDINAL_QUESTIONNAIRE_01Feb2024.csv
  - BHR_MEMTRAX_01Feb2024.csv
  - BHR_SP_ADL_01Feb2024.csv
  - BHR_SP_CAREGIVER_BURDEN_01Feb2024.csv
  - BHR_SP_EVERYDAY_COGNITION_01Feb2024.csv
  - BHR_SP_FAQ_01Feb2024.csv
  - BHR_SP_INITIAL_01Feb2024.csv
  - 