In [None]:
import pandas as pd
import numpy as np

In [None]:
def process_longitudinal_data(df):
    """Reduce the longitudinal frame to one first-visit row per kept record.

    Keeps only rows whose 'Visit' equals 1, renames 'EDUC' -> 'Education'
    and 'M/F' -> 'Gender', derives a binary 'Dementia' label from 'Group',
    and encodes 'Gender' numerically. The input frame is not modified.

    Args:
        df: DataFrame with at least the columns 'Visit', 'Group', 'M/F',
            'Age', 'EDUC', 'MMSE' and 'CDR'.

    Returns:
        A new DataFrame with the columns
        ['Age', 'Education', 'Gender', 'MMSE', 'CDR', 'Dementia'].
        'Group' or 'Gender' values that are not in the mappings below
        become NaN (pandas ``Series.map`` behaviour).
    """
    print("Processing dementia_dataset.csv (longitudinal)...")

    column_aliases = {'EDUC': 'Education', 'M/F': 'Gender'}
    # 'Nondemented' = 0, 'Demented' or 'Converted' = 1
    label_by_group = {'Nondemented': 0, 'Demented': 1, 'Converted': 1}
    # F = 0, M = 1
    code_by_sex = {'F': 0, 'M': 1}

    # Boolean filtering followed by rename yields a fresh frame, so the
    # caller's data is left untouched.
    first_visits = df[df['Visit'] == 1].rename(columns=column_aliases)
    first_visits = first_visits.assign(
        Dementia=first_visits['Group'].map(label_by_group),
        Gender=first_visits['Gender'].map(code_by_sex),
    )

    print(f"Found {len(first_visits)} first-visit subjects.")
    return first_visits[['Age', 'Education', 'Gender', 'MMSE', 'CDR', 'Dementia']]

In [None]:
def process_cross_sectional_data(df):
    """Normalize the cross-sectional frame to the shared feature schema.

    Renames 'Educ' -> 'Education' and 'M/F' -> 'Gender', derives a binary
    'Dementia' label from 'CDR', and encodes 'Gender' numerically. The
    input frame is not modified.

    Args:
        df: DataFrame with at least the columns 'M/F', 'Age', 'Educ',
            'MMSE' and 'CDR'.

    Returns:
        A new DataFrame with the columns
        ['Age', 'Education', 'Gender', 'MMSE', 'CDR', 'Dementia'].
        'Dementia' is 1 where CDR > 0, 0 where CDR is present and not
        greater than 0, and NaN where CDR is missing. 'Gender' values other
        than 'F'/'M' become NaN.

    NOTE(review): this assumes 'Educ' uses the same scale as the
    'Education' column produced from the longitudinal file - confirm
    against the dataset documentation before combining the two.
    """
    print("\nProcessing oasis_cross-sectional.csv...")
    # rename() returns a new frame, so the caller's DataFrame is never mutated.
    processed_df = df.rename(columns={'Educ': 'Education', 'M/F': 'Gender'})

    # A Clinical Dementia Rating (CDR) greater than 0 indicates dementia.
    cdr = processed_df['CDR']
    labels = (cdr > 0).astype(int)
    # NaN > 0 evaluates to False, which would label a subject with no CDR
    # as non-demented. Keep the label missing instead so such rows can be
    # identified (and dropped) downstream rather than silently mislabelled.
    processed_df['Dementia'] = labels.where(cdr.notna())

    # Standardize 'Gender' column (F=0, M=1).
    processed_df['Gender'] = processed_df['Gender'].map({'F': 0, 'M': 1})

    # Select and return only the columns we need.
    features = ['Age', 'Education', 'Gender', 'MMSE', 'CDR', 'Dementia']
    print(f"Found {len(processed_df)} subjects.")
    return processed_df[features]

In [2]:
def main(long_path="dementia_dataset.csv",
         cross_path="oasis_cross-sectional.csv",
         output_filename='combined_dementia_dataset.csv'):
    """
    Main function to orchestrate the loading, processing, combining, and saving of datasets.

    Args:
        long_path: CSV path of the longitudinal dataset.
        cross_path: CSV path of the cross-sectional dataset.
        output_filename: CSV path the combined, cleaned dataset is written to.

    Returns:
        None. If either input file is missing, an error is printed and the
        function returns without writing any output.
    """
    try:
        long_df = pd.read_csv(long_path)
        cross_df = pd.read_csv(cross_path)
    except FileNotFoundError as e:
        print(f"Error: {e}. Make sure '{long_path}' and '{cross_path}' are in the same folder.")
        return

    # Process each DataFrame using the dedicated functions.
    long_df_clean = process_longitudinal_data(long_df)
    cross_df_clean = process_cross_sectional_data(cross_df)

    # --- Step 3: Combine and Finalize ---
    print("\nCombining datasets...")
    combined_df = pd.concat([long_df_clean, cross_df_clean], ignore_index=True)

    # Drop rows with any missing values in our key columns.
    # combined_df is local to this function, so in-place dropping is safe and
    # avoids pandas chained-assignment warnings on the column writes below.
    initial_rows = len(combined_df)
    combined_df.dropna(inplace=True)
    print(f"Dropped {initial_rows - len(combined_df)} rows with missing values.")

    # Ensure all feature columns are numeric; values that cannot be parsed
    # are coerced to NaN and removed by the second dropna.
    feature_cols = ['Age', 'Education', 'Gender', 'MMSE', 'CDR']
    rows_before_coercion = len(combined_df)
    for col in feature_cols:
        combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')

    combined_df.dropna(inplace=True)
    non_numeric_rows = rows_before_coercion - len(combined_df)
    if non_numeric_rows:
        # Previously these rows disappeared without being reported.
        print(f"Dropped {non_numeric_rows} rows with non-numeric feature values.")

    # The label column can arrive as float when either source produced NaN
    # labels; no NaN remains at this point, so store it as integer 0/1.
    combined_df['Dementia'] = combined_df['Dementia'].astype(int)

    print(f"\nTotal combined and cleaned rows: {len(combined_df)}")

    # Save the final dataset to a new CSV file.
    combined_df.to_csv(output_filename, index=False)
    print(f"Successfully created '{output_filename}'. You can now proceed to the next step.")


In [3]:
# Entry point: run the full pipeline only when this code is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()


NameError: name 'pd' is not defined