In [32]:
# pandas for data manipulation
import pandas as pd

In [33]:
# absolute path to the OASIS cross-sectional csv file that is loaded below
# NOTE(review): this is a machine-specific Windows path; adjust it before running elsewhere
csv_path = r"C:\Users\gjkku\OneDrive\Documenten\CSAI year 3\Thesis\oasis_cross-sectional-5708aa0a98d82080.csv"

In [42]:
# reading the csv file into a dataframe with error handling
try:
    df = pd.read_csv(csv_path, decimal='.')
except FileNotFoundError as err:
    # Stop here with a message that names the missing path. Merely printing and
    # continuing would leave `df` undefined (NameError on the next line) or, on a
    # re-run, silently keep a stale `df` from an earlier execution of this cell.
    raise FileNotFoundError(f"Error: Path to file was not found: {csv_path}") from err

# distribution of the CDR ratings, with missing ratings counted as their own row
print(df['CDR'].value_counts(dropna=False))

CDR
NaN    201
0.0    135
0.5     70
1.0     28
2.0      2
Name: count, dtype: int64


CDR
0.0    69.074074
0.5    76.214286
1.0    77.750000
2.0    82.000000
Name: Age, dtype: float64

In [35]:
# map a single CDR rating onto the dementia class of the subject
def CDR_classes(cdr):
    """Return the dementia class label for one CDR rating.

    A missing rating or a rating of 0 gives "Non Demented", a rating of 0.5
    gives "Very Mild Dementia" and any rating of 1 or above gives
    "Mild Dementia". Every other value gives "unexpected value".
    """
    # a missing rating is deliberately treated the same as a rating of 0
    if pd.isna(cdr) or cdr == 0:
        return "Non Demented"
    if cdr == 0.5:
        return "Very Mild Dementia"
    if cdr >= 1:
        return "Mild Dementia"
    # whatever is left (e.g. negative or in-between ratings) is flagged
    return "unexpected value"

In [36]:
# derive the dementia category of every subject from its CDR rating and keep it in a new 'Class' column
df['Class'] = df['CDR'].map(CDR_classes)

# dictionary with the class names as keys and the list of subject IDs of that class as values
class_groups = {name: list(ids) for name, ids in df.groupby('Class')['ID']}

In [37]:
# report how many subjects ended up in each dementia class
print("Number of subjects in each class:")
for label in class_groups:
    n_subjects = len(class_groups[label])
    print(f"{label}: {n_subjects} subjects")

Number of subjects in each class:
Mild Dementia: 30 subjects
Non Demented: 336 subjects
Very Mild Dementia: 70 subjects


In [38]:
# the two classes kept for the binary task: non demented versus very mild dementia
binary_classes = ["Non Demented", "Very Mild Dementia"]
# keep only the rows whose 'Class' is one of the two binary classes
binary_df = df.loc[df['Class'].isin(binary_classes)]

In [39]:
# destination file for the binary classification subset
new_binary_csv_path = r"C:\Users\gjkku\OneDrive\Documenten\CSAI year 3\Thesis\csv_binary.csv"
# write the subset without the row index; note that the fields are separated by tabs, not commas
binary_df.to_csv(new_binary_csv_path, sep='\t', index=False)

In [40]:
# report where the binary subset was written
print(f"\nBinary classification subset saved to {new_binary_csv_path}")
# report the size of the subset (subjects with a missing CDR rating are part of it)
n_binary_subjects = len(binary_df)
print(f"Total subjects for binary classification: {n_binary_subjects}")


Binary classification subset saved to C:\Users\gjkku\OneDrive\Documenten\CSAI year 3\Thesis\csv_binary.csv
Total subjects for binary classification: 406


In [41]:
# list the subject IDs that belong to each dementia class
print("\nSubject IDs for each class:")
for label in class_groups:
    ids_in_class = class_groups[label]
    print(f"{label}: {ids_in_class}")


Subject IDs for each class:
Mild Dementia: ['OAS1_0028_MR1', 'OAS1_0031_MR1', 'OAS1_0035_MR1', 'OAS1_0052_MR1', 'OAS1_0053_MR1', 'OAS1_0056_MR1', 'OAS1_0067_MR1', 'OAS1_0073_MR1', 'OAS1_0122_MR1', 'OAS1_0134_MR1', 'OAS1_0137_MR1', 'OAS1_0184_MR1', 'OAS1_0185_MR1', 'OAS1_0223_MR1', 'OAS1_0268_MR1', 'OAS1_0269_MR1', 'OAS1_0278_MR1', 'OAS1_0291_MR1', 'OAS1_0308_MR1', 'OAS1_0316_MR1', 'OAS1_0351_MR1', 'OAS1_0373_MR1', 'OAS1_0382_MR1', 'OAS1_0388_MR1', 'OAS1_0399_MR1', 'OAS1_0405_MR1', 'OAS1_0424_MR1', 'OAS1_0425_MR1', 'OAS1_0430_MR1', 'OAS1_0452_MR1']
Non Demented: ['OAS1_0001_MR1', 'OAS1_0002_MR1', 'OAS1_0004_MR1', 'OAS1_0005_MR1', 'OAS1_0006_MR1', 'OAS1_0007_MR1', 'OAS1_0009_MR1', 'OAS1_0010_MR1', 'OAS1_0011_MR1', 'OAS1_0012_MR1', 'OAS1_0013_MR1', 'OAS1_0014_MR1', 'OAS1_0017_MR1', 'OAS1_0018_MR1', 'OAS1_0019_MR1', 'OAS1_0020_MR1', 'OAS1_0025_MR1', 'OAS1_0026_MR1', 'OAS1_0027_MR1', 'OAS1_0029_MR1', 'OAS1_0030_MR1', 'OAS1_0032_MR1', 'OAS1_0033_MR1', 'OAS1_0034_MR1', 'OAS1_0037_MR1', 'OAS1