In [11]:
import pandas as pd

# Build the modelling table: participant metadata and precomputed linguistic
# features come from dementia_data.csv, the already-cleaned transcripts come
# from clean_train_transcripts.csv. The two tables are joined on Record-ID.
DEMENTIA_DATA = 'data/dementia_data.csv'
TRAIN_TRANSCRIPTS = 'data/clean_train_transcripts.csv'
DEMENTIA_COLUMNS = [
    'Record-ID',
    'TrainOrDev',  # split label; 'train' and 'dev' are the two values used below
    # Diagnosis label, stored as a string (not an integer code):
    # 'HC' (Healthy Control), 'MCI' (Mild Cognitive Impairment), or 'Dementia'
    'Class',
    'Gender',  # appears as 0/1 in the data; NOTE(review): confirm which value maps to which gender
    'Age',
    'Converted-MMSE',
    # precomputed linguistic features
    'filler_count',
    'token_count',
    'type_count',
    'type_token_ratio',
    'ma_ttr',
    'brunets_index',
    'content_density',
    'repetitions',  # shows up as a dict-like string of per-token counts; presumably needs parsing before use
    'sentence_count',
    'average_words_per_sentence'
]

# TRAIN_TRANSCRIPTS contains already cleaned transcripts
TRANSCRIPT_COLUMNS = ['Record-ID', 'Transcript_CTD', 'Transcript_PFT', 'Transcript_SFT']

# Load CSVs and keep only the columns of interest (in the order listed above)
dementia_df = pd.read_csv(DEMENTIA_DATA)[DEMENTIA_COLUMNS]
transcripts_df = pd.read_csv(TRAIN_TRANSCRIPTS)[TRANSCRIPT_COLUMNS]

# Merge dataframes on Record-ID.
# validate='one_to_one' makes pandas raise MergeError if a Record-ID is
# duplicated on either side, instead of silently multiplying rows.
combined_df = pd.merge(
    dementia_df,
    transcripts_df,
    on='Record-ID',
    how='inner',
    validate='one_to_one'
)

print("Combined df shape: ", combined_df.shape)

# Split into train and test sets using the dataset's own TrainOrDev column;
# records labelled 'dev' serve as our test set.
# NOTE: can adjust split ratio later if needed
train_df = combined_df[combined_df['TrainOrDev'] == 'train'].reset_index(drop=True)
test_df = combined_df[combined_df['TrainOrDev'] == 'dev'].reset_index(drop=True)

print("Train df shape:", train_df.shape)
print("Test df shape:", test_df.shape)

# Sanity check: every merged row must land in exactly one of the two subsets.
# A non-zero remainder means some rows carry a missing/unexpected TrainOrDev
# value and would otherwise be dropped without notice.
n_unassigned = len(combined_df) - len(train_df) - len(test_df)
assert n_unassigned == 0, (
    f"{n_unassigned} merged rows have a TrainOrDev value other than 'train'/'dev'"
)

test_df.head()



Combined df shape:  (157, 19)
Train df shape: (117, 19)
Test df shape: (40, 19)


Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Converted-MMSE,filler_count,token_count,type_count,type_token_ratio,ma_ttr,brunets_index,content_density,repetitions,sentence_count,average_words_per_sentence,Transcript_CTD,Transcript_PFT,Transcript_SFT
0,Process-rec-002,dev,MCI,1,61,25.0,2,76,46,0.605263,1.0,1.487174,0.421053,"{'there': 2, 's': 7, 'a': 5, 'stood': 2, 'on':...",5,15.2,There's a lad stood on the stool that's fallen...,"pipe, plain, peephole, plumbing, plastic, filt...","dogs, cats, birds, monkeys, rabbits, pigs, bir..."
1,Process-rec-004,dev,MCI,0,67,29.0,4,162,88,0.54321,1.0,1.675909,0.425926,"{'a': 5, 'or': 2, 'an': 4, 'adult': 3, 'female...",4,40.5,"A mother presumably or a female, an adult fema...","like pool, swimming pool, different kind of po...","and Impala, Cheetah, Lion, Red Panda, Black an..."
2,Process-rec-007,dev,HC,1,65,27.362319,3,219,119,0.543379,1.0,1.675336,0.406393,"{'and': 7, 'drying': 2, 'the': 17, 'sink': 2, ...",8,27.375,washing dishes and drying them. The sink is ov...,"Place, plant, plantage in it, person, people, ...","goat, cat, horse, elephant, giraffe, alligator..."
3,Process-rec-008,dev,MCI,0,62,25.0,7,100,57,0.57,0.989796,1.589386,0.43,"{'a': 8, 'is': 3, 'the': 4, 'and': 4, '.': 7, ...",9,11.111111,A woman is washing the dishes in her kitchen a...,um peanut butter pear place put people pause p...,"Giraffe, tiger, lion, coffin, dog cat, zebra, ..."
4,Process-rec-016,dev,Dementia,0,66,27.362319,3,143,85,0.594406,0.997636,1.517353,0.384615,"{'s': 5, 'a': 8, 'lady': 2, 'her': 4, 'up': 2,...",6,23.833333,as a lady in her kitchen doing her washing up ...,"Parrot, Penguin, Petunia, Pay, Pay, Pay, Pay, ...","cat, dog, fox, hedgehog, mice, or mouse, rat, ..."
