In [3]:
import pandas as pd

# Input files: labels, demographics and precomputed linguistic features are read
# from dementia_data.csv; the transcript text is read from clean_train_transcripts.csv.
DEMENTIA_DATA = 'data/dementia_data.csv'
TRAIN_TRANSCRIPTS = 'data/clean_train_transcripts.csv'

# Columns kept from DEMENTIA_DATA, in the order they should appear in the frame.
DEMENTIA_COLUMNS = [
    # --- identifier, split, label and participant info ---
    'Record-ID',
    'TrainOrDev',
    # Diagnostic class: Healthy Control, Mild Cognitive Impairment, or Dementia.
    # NOTE(review): previously described as coded 0 / 1 / 2, but the rows shown by
    # head() contain string labels such as 'MCI' -- confirm the encoding in the CSV.
    'Class',
    'Gender',
    'Age',
    'Converted-MMSE',
] + [
    # --- precomputed linguistic features ---
    'filler_count',
    'token_count',
    'type_count',
    'type_token_ratio',
    'ma_ttr',
    'brunets_index',
    'content_density',
    'repetitions',
    'sentence_count',
    'average_words_per_sentence',
]

# Columns kept from TRAIN_TRANSCRIPTS (the transcripts in that file are already cleaned).
# 'Record-ID' is the key shared with DEMENTIA_DATA.
TRANSCRIPT_COLUMNS = [
    'Record-ID',
    'Transcript_CTD',
    'Transcript_PFT',
    'Transcript_SFT',
]

# 1. Load both CSVs and keep only the columns we actually want.
#    Selecting with a list of names raises KeyError if a listed column is missing,
#    so a schema change in either file fails here rather than further downstream.
dementia_df = pd.read_csv(DEMENTIA_DATA)[DEMENTIA_COLUMNS]
transcripts_df = pd.read_csv(TRAIN_TRANSCRIPTS)[TRANSCRIPT_COLUMNS]

# 2. Merge on Record-ID so each row is one person / recording.
#    - how='inner' keeps only IDs present in both files.
#    - validate='one_to_one' makes pandas raise MergeError if Record-ID is duplicated
#      in either frame; without it a duplicate ID would silently multiply rows and
#      break the "one row per recording" assumption.
combined_df = pd.merge(
    dementia_df,
    transcripts_df,
    on='Record-ID',
    how='inner',
    validate='one_to_one',
)

# 3. Quick sanity check: (rows, columns) of the merged frame, then the first rows.
#    print() is needed for the shape because only the last expression is displayed.
print(combined_df.shape)
combined_df.head()


(157, 19)


Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Converted-MMSE,filler_count,token_count,type_count,type_token_ratio,ma_ttr,brunets_index,content_density,repetitions,sentence_count,average_words_per_sentence,Transcript_CTD,Transcript_PFT,Transcript_SFT
0,Process-rec-001,train,MCI,1,62,25.0,1,0,0,0.0,0.0,0.0,0.0,{},0,0.0,It looks like a kitchen. The mother is... The ...,"People, partner, plate, platter, pants, porter...","giraffe kangaroo, lion, tiger, spider, fly, mo..."
1,Process-rec-002,dev,MCI,1,61,25.0,2,76,46,0.605263,1.0,1.487174,0.421053,"{'there': 2, 's': 7, 'a': 5, 'stood': 2, 'on':...",5,15.2,There's a lad stood on the stool that's fallen...,"pipe, plain, peephole, plumbing, plastic, filt...","dogs, cats, birds, monkeys, rabbits, pigs, bir..."
2,Process-rec-003,train,MCI,0,62,29.0,6,150,84,0.56,0.995495,1.620714,0.433333,"{'the': 10, 'is': 5, 'of': 2, 'a': 10, 'there'...",7,21.428571,The picture is of a kitchen. There's a mum and...,"purple, pale, placid, pink, peony, clumsy, pat...","cow, bull, you, brown, chicken, goose, camel, ..."
3,Process-rec-004,dev,MCI,0,67,29.0,4,162,88,0.54321,1.0,1.675909,0.425926,"{'a': 5, 'or': 2, 'an': 4, 'adult': 3, 'female...",4,40.5,"A mother presumably or a female, an adult fema...","like pool, swimming pool, different kind of po...","and Impala, Cheetah, Lion, Red Panda, Black an..."
4,Process-rec-005,train,MCI,1,65,27.0,4,44,36,0.818182,1.0,1.057222,0.5,"{'s': 3, 'of': 4, 'the': 3, 'and': 2}",1,44.0,this style scene of domestic confusion where t...,"Pillic, Postbox, Pyracanthus, Pudding, Prometh...","Dogcat, giraffe, Wallaby, kangaroo, tortoise, ..."
