In [1]:
import pandas as pd
import numpy as np
import plotly 

import plotly.express as px

# import seaborn as sns
# import missingno # to identify and visualize missing data prior to ML

### Datasets

In [2]:
# Clinical Dementia Rating (CDR) scale: load, trim, and prepare the table.

# Columns we are interested in.
cdr_columns = ['ID', 'RID', 'VISCODE', 'VISCODE2', 'CDMEMORY', 'CDORIENT', 'CDJUDGE',
               'CDCOMMUN', 'CDHOME', 'CDCARE', 'CDGLOBAL']

# The six per-domain scores that are summed into `total_score`.
# CDGLOBAL is not in this list: it stays numeric because later cells compare
# it against 0 and correlate it with `total_score`.
cdr_score_columns = ['CDMEMORY', 'CDORIENT', 'CDJUDGE', 'CDCOMMUN', 'CDHOME', 'CDCARE']

cdr_df = (
    pd.read_csv('CDR.csv')[cdr_columns]
    # Drop rows with NA values in any of the selected columns.
    .dropna()
    # Row-wise sum of the six domain scores. Is it meaningful?
    # NOTE(review): this looks like what the CDR literature calls "Sum of Boxes";
    # TODO confirm against the CDRSB column of the raw table.
    .assign(total_score=lambda frame: frame[cdr_score_columns].sum(axis=1))
    # Convert the domain score columns (only) into categorical values; this must
    # happen after the sum because categoricals cannot be added.
    .astype({col: 'category' for col in cdr_score_columns})
)
    
    
# diag_df = pd.read_csv('DXSUM_PDXCONV_ADNIALL.csv')         #diagnosis
# gds_df = pd.read_csv('GDSCALE.csv')       #geriatric depression scale
# mmse_df = pd.read_csv('MMSE.csv')         #mini mental state examination
# moca = pd.read_csv('MOCA.csv')            #montreal cognitive assessment
# neurob = pd.read_csv('NEUROBAT.csv')      #neuropsychological battery
# demo = pd.read_csv('PTDEMOG.csv')         #demographics

# List of tasks
# - inspect
# - clean (remove irrelevant columns)
# - perform statistical calculations (on cols with explained meaning in data dict.)
# - visualize
# - set aside columns whose meaning we are not sure about, and ask about them

### Basic information and insights

In [3]:
# Data dictionary for the CDR table: field names alongside their descriptions.
cdr_dict = pd.read_csv(filepath_or_buffer='CDR_dict.csv')
cdr_dict

Unnamed: 0,Phase,Phase.1
0,ID,Record ID
1,RID,Participant roster ID
2,SITEID,Site ID
3,VISCODE,Visit code
4,VISCODE2,Translated visit code
5,USERDATE,Date record created
6,USERDATE2,Date record last updated
7,EXAMDATE,Examination Date
8,CDSOURCE,Information Source: 1=Participant Visit;2=Tele...
9,CDVERSION,1=CDR version 1- Full interview with informant...


In [4]:
# CDR (Clinical Dementia Rating scale): 3-severe, 2-moderate, 1-mild, 0.5-very mild, 0-normal
# TODO: What do negative rating values mean?
# NOTE(review): the rows shown carry -1 in every score column, which looks like a
# "missing" sentinel rather than a rating - confirm with the data dictionary.
has_negative_global = cdr_df['CDGLOBAL'].lt(0)
cdr_df.loc[has_negative_global]

Unnamed: 0,ID,RID,VISCODE,VISCODE2,CDMEMORY,CDORIENT,CDJUDGE,CDCOMMUN,CDHOME,CDCARE,CDGLOBAL,total_score
770,1556,48,m06,m06,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-6.0
951,8412,575,m36,m36,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-6.0
2425,4868,492,m06,m06,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-6.0
2535,5088,958,m06,m06,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-6.0
2591,5200,898,m12,m12,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9866,10206,5202,v31,m36,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-6.0
9867,10208,5131,v21,m24,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-6.0
9870,10216,4050,v41,m48,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-6.0
9871,10218,4343,v41,m48,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-6.0


In [5]:
# Keep only the rows whose global CDR rating is non-negative.
cdr_df = cdr_df.loc[cdr_df['CDGLOBAL'].ge(0)]

In [6]:
# Preview the first five rows of the cleaned table.
cdr_df.head(n=5)

Unnamed: 0,ID,RID,VISCODE,VISCODE2,CDMEMORY,CDORIENT,CDJUDGE,CDCOMMUN,CDHOME,CDCARE,CDGLOBAL,total_score
0,8,2,sc,sc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10,3,sc,sc,1.0,1.0,1.0,1.0,0.5,0.0,1.0,4.5
2,12,4,sc,sc,0.5,0.0,0.5,0.0,0.0,0.0,0.5,1.0
3,14,5,sc,sc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16,7,sc,sc,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0


From the CDR table we select and keep the columns we deem relevant.  
We need more information on the meaning of CDSOB (Sum of Boxes) and CDRSB.  

In [12]:
# Histogram of visits per diagnosis label.
#
# No earlier cell defines a `diagnosis` column on cdr_df or a `diagnosis_order`
# variable, so both are derived here from CDGLOBAL. This keeps the cell working
# on a fresh kernel (Restart & Run All).
# NOTE(review): the labels follow the CDR scale noted in the notebook
# (0-normal, 0.5-very mild, 1-mild, 2-moderate, 3-severe); confirm they match
# the labels this plot was originally meant to use.
cdr_global_labels = {0.0: 'normal', 0.5: 'very mild', 1.0: 'mild', 2.0: 'moderate', 3.0: 'severe'}
diagnosis_order = list(cdr_global_labels.values())

# Build a plotting frame instead of mutating cdr_df, so re-running is harmless.
# CDGLOBAL values outside the mapping become NaN.
diagnosis_df = cdr_df.assign(diagnosis=cdr_df['CDGLOBAL'].map(cdr_global_labels))

fig = px.histogram(diagnosis_df, "diagnosis", category_orders=dict(diagnosis=diagnosis_order))
fig.update_layout(bargap=0.2)
fig.show()

In [8]:
# Distribution of total_score, coloured by the global CDR rating.
px.histogram(data_frame=cdr_df, x="total_score", color='CDGLOBAL')

In [9]:
# Spread of total_score within each global CDR rating.
px.box(data_frame=cdr_df, x='CDGLOBAL', y='total_score')

In [13]:
# How strongly does the global rating track the summed domain scores?
# DataFrame.corr() is called with its defaults.
rating_and_total = cdr_df.loc[:, ['CDGLOBAL', 'total_score']]
cdr_corr = rating_and_total.corr()
cdr_corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,CDGLOBAL,total_score
CDGLOBAL,1.0,0.920073
total_score,0.920073,1.0


POSSIBLE QUESTIONS:  
    - meaning of some parameters, as marked  
    - goal of the analysis: find out which test questions are the most relevant and have the largest ROI when making a diagnosis (more relevant with MMSE, MoCA, ...)