## Narrowing down the dataset by various criteria

In [8]:
import pandas as pd
from dateutil import parser
import numpy as np

In [11]:
# Folder the input CSV is read from and folder the results are written to
# (both relative paths, currently the same directory).
loadPath, writePath = '../data/', '../data/'

Loading and extracting data.

In [12]:
# Read the longitudinal MRIT1 table (one row per scan)
df = pd.read_csv(loadPath + 'MRIT1_longitudinal.csv')

naccid_column = df['NACCID']

# distinct patient IDs, in order of first appearance
patients = naccid_column.unique()

# total number of rows (scans) in the table, across all patients
number_of_scans = len(naccid_column)

# number of rows (scans) per patient ID, indexed by NACCID
naccid_counts = naccid_column.value_counts()

Calculating time interval between scans.

In [13]:
# Build one timestamp per scan from the year / month / day columns.
# pd.to_datetime assembles dates from a frame whose columns are named
# 'year', 'month' and 'day', so no string formatting / parsing is needed.
scan_dates = pd.to_datetime(
    df[['MRIYR', 'MRIMO', 'MRIDY']].rename(
        columns={'MRIYR': 'year', 'MRIMO': 'month', 'MRIDY': 'day'}
    )
)

# Group the timestamps by patient. Looking rows up by NACCID (instead of by a
# running row offset) stays correct even when a patient's rows are not stored
# next to each other, and no longer shadows the built-in `sum`.
dates_by_patient = scan_dates.groupby(df['NACCID'], sort=False)

# differences[i] belongs to patients[i]: a leading 0 for the first scan,
# followed by the absolute gap in whole days between consecutive rows
# (rows are taken in file order, exactly as before).
differences = []

for naccid in patients:
    patient_dates = dates_by_patient.get_group(naccid)

    # diff() yields NaT for the first row; drop it and keep the real gaps
    gaps_in_days = patient_dates.diff().abs().dt.days.iloc[1:]

    differences.append([0] + [int(gap) for gap in gaps_in_days])

In [14]:
# Convert every interval from days to months (30.437 days per month,
# roughly 365.25 / 12).
# NOTE: the lists are rescaled in place, so running this cell a second time
# divides the values again.
DAYS_PER_MONTH = 30.437

for gaps in differences:
    gaps[:] = [gap / DAYS_PER_MONTH for gap in gaps]

Saving the data to a dictionary.

In [15]:
# Map each patient ID (formatted as a string) to that patient's list of
# intervals; patients[idx] and differences[idx] describe the same patient.
dic_unsorted = {
    '{}'.format(pid): differences[idx]
    for idx, pid in enumerate(patients)
}


print(dic_unsorted)

{'NACC869232': [0.0, 33.47898938791602], 'NACC105473': [0.0, 2.825508427243158], 'NACC792077': [0.0, 16.493084075303084], 'NACC072928': [0.0, 78.09573873903473], 'NACC593201': [0.0, 0.4599664881558629], 'NACC402274': [0.0, 20.665637217859842], 'NACC354797': [0.0, 9.232184512271248, 2.9569274238591188, 6.34096658672011], 'NACC433861': [0.0, 9.002201268193318, 0.16427374576995105, 0.032854749153990205], 'NACC789249': [0.0, 37.55297828301081, 2.989782173013109], 'NACC930350': [0.0, 85.52091204783652], 'NACC599120': [0.0, 9.65929625127312], 'NACC041379': [0.0, 5.9138548477182376, 100.20698491967013], 'NACC430620': [0.0, 47.73795052074777], 'NACC146540': [0.0, 0.5256759864638433], 'NACC176553': [0.0, 5.683871603640306, 1.741301705161481, 12.81335217005618], 'NACC412452': [0.0, 35.84453132700332], 'NACC292034': [0.0, 14.587508624371653], 'NACC059294': [0.0, 18.59578802115846, 5.979564346026218, 5.979564346026218, 6.209547590104149], 'NACC024190': [0.0, 46.35805105628018], 'NACC092622': [0.0,

Sorting the dictionary by the length of each value list and then by the last element of that list.

In [16]:
def _length_then_last_value(item):
    """Sort key for a (patient ID, intervals) pair: list length, then last entry."""
    _, gaps = item
    return len(gaps), gaps[-1]


# Order the patients by the length of their interval list and, among lists of
# equal length, by the last element of the list.
sorted_items = sorted(dic_unsorted.items(), key=_length_then_last_value)

# Dictionaries keep insertion order, so this preserves the sorted order.
dic_sorted = dict(sorted_items)

In [17]:
# Print the whole sorted mapping (patient ID -> list of intervals) for inspection
print(dic_sorted)

{'NACC100274': [0.0, 0.0], 'NACC898140': [0.0, 0.0], 'NACC654599': [0.0, 0.0], 'NACC502826': [0.0, 0.0], 'NACC826333': [0.0, 0.0], 'NACC992797': [0.0, 0.0], 'NACC220489': [0.0, 0.0], 'NACC647006': [0.0, 0.0], 'NACC271918': [0.0, 0.0], 'NACC145120': [0.0, 0.0], 'NACC062768': [0.0, 0.0], 'NACC280497': [0.0, 0.0], 'NACC884845': [0.0, 0.0], 'NACC587976': [0.0, 0.0], 'NACC445846': [0.0, 0.0], 'NACC587626': [0.0, 0.0], 'NACC289485': [0.0, 0.0], 'NACC701098': [0.0, 0.0], 'NACC751081': [0.0, 0.0], 'NACC028843': [0.0, 0.0], 'NACC195208': [0.0, 0.0], 'NACC404251': [0.0, 0.0], 'NACC454368': [0.0, 0.0], 'NACC362036': [0.0, 0.0], 'NACC064163': [0.0, 0.0], 'NACC876572': [0.0, 0.0], 'NACC089322': [0.0, 0.0], 'NACC239362': [0.0, 0.0], 'NACC988056': [0.0, 0.0], 'NACC708176': [0.0, 0.0], 'NACC553123': [0.0, 0.0], 'NACC946938': [0.0, 0.0], 'NACC340526': [0.0, 0.0], 'NACC547399': [0.0, 0.0], 'NACC678556': [0.0, 0.0], 'NACC189820': [0.0, 0.0], 'NACC255137': [0.0, 0.0], 'NACC559983': [0.0, 0.0], 'NACC464658

### Narrowing the data 
I will remove every patient who had multiple scans taken on the same day, because such data is not longitudinal.

In [23]:
# Collect the IDs of patients whose interval list contains more than one 0.0.
# Every list starts with a 0.0 placeholder for the first scan, so a second
# zero means two consecutive scans fell on the same day.
naccid_trash = []
for pid, gaps in dic_sorted.items():
    zero_entries = gaps.count(0.0)
    if zero_entries > 1:
        naccid_trash.append(pid)


In [26]:
# Show the flagged IDs, then how many there are (one value per line);
# the original note for this run: we get rid of 166 patients
print(naccid_trash, len(naccid_trash), sep='\n')

['NACC100274', 'NACC898140', 'NACC654599', 'NACC502826', 'NACC826333', 'NACC992797', 'NACC220489', 'NACC647006', 'NACC271918', 'NACC145120', 'NACC062768', 'NACC280497', 'NACC884845', 'NACC587976', 'NACC445846', 'NACC587626', 'NACC289485', 'NACC701098', 'NACC751081', 'NACC028843', 'NACC195208', 'NACC404251', 'NACC454368', 'NACC362036', 'NACC064163', 'NACC876572', 'NACC089322', 'NACC239362', 'NACC988056', 'NACC708176', 'NACC553123', 'NACC946938', 'NACC340526', 'NACC547399', 'NACC678556', 'NACC189820', 'NACC255137', 'NACC559983', 'NACC464658', 'NACC219517', 'NACC010965', 'NACC890339', 'NACC020071', 'NACC212357', 'NACC558077', 'NACC661374', 'NACC265342', 'NACC977742', 'NACC943840', 'NACC488571', 'NACC586606', 'NACC491900', 'NACC039431', 'NACC762343', 'NACC436723', 'NACC731841', 'NACC774125', 'NACC116218', 'NACC819248', 'NACC034649', 'NACC417667', 'NACC861111', 'NACC146521', 'NACC524845', 'NACC462523', 'NACC452859', 'NACC469292', 'NACC848866', 'NACC567807', 'NACC476861', 'NACC310858', 'NACC

Removing the patients that are listed in `naccid_trash`.

In [28]:
# Drop every flagged patient from dic_sorted.
# The dictionary is rebuilt instead of popping keys one by one: pop() without
# a default raised KeyError when this cell was run a second time, because the
# keys were already gone. Rebuilding gives the same result on the first run
# and is a no-op on later runs. Insertion (sorted) order is preserved.
flagged_ids = {'{}'.format(pid) for pid in naccid_trash}
dic_sorted = {pid: gaps for pid, gaps in dic_sorted.items() if pid not in flagged_ids}

In [37]:
# Show the remaining mapping, then the number of remaining patients
# (one value per line); original note for this run: 1778 - 166 = 1612
print(dic_sorted, len(dic_sorted), sep='\n')

{'NACC931236': [0.0, 0.032854749153990205], 'NACC265245': [0.0, 0.13141899661596082], 'NACC536405': [0.0, 0.22998324407793144], 'NACC229318': [0.0, 0.26283799323192164], 'NACC911187': [0.0, 0.36140224069389226], 'NACC593201': [0.0, 0.4599664881558629], 'NACC431913': [0.0, 0.4599664881558629], 'NACC952268': [0.0, 0.4599664881558629], 'NACC146540': [0.0, 0.5256759864638433], 'NACC196447': [0.0, 0.5913854847718237], 'NACC438624': [0.0, 0.7556592305417748], 'NACC912768': [0.0, 0.8542234780037454], 'NACC242848': [0.0, 1.1827709695436475], 'NACC871822': [0.0, 1.3798994644675888], 'NACC282073': [0.0, 1.6755922068535005], 'NACC640635': [0.0, 1.905575450931432], 'NACC430058': [0.0, 2.0041396983934026], 'NACC366434': [0.0, 2.0041396983934026], 'NACC664392': [0.0, 2.036994447547393], 'NACC970271': [0.0, 2.0698491967013832], 'NACC823627': [0.0, 2.234122942471334], 'NACC603039': [0.0, 2.2669776916253244], 'NACC579527': [0.0, 2.562670434011236], 'NACC105473': [0.0, 2.825508427243158], 'NACC556637': 

Writing the narrowed-down set of patients to a new .csv file.

In [40]:
# Re-reads the same CSV that the loading cell at the top already stored in `df`.
# NOTE(review): none of the cells shown in between modify `df`, so this reload
# looks redundant — confirm before removing.
df = pd.read_csv(loadPath + 'MRIT1_longitudinal.csv')

In [44]:
# extracting keys in sorted dictionary
# (dict.keys() returns a live view: it reflects later changes made to this
# same dictionary object)
keys = dic_sorted.keys()

In [45]:
# Keep only the rows whose NACCID is one of the remaining patients
keep_row = df['NACCID'].isin(keys)
df_narrow = df[keep_row]

# number of scans (rows) that survive the filter
print(df_narrow.shape[0])

4182


In [46]:
# Columns written to the narrowed file. They keep their original names: the
# previous `df_final.columns = [...]` assignment set exactly the same names
# and was therefore a no-op, so it has been removed.
final_columns = ['NACCID', 'NACCMNUM', 'MRIMO', 'MRIDY', 'MRIYR', 'NACCMRIA', 'NACCMRFI']

# Explicit copy so df_final is independent of df_narrow
df_final = df_narrow[final_columns].copy()

# Check the final dataset before saving
print("Final dataset preview:")
print(df_final.head())


Final dataset preview:
       NACCID  NACCMNUM  MRIMO  MRIDY  MRIYR  NACCMRIA  \
0  NACC869232         1      7     27   2015        64   
1  NACC869232         2      5     11   2018        67   
2  NACC105473         1      9     19   2005        56   
3  NACC105473         3     12     14   2005        56   
4  NACC792077         1      8     19   2005        69   

                                            NACCMRFI  
0                                        mri3468.zip  
1                                        mri8604.zip  
2  NACC105473_13466705895214219799509911350064674...  
3  NACC105473_13466705895214219799509911345809482...  
4  NACC792077_13466705891140826127662450041565297...  


In [48]:
# sanity check: the narrowed table must contain exactly the patients that
# remain in dic_sorted. Enforce this instead of only printing the number.
remaining_patients = df_narrow['NACCID'].unique()
assert len(remaining_patients) == len(dic_sorted), (
    "narrowed table has {} patients, expected {}".format(
        len(remaining_patients), len(dic_sorted)
    )
)
print(len(remaining_patients))

1612


In [47]:
# Writing is switched off by default so that re-running the notebook does not
# overwrite an existing file. Set the flag to True to (re)create the CSV.
# Previously the write was commented out while the success message was still
# printed, which reported a write that never happened.
SAVE_NARROWED_CSV = False

if SAVE_NARROWED_CSV:
    df_final.to_csv(writePath + 'MRIT1_longitudinal_narrow_v1.csv', index=False)
    print("Data writing to CSV complete!")
else:
    print("Skipped writing CSV (SAVE_NARROWED_CSV is False).")

Data writing to CSV complete!


### Narrowing by matching MRIT1 scan dates and UDS visits
I will keep only the patients whose MRIT1 scan was taken within 1 year of a UDS visit. That way I can match the scan results to the corresponding diagnosis.