12-17
The goal here is to select groups from the sequences to use as cohorts for feature extraction at different time steps.

In [1]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Gathering some statistics of: 

1. How many individuals have a baseline MCI diagnosis.
2. Individuals who progressed to AD at some point later in the sequence (presymptomatic AD).
3. Individuals who maintained an MCI diagnosis throughout their surveillance (non-AD dementia, etc.).

# 1-23 Update
I am now including CN as well: stable CN sequences are added to the nonprogressors, and CN-to-MCI sequences are added to the progressors. CN-to-MCI-to-AD sequences will not be included.

In [7]:
# Load the diagnosis-sequence combinations together with how often each occurs.
response_combinations_df = pd.read_csv('response_combinations.csv')

# 'Responses' is stored as the text of a Python list. Parse it with
# ast.literal_eval instead of eval so file contents can never execute code.
response_combinations_df['Responses'] = response_combinations_df['Responses'].apply(ast.literal_eval)


def stayed_mci(responses):
    """Return True when every diagnosis in the sequence is 'MCI'."""
    return all(response == 'MCI' for response in responses)


def transitioned_to_ad(responses):
    """Return True when the sequence contains 'AD' and nothing other than 'MCI'/'AD'.

    NOTE(review): this also accepts sequences that revert (e.g. MCI, AD, MCI);
    confirm that reverters are meant to count as progressors.
    """
    return 'AD' in responses and all(response in ('MCI', 'AD') for response in responses)


# Keep sequences with at least two visits whose first diagnosis is MCI.
# The length check runs first so an empty sequence cannot raise IndexError.
starts_with_mci = response_combinations_df['Responses'].apply(
    lambda x: len(x) >= 2 and x[0] == 'MCI'
).astype(bool)
mci_start_df = response_combinations_df[starts_with_mci]

# Evaluate each predicate once; .astype(bool) keeps the masks valid even when
# the filtered frame is empty (apply would otherwise yield an object Series).
stayed_mask = mci_start_df['Responses'].apply(stayed_mci).astype(bool)
transitioned_mask = mci_start_df['Responses'].apply(transitioned_to_ad).astype(bool)
visit_counts = mci_start_df['Responses'].apply(len)

# Overall totals, weighted by how many individuals share each sequence.
stayed_mci_count = mci_start_df.loc[stayed_mask, 'Counts'].sum()
transitioned_to_ad_count = mci_start_df.loc[transitioned_mask, 'Counts'].sum()

# Output the results
print(f"Number of sequences that started with MCI and stayed MCI: {stayed_mci_count}")
print(f"Number of sequences that started with MCI and transitioned to AD: {transitioned_to_ad_count}")

stay_mci_counts = {}
transitioned_to_ad_counts = {}

# Break the same two totals down by number of visits in the sequence.
for length in visit_counts.unique():
    length_df = mci_start_df[visit_counts == length]
    length_responses = length_df['Responses']

    stay_mci_counts[length] = length_df.loc[
        length_responses.apply(stayed_mci).astype(bool), 'Counts'
    ].sum()
    transitioned_to_ad_counts[length] = length_df.loc[
        length_responses.apply(transitioned_to_ad).astype(bool), 'Counts'
    ].sum()

# Output the results
for length in sorted(stay_mci_counts.keys()):
    print(f"{length} visits - Stayed MCI: {stay_mci_counts[length]}, Transitioned to AD: {transitioned_to_ad_counts[length]}")


Number of sequences that started with MCI and stayed MCI: 877
Number of sequences that started with MCI and transitioned to AD: 1523
2 visits - Stayed MCI: 531, Transitioned to AD: 323
3 visits - Stayed MCI: 188, Transitioned to AD: 305
4 visits - Stayed MCI: 87, Transitioned to AD: 269
5 visits - Stayed MCI: 35, Transitioned to AD: 174
6 visits - Stayed MCI: 13, Transitioned to AD: 122
7 visits - Stayed MCI: 6, Transitioned to AD: 109
8 visits - Stayed MCI: 9, Transitioned to AD: 72
9 visits - Stayed MCI: 3, Transitioned to AD: 49
10 visits - Stayed MCI: 1, Transitioned to AD: 37
11 visits - Stayed MCI: 0, Transitioned to AD: 25
12 visits - Stayed MCI: 1, Transitioned to AD: 17
13 visits - Stayed MCI: 1, Transitioned to AD: 9
14 visits - Stayed MCI: 0, Transitioned to AD: 5
15 visits - Stayed MCI: 1, Transitioned to AD: 3
16 visits - Stayed MCI: 0, Transitioned to AD: 3
17 visits - Stayed MCI: 0, Transitioned to AD: 1
18 visits - Stayed MCI: 1, Transitioned to AD: 0


In [9]:
# Reload the sequence combinations so this cell works from its own parsed copy.
response_combinations_df = pd.read_csv('response_combinations.csv')

# 'Responses' is stored as the text of a Python list. Parse it with
# ast.literal_eval instead of eval so file contents can never execute code.
response_combinations_df['Responses'] = response_combinations_df['Responses'].apply(ast.literal_eval)

SAMPLE_SEED = 42  # fixed seed so the example rows printed below are reproducible
N_EXAMPLES = 5    # number of example sequences to print per cohort


def is_nonprogressor(responses):
    """Return True when every diagnosis equals the first one (stable CN or stable MCI)."""
    return all(response == responses[0] for response in responses)


def is_progressor(responses):
    """Return True for CN->MCI or MCI->AD sequences without any other diagnosis.

    A CN-start sequence qualifies when it contains 'MCI' and only 'CN'/'MCI';
    an MCI-start sequence qualifies when it contains 'AD' and only 'MCI'/'AD'.
    CN->MCI->AD sequences are therefore excluded.

    NOTE(review): sequences that revert (e.g. CN, MCI, CN) also satisfy this;
    confirm that reverters are meant to count as progressors.
    """
    if not responses:
        return False
    first = responses[0]
    if first == 'CN':
        return 'MCI' in responses and all(response in ('CN', 'MCI') for response in responses)
    if first == 'MCI':
        return 'AD' in responses and all(response in ('MCI', 'AD') for response in responses)
    return False


# Keep sequences with at least two visits that start with MCI or CN.
# The length check runs first so an empty sequence cannot raise IndexError.
starts_with_mci_or_cn = response_combinations_df['Responses'].apply(
    lambda x: len(x) >= 2 and x[0] in ('MCI', 'CN')
).astype(bool)
mci_cn_start_df = response_combinations_df[starts_with_mci_or_cn]

# Nonprogressors: sequences that stayed the same (either all MCI or all CN)
nonprogressors_df = mci_cn_start_df[mci_cn_start_df['Responses'].apply(is_nonprogressor).astype(bool)]

# Progressors: sequences that moved from CN to MCI or from MCI to AD
progressors_df = mci_cn_start_df[mci_cn_start_df['Responses'].apply(is_progressor).astype(bool)]

# Count sequences in each category, weighted by how many individuals share each sequence
nonprogressors_count = nonprogressors_df['Counts'].sum()
progressors_count = progressors_df['Counts'].sum()

# Output the results
print(f"Number of nonprogressor sequences: {nonprogressors_count}")
print(f"Number of progressor sequences: {progressors_count}")

nonprogressors_counts = {}
progressors_counts = {}

# Break both totals down by number of visits in the sequence.
visit_counts = mci_cn_start_df['Responses'].apply(len)
for length in visit_counts.unique():
    length_df = mci_cn_start_df[visit_counts == length]
    length_responses = length_df['Responses']

    nonprogressors_counts[length] = length_df.loc[
        length_responses.apply(is_nonprogressor).astype(bool), 'Counts'
    ].sum()
    progressors_counts[length] = length_df.loc[
        length_responses.apply(is_progressor).astype(bool), 'Counts'
    ].sum()

# Output the results
for length in sorted(nonprogressors_counts.keys()):
    print(f"Length {length}: Nonprogressors: {nonprogressors_counts[length]}, Progressors: {progressors_counts[length]}")

# Show a few example rows per cohort. n is capped at the cohort size because
# DataFrame.sample raises ValueError when asked for more rows than exist.
print("\nRandom nonprogressor sequences:")
print(nonprogressors_df.sample(n=min(N_EXAMPLES, len(nonprogressors_df)), random_state=SAMPLE_SEED))

print("\nRandom progressor sequences:")
print(progressors_df.sample(n=min(N_EXAMPLES, len(progressors_df)), random_state=SAMPLE_SEED))

Number of nonprogressor sequences: 12292
Number of progressor sequences: 2265
Length 2: Nonprogressors: 3305, Progressors: 491
Length 3: Nonprogressors: 1860, Progressors: 413
Length 4: Nonprogressors: 1569, Progressors: 367
Length 5: Nonprogressors: 1173, Progressors: 245
Length 6: Nonprogressors: 1006, Progressors: 199
Length 7: Nonprogressors: 790, Progressors: 151
Length 8: Nonprogressors: 511, Progressors: 107
Length 9: Nonprogressors: 454, Progressors: 76
Length 10: Nonprogressors: 359, Progressors: 66
Length 11: Nonprogressors: 321, Progressors: 49
Length 12: Nonprogressors: 265, Progressors: 37
Length 13: Nonprogressors: 216, Progressors: 24
Length 14: Nonprogressors: 141, Progressors: 10
Length 15: Nonprogressors: 100, Progressors: 14
Length 16: Nonprogressors: 90, Progressors: 7
Length 17: Nonprogressors: 75, Progressors: 6
Length 18: Nonprogressors: 43, Progressors: 2
Length 19: Nonprogressors: 14, Progressors: 1
Length 20: Nonprogressors: 0, Progressors: 0

Random nonprogre

# Making dataframes for each time step
1. NACCIDs
2. 

In [5]:
# Export, for each sequence length, the NACCIDs of MCI-start individuals who
# stayed MCI (nonprogressors) and who transitioned to AD (progressors).
# Relies on mci_start_df, stay_mci_counts and transitioned_to_ad_counts from
# the MCI cohort-count cell; run that cell first.

SAMPLE_SEED = 42  # fixed seed so the example rows printed below are reproducible
N_EXAMPLES = 5    # number of example sequences to print per cohort


def _all_mci(responses):
    """Return True when every diagnosis in the sequence is 'MCI'."""
    return all(response == 'MCI' for response in responses)


def _mci_to_ad(responses):
    """Return True when the sequence contains 'AD' and nothing other than 'MCI'/'AD'."""
    return 'AD' in responses and all(response in ('MCI', 'AD') for response in responses)


def _parse_id_list(value):
    """Return a stringified list as a real list; leave any other value unchanged.

    NOTE(review): 'NACCID' comes straight from read_csv and is never parsed, so
    if it holds stringified lists, explode() below would otherwise be a no-op.
    This assumes that format - confirm against response_combinations.csv.
    """
    if isinstance(value, str) and value.strip().startswith('['):
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return value
    return value


# Initialize dictionaries to store NACCIDs
stay_mci_naccids = {}
transitioned_to_ad_naccids = {}

# Iterate over each unique length of responses
visit_counts = mci_start_df['Responses'].apply(len)
for length in visit_counts.unique():
    # Filter sequences of the current length
    length_df = mci_start_df[visit_counts == length]
    length_responses = length_df['Responses']

    # Extract NACCIDs for sequences that stayed MCI
    stayed_ids = length_df.loc[length_responses.apply(_all_mci).astype(bool), 'NACCID']
    stay_mci_naccids[length] = stayed_ids.apply(_parse_id_list).explode().unique()

    # Extract NACCIDs for sequences that transitioned to AD, excluding mixed sequences
    progressed_ids = length_df.loc[length_responses.apply(_mci_to_ad).astype(bool), 'NACCID']
    transitioned_to_ad_naccids[length] = progressed_ids.apply(_parse_id_list).explode().unique()

# Save NACCIDs to CSV files (file names kept as-is, since other notebooks may read them)
for length in stay_mci_naccids.keys():
    # Save NACCIDs for sequences that stayed MCI
    pd.DataFrame(stay_mci_naccids[length], columns=['NACCID']).to_csv(f'{length}visit_nonprogressor.csv', index=False)

    # Save NACCIDs for sequences that transitioned to AD
    pd.DataFrame(transitioned_to_ad_naccids[length], columns=['NACCID']).to_csv(f'{length}visit_progressors.csv', index=False)

# Output the results
for length in sorted(stay_mci_counts.keys()):
    print(f"{length} visits - Stayed MCI: {stay_mci_counts[length]}, Transitioned to AD: {transitioned_to_ad_counts[length]}")

# Sample from the filtered DataFrames. The original called .sample() on the
# scalar totals (stayed_mci_count / transitioned_to_ad_count), which raises
# AttributeError. n is capped because sample() rejects n larger than the frame.
stayed_mci_df = mci_start_df[mci_start_df['Responses'].apply(_all_mci).astype(bool)]
transitioned_to_ad_df = mci_start_df[mci_start_df['Responses'].apply(_mci_to_ad).astype(bool)]

print("\nRandom nonprogressor sequences:")
print(stayed_mci_df.sample(n=min(N_EXAMPLES, len(stayed_mci_df)), random_state=SAMPLE_SEED))

print("\nRandom progressor sequences:")
print(transitioned_to_ad_df.sample(n=min(N_EXAMPLES, len(transitioned_to_ad_df)), random_state=SAMPLE_SEED))

2 visits - Stayed MCI: 531, Transitioned to AD: 323
3 visits - Stayed MCI: 188, Transitioned to AD: 305
4 visits - Stayed MCI: 87, Transitioned to AD: 269
5 visits - Stayed MCI: 35, Transitioned to AD: 174
6 visits - Stayed MCI: 13, Transitioned to AD: 122
7 visits - Stayed MCI: 6, Transitioned to AD: 109
8 visits - Stayed MCI: 9, Transitioned to AD: 72
9 visits - Stayed MCI: 3, Transitioned to AD: 49
10 visits - Stayed MCI: 1, Transitioned to AD: 37
11 visits - Stayed MCI: 0, Transitioned to AD: 25
12 visits - Stayed MCI: 1, Transitioned to AD: 17
13 visits - Stayed MCI: 1, Transitioned to AD: 9
14 visits - Stayed MCI: 0, Transitioned to AD: 5
15 visits - Stayed MCI: 1, Transitioned to AD: 3
16 visits - Stayed MCI: 0, Transitioned to AD: 3
17 visits - Stayed MCI: 0, Transitioned to AD: 1
18 visits - Stayed MCI: 1, Transitioned to AD: 0
