In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import altair as alt

In [2]:
# Load the raw per-cohort CSV exports from the BrainPowerChemE/brainpowerdata
# GitHub repository. Every read goes over the network, so this cell needs
# internet access; "?raw=true" makes GitHub serve the file itself rather than
# the HTML page that wraps it.
df_AD_MCI = pd.read_csv(
    filepath_or_buffer='https://github.com/BrainPowerChemE/brainpowerdata/blob/main/raw_data/AD_MCI_data.csv?raw=true'
)
df_healthy = pd.read_csv(
    filepath_or_buffer='https://github.com/BrainPowerChemE/brainpowerdata/blob/main/raw_data/healthy_data.csv?raw=true'
)
df_PD = pd.read_csv(
    filepath_or_buffer='https://github.com/BrainPowerChemE/brainpowerdata/blob/main/raw_data/PD_data.csv?raw=true'
)
df_PD_MCI_LBD = pd.read_csv(
    filepath_or_buffer='https://github.com/BrainPowerChemE/brainpowerdata/blob/main/raw_data/PD_MCI_LBD_data.csv?raw=true'
)

In [3]:
# Use the 'group' column as the row index of every cohort frame.
# The frames are rebound rather than mutated with inplace=True; the resulting
# frames are the same. Note that re-running this cell still fails, because
# 'group' is no longer a column once it has become the index.
df_AD_MCI, df_healthy, df_PD, df_PD_MCI_LBD = (
    cohort_frame.set_index('group')
    for cohort_frame in (df_AD_MCI, df_healthy, df_PD, df_PD_MCI_LBD)
)

In [5]:
# Labels of the four cohorts, one per loaded data file.
groups = [
    'AD_MCI',
    'healthy',
    'PD',
    'PD_MCI_LBD',
]

In [6]:
# Confirm the headers of all the dataframes are the same (same labels, same
# order) by comparing every other cohort against the AD/MCI frame.
all(
    list(cohort_frame.columns) == list(df_AD_MCI.columns)
    for cohort_frame in (df_healthy, df_PD, df_PD_MCI_LBD)
)

True

In [7]:
# Column headers as a plain Python list; the AD/MCI frame is used as the
# reference for all cohorts.
list_columns = df_AD_MCI.columns.tolist()

In [8]:
def reversed_delimited_tuple(string,delimiter='|'):
    """Split ``string`` on ``delimiter`` and return the pieces in reverse order.

    Despite the function name, the result is a ``list``, not a ``tuple``.

    Parameters
    ----------
    string : str
        Text to split.
    delimiter : str, optional
        Separator passed to ``str.split``; defaults to ``'|'``.

    Returns
    -------
    list of str
        The delimited fields, last field first.
    """
    return list(reversed(string.split(delimiter)))

In [9]:
# Short protein identifiers: the last '|'-delimited field of each column title,
# kept in the same order as list_columns.
list_proteins = [
    reversed_delimited_tuple(column_title)[0]
    for column_title in list_columns
]

In [10]:
# Confirm this list is unique: print every short identifier that occurs more
# than once (no output means all identifiers are unique).
# A single pass with sets replaces a list.count() scan per element, and each
# duplicated identifier is reported once instead of once per occurrence.
seen_proteins = set()
reported_duplicates = set()
for elem in list_proteins:
    if elem in seen_proteins and elem not in reported_duplicates:
        print(elem)
        reported_duplicates.add(elem)
    seen_proteins.add(elem)

In [11]:
# Map each extended column title to its short protein identifier.
# list_columns and list_proteins are index-aligned, so pairing them
# positionally reproduces the title -> identifier relationship.
dict_proteins = dict(zip(list_columns, list_proteins))

In [12]:
# Replace the extended column titles with the short protein identifiers in
# every cohort frame.
df_AD_MCI, df_healthy, df_PD, df_PD_MCI_LBD = (
    cohort_frame.rename(columns=dict_proteins)
    for cohort_frame in (df_AD_MCI, df_healthy, df_PD, df_PD_MCI_LBD)
)

In [13]:
# For each cohort, report how many protein columns contain at least one NaN.
# isna().any() flags such columns; summing the boolean flags counts them.
print('AD_MCI_NaN:', int(df_AD_MCI.isna().any().sum()))
print('healthy_NaN:', int(df_healthy.isna().any().sum()))
print('PD_NaN:', int(df_PD.isna().any().sum()))
print('PD_MCI_NaN:', int(df_PD_MCI_LBD.isna().any().sum()))

AD_MCI_NaN: 192
healthy_NaN: 193
PD_NaN: 204
PD_MCI_NaN: 204


In [14]:
def gen_volcano_tuple(test_frame,control_frame,proteins=None):
    """Compute per-protein volcano-plot statistics for two cohorts.

    Each protein column of ``test_frame`` is compared against the same column
    of ``control_frame`` with an unequal-variance (Welch) t-test
    (``equal_var=False``); missing values are ignored (``nan_policy='omit'``).

    Parameters
    ----------
    test_frame : pandas.DataFrame
        Measurements of the cohort under study, indexable by protein name.
    control_frame : pandas.DataFrame
        Measurements of the reference cohort, indexable by protein name.
    proteins : iterable of str, optional
        Column labels to evaluate. When omitted, the notebook-level
        ``list_proteins`` is used, which is the original behaviour.

    Returns
    -------
    list of list
        One row per protein, in the order
        ``[protein, avg_expr_cond, avg_expr_healthy, log2_FC, t_value,
        -log10_p_value]``. ``avg_expr_healthy`` is the mean of
        ``control_frame``, whatever cohort that is.
    """
    if proteins is None:
        # Backward-compatible default: fall back to the global protein list.
        proteins = list_proteins

    datalist = []

    for protein in proteins:
        test_values = test_frame[protein]
        control_values = control_frame[protein]

        t_stat, p_stat = scipy.stats.ttest_ind(
            test_values,
            control_values,
            equal_var=False,
            nan_policy='omit'
        )

        # Compute each mean once and reuse it for the fold change.
        test_mean = test_values.mean()
        control_mean = control_values.mean()

        # NOTE(review): log2_FC is the log2 of the ratio of the two means. If
        # the expression values are already log-transformed, a difference of
        # means may be what is intended -- confirm against the data source.
        log2_fold_change = np.log2(test_mean/control_mean)

        datalist.append(
            [protein, test_mean, control_mean,
             log2_fold_change,
             float(t_stat), -float(np.log10(p_stat))]
        )

    return datalist

In [15]:
# Per-protein statistics for each comparison of interest. The first three
# compare a disease cohort against healthy controls; the last compares
# PD_MCI_LBD against PD.
AD_MCI_volcano_datalist = gen_volcano_tuple(
    test_frame=df_AD_MCI, control_frame=df_healthy
)
PD_volcano_datalist = gen_volcano_tuple(
    test_frame=df_PD, control_frame=df_healthy
)
PD_MCI_LBD_volcano_datalist = gen_volcano_tuple(
    test_frame=df_PD_MCI_LBD, control_frame=df_healthy
)
PD_vs_MCI_volcano_datalist = gen_volcano_tuple(
    test_frame=df_PD_MCI_LBD, control_frame=df_PD
)

In [16]:
# Turn each list of per-protein rows into a labelled DataFrame.
# In df_volcano_PD_MCI_vs_PD the 'avg_expr_healthy' column holds the averages
# of the PD cohort, because PD is the control frame of that comparison.
df_volcano_AD_MCI, df_volcano_PD, df_volcano_PD_MCI, df_volcano_PD_MCI_vs_PD = (
    pd.DataFrame(
        data=volcano_rows,
        columns=['protein','avg_expr_cond','avg_expr_healthy','log2_FC','t_value','-log10_p_value'],
    )
    for volcano_rows in (
        AD_MCI_volcano_datalist,
        PD_volcano_datalist,
        PD_MCI_LBD_volcano_datalist,
        PD_vs_MCI_volcano_datalist,
    )
)

In [17]:
# Spot-check a single protein's row in the AD/MCI comparison.
df_volcano_AD_MCI.loc[lambda rows: rows['protein'].eq('ABCD2')]

Unnamed: 0,protein,avg_expr_cond,avg_expr_healthy,log2_FC,t_value,-log10_p_value
1336,ABCD2,21.924592,22.01385,-0.005861,-0.300267,0.116285


In [18]:
# Labels written to the 'significance' column; the order matters because it is
# matched positionally with the condition list and with the chart colours.
significance_categories = [
    'nosig',
    'significant_downreg',
    'significant_upreg',
]

In [19]:
def gen_pandas_volcano_significance_column(frame, threshold=1.3, categories=None):
    """Classify every row of a volcano frame as not significant, down- or up-regulated.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide the columns ``'-log10_p_value'`` and ``'log2_FC'``.
    threshold : float, optional
        Cutoff on ``-log10_p_value``. The default 1.3 is approximately
        ``-log10(0.05)``.
    categories : sequence of str, optional
        Three labels in the order (not significant, significant and
        down-regulated, significant and up-regulated). When omitted, the
        notebook-level ``significance_categories`` is used, which is the
        original behaviour.

    Returns
    -------
    numpy.ndarray
        One label per row of ``frame``. Rows matching none of the three
        conditions (NaN p-value, score exactly at the threshold, or a fold
        change of exactly 0) get the "not significant" label.
    """
    if categories is None:
        # Backward-compatible default: fall back to the global label list.
        categories = significance_categories

    neg_log_p = frame['-log10_p_value']
    log2_fc = frame['log2_FC']
    is_significant = neg_log_p > threshold

    significance_conditions = [
        neg_log_p < threshold,
        is_significant & (log2_fc < 0),
        is_significant & (log2_fc > 0),
    ]
    # An explicit string default is required: np.select's built-in default is
    # the integer 0, which either turns into a stray '0' label or raises a
    # dtype error when combined with string choices.
    return np.select(significance_conditions, categories, default=categories[0])

In [20]:
# Attach the significance label to every volcano frame. assign() calls the
# classifier with the frame itself and stores the result as 'significance'.
df_volcano_AD_MCI = df_volcano_AD_MCI.assign(
    significance=gen_pandas_volcano_significance_column
)
df_volcano_PD = df_volcano_PD.assign(
    significance=gen_pandas_volcano_significance_column
)
df_volcano_PD_MCI = df_volcano_PD_MCI.assign(
    significance=gen_pandas_volcano_significance_column
)
df_volcano_PD_MCI_vs_PD = df_volcano_PD_MCI_vs_PD.assign(
    significance=gen_pandas_volcano_significance_column
)

In [21]:
# Point colours for the volcano charts, paired positionally with the
# significance labels used as the colour-scale domain.
color_range = [
    'grey',
    'blue',
    'red',
]

In [22]:
def _build_volcano_chart(frame, title):
    """Build one interactive volcano scatter chart.

    Plots ``log2_FC`` on x against ``-log10_p_value`` on y, shows the protein
    name as tooltip, and colours points by the ``significance`` column using
    ``significance_categories`` as domain and ``color_range`` as range.
    """
    significance_color = alt.Color(
        'significance',
        scale=alt.Scale(domain=significance_categories, range=color_range),
    )
    return (
        alt.Chart(frame, title=title)
        .mark_point()
        .encode(
            x='log2_FC',
            y='-log10_p_value',
            tooltip='protein',
            color=significance_color,
        )
        .interactive()
    )


# One chart per comparison; all four share the same encoding and differ only
# in the data frame and the title.
chart_volcano_AD_MCI = _build_volcano_chart(
    df_volcano_AD_MCI,
    'Proteins in CSF of Alzheimers patients compared to healthy patients',
)

chart_volcano_PD = _build_volcano_chart(
    df_volcano_PD,
    'Proteins in CSF of Parkinsons patients compared to healthy patients',
)

chart_volcano_PD_MCI = _build_volcano_chart(
    df_volcano_PD_MCI,
    'Proteins in CSF of Parkinsons patients with mild cognitive impairment compared to healthy patients',
)

chart_volcano_PD_MCI_vs_PD = _build_volcano_chart(
    df_volcano_PD_MCI_vs_PD,
    'Proteins in CSF of Parkinsons patients with mild cognitive impairment compared to PD patients without MCI',
)

In [23]:
# Export every volcano chart as a standalone HTML file in the working directory.
for volcano_chart, html_path in (
    (chart_volcano_AD_MCI, 'chart_volcano_AD_MCI.html'),
    (chart_volcano_PD, 'chart_volcano_PD.html'),
    (chart_volcano_PD_MCI, 'chart_volcano_PD_MCI.html'),
    (chart_volcano_PD_MCI_vs_PD, 'chart_volcano_PD_MCI_vs_PD.html'),
):
    volcano_chart.save(html_path)

In [24]:
# Display the Alzheimers-vs-healthy volcano chart as this cell's output.
chart_volcano_AD_MCI