In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import altair

In [2]:
# Read one CSV per cohort; the order of the file list matches the unpacking order.
cohort_csv_files = [
    'healthy_data.csv',
    'PD_MCI_LBD_data.csv',
    'PD_data.csv',
    'AD_MCI_data.csv',
]
df_healthy, df_PD_MCI_LBD, df_PD, df_AD_MCI = [
    pd.read_csv(csv_file) for csv_file in cohort_csv_files
]

In [3]:
# Use the 'group' column as the row index of every cohort frame.
# The guard keeps this cell idempotent: after the first run 'group' is no
# longer a column, and an unguarded second run would raise KeyError.
for _cohort_frame in (df_AD_MCI, df_healthy, df_PD, df_PD_MCI_LBD):
    if 'group' in _cohort_frame.columns:
        _cohort_frame.set_index('group', inplace=True)

In [4]:
# Cohort labels, one per loaded data frame.
groups = [
    'AD_MCI',
    'healthy',
    'PD',
    'PD_MCI_LBD',
]

In [5]:
# Confirm every cohort frame has exactly the same column headers, in the same order
reference_headers = list(df_AD_MCI.columns)
all(
    list(cohort_frame.columns) == reference_headers
    for cohort_frame in (df_healthy, df_PD, df_PD_MCI_LBD)
)

True

In [6]:
# Full column headers (shared by all cohort frames) as a plain Python list.
list_columns = df_AD_MCI.columns.tolist()

In [7]:
def reversed_delimited_tuple(string,delimiter='|'):
    """Split `string` on `delimiter` and return the pieces in reverse order.

    Despite the name, the return value is a list, so index 0 is the last
    delimited field of the input.
    """
    pieces = string.split(delimiter)
    pieces.reverse()
    return pieces

In [8]:
# Short protein name = the last '|'-delimited field of each column header
list_proteins = [
    reversed_delimited_tuple(column_header)[0] for column_header in list_columns
]

In [9]:
# Confirm this list is unique: print every short name that occurs more than
# once. A single pass with sets replaces the quadratic `list.count` scan and
# reports each duplicate once instead of once per occurrence.
_seen_proteins = set()
_reported_duplicates = set()
for elem in list_proteins:
    if elem in _seen_proteins and elem not in _reported_duplicates:
        print(elem)
        _reported_duplicates.add(elem)
    _seen_proteins.add(elem)

In [10]:
# Map each full column header to its short protein identifier
dict_proteins = dict(zip(list_columns, list_proteins))

In [11]:
# Replace the long column headers with the short protein names in every cohort frame.
df_AD_MCI, df_healthy, df_PD, df_PD_MCI_LBD = [
    cohort_frame.rename(columns=dict_proteins)
    for cohort_frame in (df_AD_MCI, df_healthy, df_PD, df_PD_MCI_LBD)
]

In [12]:
# Preview the first five rows after renaming the columns.
df_AD_MCI.head(n=5)

Unnamed: 0_level_0,KV37,LV469,LV861,LVX54,LV746,LV218,LV316,LV312,LV310,LV39,...,EMIL3,ABCD2,TEN1,PCDAD,ITM2B,ADSV,A0A1W2PRN1,APOF,DCBD2,LMF2
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AD_MCI_TPAD0210,28.898373,24.125336,22.676771,20.327832,25.570557,24.797337,23.543465,22.55549,22.78356,23.245883,...,20.373782,19.310236,17.856492,18.453124,,20.879716,24.781034,,,
AD_MCI_TPAD0262,28.915048,22.824361,23.197649,19.990306,25.316286,23.571908,24.051694,22.300534,23.046184,24.363683,...,17.709244,,16.816413,17.987179,,,,19.805173,21.125776,
AD_MCI_TPAD0266,28.274292,24.358802,24.066243,23.589011,26.241142,26.079288,24.586944,24.266694,23.910209,25.419193,...,20.682527,,,,,,,,,
AD_MCI_TPAD0273,29.193195,22.187373,23.550432,20.329372,25.789922,24.700275,23.881087,22.998723,22.4309,23.599526,...,20.929981,22.488293,17.568634,16.772132,16.934531,18.446332,24.750659,,,
AD_MCI_TPAD0292,28.212282,22.520379,24.50462,20.68165,26.287648,25.020801,23.176368,23.293055,22.188477,22.789873,...,21.209263,,18.116146,18.547994,18.692276,,,20.264877,21.677769,16.254727


In [13]:
def gen_volcano_tuple(test_frame,control_frame,proteins=None):
    """Build one volcano-plot record per protein comparing two cohorts.

    Parameters
    ----------
    test_frame : pandas.DataFrame
        Samples of the condition of interest, one column per protein.
    control_frame : pandas.DataFrame
        Samples of the control cohort, with the same protein columns.
    proteins : iterable of str, optional
        Column names to evaluate. Defaults to every column of `test_frame`,
        so the function no longer depends on a notebook-level global.

    Returns
    -------
    list of list
        One row per protein:
        ['protein','avg_expr_cond','avg_expr_healthy','log2_FC','t_value','log10_p_value'],
        where 'log10_p_value' is -log10 of the Welch t-test p-value.
    """
    if proteins is None:
        proteins = list(test_frame.columns)

    datalist = []

    for protein in proteins:
        # Drop missing measurements before testing. `Series.mean()` already
        # skips NaN, but `ttest_ind` propagates it by default, which made the
        # t and p values NaN for every protein with a missing sample.
        test_values = test_frame[protein].dropna()
        control_values = control_frame[protein].dropna()

        test_mean = test_values.mean()
        control_mean = control_values.mean()

        # Welch's t-test (unequal variances).
        t_stat, p_stat = scipy.stats.ttest_ind(test_values,control_values,equal_var=False)

        # NOTE(review): the intensities shown in the notebook (~16-29) look
        # already log-transformed; if so, the fold change should be the
        # difference of means rather than the log2 of their ratio -- confirm.
        log2_fold_change = np.log2(test_mean/control_mean)

        datalist.append([protein, test_mean, control_mean,
                        log2_fold_change,
                        float(t_stat), float(np.log10(p_stat))*-1])

    return datalist

In [14]:
# Compare every disease cohort against the healthy controls.
AD_MCI_volcano_datalist = gen_volcano_tuple(test_frame=df_AD_MCI, control_frame=df_healthy)
PD_volcano_datalist = gen_volcano_tuple(test_frame=df_PD, control_frame=df_healthy)
PD_MCI_LBD_volcano_datalist = gen_volcano_tuple(test_frame=df_PD_MCI_LBD, control_frame=df_healthy)


In [15]:
# Column layout of the rows produced by gen_volcano_tuple.
volcano_columns = ['protein','avg_expr_cond','avg_expr_healthy','log2_FC','t_value','log10_p_value']

df_volcano_AD_MCI = pd.DataFrame(AD_MCI_volcano_datalist, columns=volcano_columns)
df_volcano_PD = pd.DataFrame(PD_volcano_datalist, columns=volcano_columns)
df_volcano_PD_MCI = pd.DataFrame(PD_MCI_LBD_volcano_datalist, columns=volcano_columns)

In [16]:
# Label each protein by significance and direction of change.
# The cut-off 1.3 is on the -log10(p) scale (roughly p = 0.05).
significance_conditions = [
    (df_volcano_AD_MCI['log10_p_value'] < 1.3),
    (df_volcano_AD_MCI['log10_p_value'] > 1.3) & (df_volcano_AD_MCI['log2_FC'] < 0),
    (df_volcano_AD_MCI['log10_p_value'] > 1.3) & (df_volcano_AD_MCI['log2_FC'] > 0)
]
significance_categories = ['nosig', 'significant_downreg', 'significant_upreg']
# Rows matching none of the conditions (NaN p-value, -log10(p) exactly 1.3, or
# log2_FC exactly 0) fall through to `default`. Without an explicit string
# default, np.select uses the integer 0, which cannot be combined with the
# string labels (TypeError on recent NumPy).
df_volcano_AD_MCI['significance'] = np.select(
    significance_conditions, significance_categories, default='nosig'
)

In [17]:
# Preview the first five volcano rows, including the significance label.
df_volcano_AD_MCI.head(n=5)

Unnamed: 0,protein,avg_expr_cond,avg_expr_healthy,log2_FC,t_value,log10_p_value,significance
0,KV37,28.625901,28.757594,-0.006622,-1.370797,0.760079,nosig
1,LV469,22.950857,23.052196,-0.006356,-0.654448,0.288264,nosig
2,LV861,23.686,23.251352,0.02672,1.80792,1.129861,nosig
3,LVX54,20.674124,21.257603,-0.040153,-1.417459,0.793985,nosig
4,LV746,25.488773,25.680084,-0.010788,-1.470697,0.837352,nosig


In [21]:
# Volcano plot for AD/MCI vs healthy: fold change on x, -log10(p) on y.
ad_mci_encodings = {'x': 'log2_FC', 'y': 'log10_p_value'}
altair.Chart(df_volcano_AD_MCI).mark_circle().encode(**ad_mci_encodings)

In [20]:
# `chart` was never assigned in any cell, so on a fresh kernel this cell raised
# NameError. Build it explicitly: the AD/MCI volcano plot coloured by the
# 'significance' label computed above.
chart = altair.Chart(df_volcano_AD_MCI).mark_circle().encode(
    x='log2_FC:Q',
    y='log10_p_value:Q',
    color='significance:N',
    tooltip=['protein', 'log2_FC', 'log10_p_value'])
chart

In [35]:
# Volcano plot for PD vs healthy.
# df_volcano_PD has no 'significant_0.05' column (and an unescaped dot in a
# field name is read as nested-field access), so the colour flag is derived
# inside the chart from the -log10(p) cut-off of 1.3 (roughly p = 0.05).
altair.Chart(df_volcano_PD).transform_calculate(
    significance="datum.log10_p_value > 1.3 ? 'significant' : 'not significant'"
).mark_point().encode(
    x='log2_FC:Q',
    y='log10_p_value:Q',
    color='significance:N')

In [24]:
# Volcano plot for PD-MCI/LBD vs healthy.
# df_volcano_PD_MCI has no 'significant_0.05' column, so Altair cannot infer a
# type for it; the colour flag is derived inside the chart from the
# -log10(p) cut-off of 1.3 (roughly p = 0.05).
altair.Chart(df_volcano_PD_MCI).transform_calculate(
    significance="datum.log10_p_value > 1.3 ? 'significant' : 'not significant'"
).mark_point().encode(
    x='log2_FC:Q',
    y='log10_p_value:Q',
    color='significance:N')