In [1]:
import warnings
# Suppress every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by the
# libraries used in later cells.
warnings.filterwarnings(action='ignore') # To show code clearly

In [2]:
import os
import csv
import scipy
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import GEOparse
# Load GEO series GSE39445. Later cells read the per-sample records from
# `gse.gsms` and the GPL15331 platform table from `gse.gpls`.
gse = GEOparse.get_GEO("GSE39445", annotate_gpl='GPL15331', silent=True)

### Pre-processing

In [4]:
# Blood transcriptome data: build one metadata row per GEO sample (GSM).
#
# Sample titles come in two layouts, told apart by the number of
# '_'-separated fields:
#   fewer than 4 fields -> subject is field 0, time point is field 2
#   4 or more fields    -> subject is field 1, time point is field 3
characteristic_keys = ['sleepprotocol', 'hoursawake', 'timesampletaken', 'circadianphase']
sample_info_dict = {}
for gsm_name, gsm in gse.gsms.items():
    # Parse each sample independently so that one malformed record is
    # reported and skipped instead of silently ending the whole loop.
    try:
        title_fields = gsm.metadata["title"][0].split('_')
        if len(title_fields) < 4:
            subject, timepoint = title_fields[0], title_fields[2]
        else:
            subject, timepoint = title_fields[1], title_fields[3]

        # Fresh dict per sample: a characteristic that is absent stays None
        # rather than inheriting the value parsed for the previous sample.
        characteristics = {}
        for entry in gsm.metadata["characteristics_ch1"]:
            fields = entry.split(': ')
            if fields[0] in characteristic_keys:
                characteristics[fields[0]] = fields[1]

        sample_info_dict[gsm.metadata["geo_accession"][0]] = [
            subject,
            characteristics.get('sleepprotocol'),
            timepoint,
            characteristics.get('hoursawake'),
            characteristics.get('timesampletaken'),
            characteristics.get('circadianphase'),
        ]
    except (KeyError, IndexError) as err:
        print(gsm_name, repr(err))

col = ['subject', 'sleepprotocol', 'timepoint', 'hoursawake', 'timesampletaken', 'circadianphase']
sample_info = pd.DataFrame.from_dict(sample_info_dict, orient='index', columns=col)

# Manual curation. A single .loc[row, column] assignment is used because the
# chained form .loc[row][column] may write to a temporary copy. The membership
# check prevents .loc from appending a new row if the sample was not parsed.
if 'GSM969077' in sample_info.index:
    sample_info.loc['GSM969077', 'circadianphase'] = '0'

In [5]:
# Annotation table, indexed by probe ID (ID_REF).
# Platform rows without a GENE_SYMBOL are discarded before annotating.
# NOTE(review): `gsm` is whatever sample the metadata loop above left bound;
# it is only used here as a carrier for the ID_REF column.
platform_table = gse.gpls['GPL15331'].table
gpl = platform_table.dropna(subset=['GENE_SYMBOL'])
annot_table = (
    gsm.annotate(gpl, annotation_column="GENE_SYMBOL")
    .drop(columns=['VALUE'])
    .set_index('ID_REF')
)

In [6]:
# Sleep condition groups: GEO accessions of the samples in each protocol.
SE = sample_info[sample_info['sleepprotocol']=='Sleep Extension'].index.tolist()
SR = sample_info[sample_info['sleepprotocol']=='Sleep Restriction'].index.tolist()

# Pivot the series once and reuse it for both groups.
expression_all = gse.pivot_samples('VALUE') # Quantile normalized signal intensity

# Explicit copies: the per-group frames are modified by later cells and must
# not be tied to the shared pivot table.
SE_df = expression_all[SE].copy()
SR_df = expression_all[SR].copy()

In [7]:
def _collapse_probes_to_genes(expr, probe_to_gene):
    """Relabel probe rows with gene symbols and average rows sharing a symbol.

    Parameters
    ----------
    expr : pandas.DataFrame
        Expression matrix whose index holds probe IDs.
    probe_to_gene : dict
        Mapping from probe ID to gene symbol.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by gene symbol. Rows whose label is still an
        integer after renaming (probes without a mapping) are removed, and
        rows with the same symbol are averaged. `expr` is not modified.
    """
    renamed = expr.rename(index=probe_to_gene)
    # Labels that are still integers were not found in the mapping.
    unmapped = [label for label in renamed.index if isinstance(label, (int, np.integer))]
    renamed = renamed.drop(unmapped)
    return renamed.groupby(renamed.index).mean()


# Probe ID -> gene symbol lookup taken from the annotation table.
annot_dict = {k:v['GENE_SYMBOL'] for k, v in annot_table.to_dict('index').items()}

SE_df = _collapse_probes_to_genes(SE_df, annot_dict) #19541
SR_df = _collapse_probes_to_genes(SR_df, annot_dict) #19541

In [8]:
def fillna(data, n_timepoints=10):
    """Return `data` laid out on columns T1..T<n_timepoints> with gaps filled.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per gene, columns labelled by time point ('T1', 'T2', ...).
        Columns outside T1..T<n_timepoints> are dropped; time points that are
        absent become all-NaN columns before filling.
    n_timepoints : int, default 10
        Number of time-point columns in the result.

    Returns
    -------
    pandas.DataFrame
        Same rows as `data`, columns T1..T<n_timepoints>. Missing values are
        filled per row along the time axis: interpolation first, then the
        previous time value, then the next time value.
    """
    timepoints = ['T'+str(i+1) for i in range(n_timepoints)]
    # Work on the transpose so that each column is one gene's time series.
    by_time = pd.DataFrame(data, columns=timepoints).T
    by_time = by_time.interpolate()
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    by_time = by_time.ffill() #fillna with previous time value
    by_time = by_time.bfill() #fillna with next time value
    return by_time.T

In [9]:
# Subject dataframe
# For every subject and protocol, publish a global named "<subject>_SE" or
# "<subject>_SR" holding that subject's samples, with the columns relabelled
# by time point.
group_sources = {
    "Sleep Extension": ('_SE', SE_df),
    "Sleep Restriction": ('_SR', SR_df),
}
for subject in set(sample_info['subject']) :
    for protocol, (suffix, group_expr) in group_sources.items():
        df = sample_info[(sample_info['sleepprotocol']==protocol)&(sample_info['subject']==subject)]
        subject_frame = group_expr[df.index]
        subject_frame.columns = df.timepoint
        vars()[subject+suffix] = subject_frame
# Sleep Extension data is missing in Subjects AF0079, AF0091.

# Gap-filled counterparts, published as "<subject>_<SE|SR>_filled".
for subject in set(sample_info['subject']) :
    for protocol in ["SE", "SR"]:
        frame_name = subject+'_'+protocol
        vars()[frame_name+'_filled'] = fillna(vars()[frame_name])

### DEG search

In [10]:
# Mixed-model ANOVA
import pingouin as pg

# Long-format table for the ANOVA: one row per (subject x protocol, time point)
# with columns subject, sleepprotocol, timepoint followed by one column per gene.
# Each subject/protocol pair gets its own subject label ("<subject>_SE" /
# "<subject>_SR").
df = sample_info[['subject','sleepprotocol','timepoint']]
anova_frames = []
for subject in set(sample_info.subject):
    # AF0079 and AF0091 are excluded (no Sleep Extension samples).
    if subject in ['AF0079','AF0091']:
        continue
    for protocol_name, suffix in (('Sleep Extension', 'SE'), ('Sleep Restriction', 'SR')):
        design_rows = df[(df['subject']==subject)&(df['sleepprotocol']==protocol_name)]
        # Outer merge keeps time points that exist only in the gap-filled
        # expression frame; their metadata cells are completed just below.
        merged = pd.merge(design_rows, vars()[subject+'_'+suffix+'_filled'].T, how='outer', left_on='timepoint', right_index=True)
        merged['subject'] = subject + '_' + suffix
        merged['sleepprotocol'] = merged['sleepprotocol'].fillna(protocol_name)
        anova_frames.append(merged)

# Concatenate once instead of growing DF inside the loop.
DF = pd.concat(anova_frames) if anova_frames else pd.DataFrame()

In [11]:
# Genes selected by the mixed ANOVA: read from the cached CSV (first row) when
# it exists, otherwise recomputed gene by gene.
anova_cache = '../data/SR-GSE39445/ANOVA762.csv'
if not os.path.isfile(anova_cache):
    DEG_anova = []
    for gene in DF.columns[3:]:
        gene_table = DF[['subject','sleepprotocol','timepoint',gene]]
        results = pg.mixed_anova(dv=gene, between='sleepprotocol', within='timepoint', subject='subject', data=gene_table)
        # NOTE(review): the FDR correction runs over the rows of this single
        # gene's ANOVA table, not across genes.
        reject, corrected_pval = pg.multicomp(results['p-unc'], method='fdr_bh')
        # Third row of the table; presumably the interaction term (confirm
        # against the pingouin output).
        if corrected_pval[2] < 0.05:
            DEG_anova.append(gene)
    print(len(DEG_anova)) #762
else :
    with open(anova_cache) as fr:
        DEG_anova = list(csv.reader(fr))[0]
    print(len(DEG_anova))

762


### Stationarity test

In [12]:
# Stationarity test (Augmented Dickey-Fuller test)
from statsmodels.tsa.stattools import adfuller

# Genes whose ADF p-value is >= 0.05 in at least one subject's Sleep Extension
# series: read from the cached CSV (first row) when it exists, otherwise
# recomputed.
nonstationary_cache = '../data/SR-GSE39445/nonstationary.csv'
if os.path.isfile(nonstationary_cache):
    with open(nonstationary_cache, 'r') as fr :
        nonstationary_list = list(csv.reader(fr))[0]
else :
    nonstationary = dict()
    for subject in set(sample_info['subject']) :
        df = vars()[subject+'_SE']
        if len(df.columns) != 10 :  # samples should have all time-points
            continue
        # Named `nonstationary_genes` rather than `sns` so the seaborn alias
        # imported at the top of the notebook is not overwritten.
        nonstationary_genes = []
        gene = None
        try :
            for gene in df.index:
                if adfuller(df.loc[gene])[1] >= 0.05 :
                    nonstationary_genes.append(gene)
        except Exception as err :
            # A failing series drops the whole subject from the result.
            print("X", subject, gene, err)
        else :
            nonstationary[subject] = nonstationary_genes

    # set().union(...) also works when no subject qualified; the result is a
    # list in both branches.
    nonstationary_list = list(set().union(*map(set, nonstationary.values())))

print(len(nonstationary_list)) #19529

19529


### Oscillation test

In [14]:
# Cosinor analysis - 24 periodicity
from CosinorPy import cosinor, cosinor1

# oscillation_SE / oscillation_SR: genes whose best cosinor model has p < 0.05,
# read from the cached CSVs (first row of each) when both exist, otherwise
# recomputed per subject and protocol.
oscillation_SE_cache = '../data/SR-GSE39445/oscillation_SE.csv'
oscillation_SR_cache = '../data/SR-GSE39445/oscillation_SR.csv'

if os.path.isfile(oscillation_SE_cache) and os.path.isfile(oscillation_SR_cache) :
    with open(oscillation_SE_cache,'r') as fr:
        oscillation_SE = list(csv.reader(fr))[0]

    with open(oscillation_SR_cache,'r') as fr:
        oscillation_SR = list(csv.reader(fr))[0]
    print(len(oscillation_SE), len(oscillation_SR)) #17783, 17912

else :
    cosinor_dict = {}
    for subject in set(sample_info['subject']) :
        cosinor_dict[subject] = {}
        for group in ["SE","SR"]:
            raw = vars()[subject+"_"+group]
            best_models = []
            if len(raw.columns) != 0 :  # skip subjects without samples in this protocol
                df = fillna(raw)
                for i in raw.index :
                    # NOTE(review): x assumes the 10 time points are evenly
                    # spread over 0-30 (hours, presumably) - confirm against
                    # the sampling protocol.
                    cosinor_df = pd.DataFrame({'x':np.linspace(0,30,10), 'y':df.loc[i].values, 'test':['test1']*10})
                    results = cosinor.fit_group(cosinor_df, period=24, plot=False)
                    df_best_models = cosinor.get_best_models(cosinor_df, results)
                    df_best_models['test'] = i  # label the row with the gene
                    best_models.append(df_best_models)
            # Concatenate once per subject/protocol instead of once per gene.
            cosinor_dict[subject][group] = pd.concat(best_models) if best_models else pd.DataFrame()

    oscillating = {"SE": [], "SR": []}
    for group in ["SE","SR"]:
        for subject, cosinor_res in cosinor_dict.items():
            if len(cosinor_res[group]) != 0 :
                oscillating[group].extend(cosinor_res[group][cosinor_res[group]['p'] < 0.05]['test'])
    oscillation_SE = oscillating["SE"]
    oscillation_SR = oscillating["SR"]

# Keep the Sleep Extension oscillating genes that are also in nonstationary_list.
oscillation_genes = set(oscillation_SE).intersection(set(nonstationary_list))
print(len(oscillation_genes)) #17771

17783 17912
17771


### Network analysis

In [15]:
# Gene co-expressed network
G = nx.Graph()
# DEGs = oscillating genes that are also ANOVA hits. Sorted so the gene order
# (and therefore the row order of every frame indexed with DEGs) is the same
# on every run; plain set iteration order is not.
DEGs = sorted(set(oscillation_genes).intersection(set(DEG_anova))) # You can get DEGs from ./data/SR-GSE39445/DEG732b.csv
G.add_nodes_from(DEGs) #732

732


In [19]:
# Phase dataframe
# For each time point T1..T10 and each protocol, publish two globals:
#   "<SE|SR>_T<k>"     -> accessions of the samples in that phase
#   "<SE|SR>_T<k>_df"  -> DEG rows of the protocol's expression frame for them
# `protocols` records the phase names in creation order.
protocols = []
protocol_codes = {'Sleep Extension': "SE", 'Sleep Restriction': "SR"}
for time in ['T%d' % k for k in range(1, 11)]:
    for sleep, protocol in protocol_codes.items():
        name = protocol+'_'+time
        in_phase = (sample_info.sleepprotocol == sleep) & (sample_info.timepoint == time)
        vars()[name] = sample_info[in_phase].index
        group_expr = vars()[protocol+'_df']
        vars()[name+'_df'] = group_expr[vars()[name]].loc[DEGs]
        protocols.append(name)

In [None]:
# Make the edges
from tqdm import tqdm
# Import the function explicitly: `import scipy` on its own does not guarantee
# that the scipy.stats submodule has been loaded.
from scipy.stats import pearsonr

# For every phase in `protocols`, publish three globals:
#   "<phase>_edges"          -> all gene pairs (weight 0 when p >= 0.05)
#   "<phase>_filtered_edge"  -> pairs with a non-zero weight
#   "<phase>_graph"          -> copy of G with the filtered weighted edges
for protocol in protocols:
    edge_cache = '../data/SR-GSE39445/network/{}_edges.csv'.format(protocol)
    if os.path.isfile(edge_cache):
        with open(edge_cache) as fr:
            edges = list(csv.reader(fr))[0]
        # Each cached cell is the text of a tuple; cells whose last element is
        # the literal 0 are skipped before parsing.
        # NOTE(review): eval() executes whatever the file contains. Only use
        # cache files you produced yourself; consider ast.literal_eval once the
        # cached format is confirmed to contain plain literals only.
        filtered_edge = [eval(edge) for edge in edges if edge.split(', ')[-1][:-1] != '0']

    else:
        df = globals()[protocol+'_df']
        genes = df.index
        n_genes = len(genes)
        rows = [df.iloc[k] for k in range(n_genes)]  # fetch each gene's row once
        edges = []
        for i in tqdm(range(n_genes)):
            # Start at i + 1 so each unordered pair is visited exactly once.
            for j in range(i + 1, n_genes):
                r, p = pearsonr(rows[i], rows[j])
                # Non-significant correlations are kept with weight 0.
                edges.append((genes[i], genes[j], r if p < 0.05 else 0))
        filtered_edge = [edge for edge in edges if edge[2] != 0]

    globals()[protocol+'_edges'] = edges
    globals()[protocol+'_filtered_edge'] = filtered_edge

    graph = G.copy() # nodes : 732
    graph.add_weighted_edges_from(filtered_edge)
    globals()[protocol+'_graph'] = graph

### Network topological features