In [1]:
import os
import csv
import scipy
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [11]:
import warnings

# Blanket-suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the analysis libraries.
warnings.simplefilter('ignore')

In [3]:
import GEOparse

# Retrieve and parse the GEO series record; everything below reads from `gse`.
gse = GEOparse.get_GEO(geo="GSE56931", annotate_gpl='GPL10379')

31-May-2024 16:07:12 DEBUG utils - Directory ./ already exists. Skipping.
31-May-2024 16:07:12 INFO GEOparse - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE56nnn/GSE56931/soft/GSE56931_family.soft.gz to ./GSE56931_family.soft.gz
100%|██████████| 89.1M/89.1M [00:17<00:00, 5.35MB/s]  
31-May-2024 16:07:31 DEBUG downloader - Size validation passed
31-May-2024 16:07:31 DEBUG downloader - Moving /tmp/tmpo5emhwiw to /data5/jypark/research/Transitionpoints/Sleep/GitHub/dynamics-in-sleep/src/GSE56931_family.soft.gz
31-May-2024 16:07:31 DEBUG downloader - Successfully downloaded ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE56nnn/GSE56931/soft/GSE56931_family.soft.gz
31-May-2024 16:07:31 INFO GEOparse - Parsing ./GSE56931_family.soft.gz: 
31-May-2024 16:07:31 DEBUG GEOparse - DATABASE: GeoMiame
31-May-2024 16:07:31 DEBUG GEOparse - SERIES: GSE56931
31-May-2024 16:07:31 DEBUG GEOparse - PLATFORM: GPL10379
31-May-2024 16:07:31 DEBUG GEOparse - SAMPLE: GSM1371587
31-May-2024 16:07:31 DEBUG GEOp

31-May-2024 16:07:43 DEBUG GEOparse - SAMPLE: GSM1371715
31-May-2024 16:07:43 DEBUG GEOparse - SAMPLE: GSM1371716
31-May-2024 16:07:44 DEBUG GEOparse - SAMPLE: GSM1371717
31-May-2024 16:07:44 DEBUG GEOparse - SAMPLE: GSM1371718
31-May-2024 16:07:44 DEBUG GEOparse - SAMPLE: GSM1371719
31-May-2024 16:07:44 DEBUG GEOparse - SAMPLE: GSM1371720
31-May-2024 16:07:44 DEBUG GEOparse - SAMPLE: GSM1371721
31-May-2024 16:07:44 DEBUG GEOparse - SAMPLE: GSM1371722
31-May-2024 16:07:44 DEBUG GEOparse - SAMPLE: GSM1371723
31-May-2024 16:07:44 DEBUG GEOparse - SAMPLE: GSM1371724
31-May-2024 16:07:44 DEBUG GEOparse - SAMPLE: GSM1371725
31-May-2024 16:07:45 DEBUG GEOparse - SAMPLE: GSM1371726
31-May-2024 16:07:45 DEBUG GEOparse - SAMPLE: GSM1371727
31-May-2024 16:07:45 DEBUG GEOparse - SAMPLE: GSM1371728
31-May-2024 16:07:45 DEBUG GEOparse - SAMPLE: GSM1371729
31-May-2024 16:07:45 DEBUG GEOparse - SAMPLE: GSM1371730
31-May-2024 16:07:45 DEBUG GEOparse - SAMPLE: GSM1371731
31-May-2024 16:07:45 DEBUG GEOp

### Pre-processing

In [4]:
# Blood transcriptome data: parse per-sample metadata out of each GSM title.
# A title is expected to hold six ', '-separated fields, in the order
# subject, responder, gender, day, timepoint, group.
sample_info_dict = {}
for gsm_name, gsm in gse.gsms.items():
    # The handler sits inside the loop so that one malformed title is reported
    # and skipped, instead of silently ending the parse for every later sample.
    try:
        subject, responder, gender, day, timepoint, group = gsm.metadata["title"][0].split(', ')
        subject = subject.split(' ')[1]      # keep the 2nd space-separated token
        responder = responder.split(' ')[0]  # keep the 1st space-separated token
        day = day[3:]                        # drop the 3-character prefix
        timepoint = timepoint[4:].strip()    # drop the 4-character prefix
        accession = gsm.metadata["geo_accession"][0]
    except (KeyError, IndexError, ValueError):
        print(gsm_name)  # name of the sample whose metadata could not be parsed
        continue
    sample_info_dict[accession] = [subject, group, gender, responder, day, timepoint]

# NOTE: `gsm` is deliberately left bound to the last sample visited; the
# annotation cell that follows calls gsm.annotate(...).
col = ['subject', 'group', 'gender', 'responder', 'day', 'timepoint']
sample_info = pd.DataFrame.from_dict(sample_info_dict, orient='index', columns=col)

In [5]:
# Annotation table: one row per ID_REF, carrying its GeneSymbol.
# NOTE(review): `gsm` is whatever sample the metadata loop in the previous cell
# left bound; presumably any sample works because they share the platform -- confirm.
platform_table = gse.gpls['GPL10379'].table
gpl = platform_table.dropna(subset=['GeneSymbol'])  # keep only rows that have a gene symbol
annot_table = (
    gsm.annotate(gpl, annotation_column="GeneSymbol")
    .drop(['VALUE'], axis=1)   # the expression value itself is not part of the annotation
    .set_index('ID_REF')
)

In [6]:
# Sleep condition groups: accession lists for the 'baseline' (BS) and
# 'sleepdep' (SR) samples, and the matching expression sub-matrices.
# Boolean masks (rather than a set difference) keep the accessions in
# sample_info order, so the column order of BS_df is identical on every run.
is_baseline = sample_info['group'] == 'baseline'
is_day_one = sample_info['day'] == '1'   # `day` is still a string at this point
BS = sample_info[is_baseline & ~is_day_one].index.tolist()  # baseline samples, day 1 excluded
SR = sample_info[sample_info['group'] == 'sleepdep'].index.tolist()

# Pivot once and reuse it: building the full sample matrix twice was redundant work.
expression = gse.pivot_samples('VALUE')  # Quantile normalized signal intensity
BS_df = expression[BS].copy()  # copies, so edits to BS_df / SR_df never touch the shared pivot
SR_df = expression[SR].copy()

In [7]:
def _collapse_probes_to_genes(expr, probe_to_gene):
    """Relabel rows with gene symbols and average the rows that share a symbol.

    Parameters
    ----------
    expr : pandas.DataFrame
        Expression matrix whose row labels are the keys of `probe_to_gene`.
    probe_to_gene : dict
        Maps a row label to its gene symbol; labels missing from the mapping
        are left unchanged by the rename.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per remaining label (mean over the rows that
        shared it). Rows whose label is still an integer after relabelling are
        dropped. `expr` itself is not modified.
    """
    relabelled = expr.rename(index=probe_to_gene)
    # isinstance also catches numpy integer labels, which an exact `type(x) == int` test misses.
    unmapped = [label for label in relabelled.index if isinstance(label, (int, np.integer))]
    mapped_only = relabelled.drop(index=unmapped)
    return mapped_only.groupby(mapped_only.index).mean()


# ID_REF -> GeneSymbol lookup taken from the annotation table.
annot_dict = annot_table['GeneSymbol'].to_dict()

# Same pipeline for both conditions; the trailing numbers are the author's recorded counts.
BS_df = _collapse_probes_to_genes(BS_df, annot_dict) #33475
SR_df = _collapse_probes_to_genes(SR_df, annot_dict) #33475

In [9]:
# Subject dataframe
# For every subject x condition this cell publishes an expression frame (rows:
# filtered gene labels, columns: phase) as a notebook global named
# '<subject>_<group>' with group in {BS, SR}. Later cells look these globals up
# by name, so the naming scheme must stay as it is.

# (day, timepoint) -> phase. Only day 2 (BS) and day 3 (SR) at timepoints
# 0..20 are mapped; any other combination raises KeyError below.
BS_phase = {(2,0):0, (2,4):4, (2,8):8, (2,12):12, (2,16):16, (2,20):20}
SR_phase = {(3,0):0, (3,4):4, (3,8):8, (3,12):12, (3,16):16, (3,20):20}

# Loop-invariant lookups, built once instead of once per subject.
group_frames = {"BS": BS_df, "SR": SR_df}
group_phase = {"BS": BS_phase, "SR": SR_phase}
group_mask = {"BS": sample_info.index.isin(BS), "SR": sample_info.index.isin(SR)}
# Row labels to keep: everything not containing '_at', 'tcag' or 'hCG'.
gene_rows = {
    name: [gene for gene in frame.index if not any(tag in gene for tag in ('_at', 'tcag', 'hCG'))]
    for name, frame in group_frames.items()
}

for subject in sorted(set(sample_info['subject'])):
    is_subject = (sample_info['subject'] == subject).to_numpy()
    for group in ["BS", "SR"]:
        # .copy() yields an independent frame, so the column edits below are
        # not chained assignments on a slice of sample_info.
        df = sample_info[is_subject & group_mask[group]].copy()
        df['day'] = [int(d) for d in df['day']]
        df['timepoint'] = [int(t) for t in df['timepoint']]
        df = df.sort_values(by=['day', 'timepoint'])
        # A plain lookup per row; unlike a row-wise apply it also behaves when
        # the subject has no samples in this group (empty frame).
        df['phase'] = [group_phase[group][key] for key in zip(df['day'], df['timepoint'])]

        idx = gene_rows[group]
        subject_expr = group_frames[group][df.index].loc[idx] #18661
        subject_expr.columns = df['phase']  # relabel sample columns by phase
        globals()[subject + '_' + group] = subject_expr

### DEG search

In [12]:
# Mixed-model ANOVA: assemble the long-format input table DF.
# Columns are subject, group, phase followed by one column per gene; each
# subject x condition contributes the rows of its '<subject>_<group>' frame.
import pingouin as pg

anova_frames = []
for subject in sorted(set(sample_info.subject)):
    is_subject = (sample_info['subject'] == subject).to_numpy()
    for group, members, phase_map in (("BS", BS, BS_phase), ("SR", SR, SR_phase)):
        # Independent copy of this subject's sample rows for the condition.
        df = sample_info[is_subject & sample_info.index.isin(members)].copy()
        df['day'] = [int(d) for d in df['day']]
        df['timepoint'] = [int(t) for t in df['timepoint']]
        df = df.sort_values(by=['day', 'timepoint'])
        df['phase'] = [phase_map[key] for key in zip(df['day'], df['timepoint'])]
        df = df[['subject', 'group', 'phase']].copy()
        df['group'] = group  # replace the raw group label with the short BS / SR code

        # Attach the expression values (transposed so that rows are phases).
        # No extra row filter is needed: df already holds only this subject and group.
        merged = pd.merge(df, globals()[subject + '_' + group].T,
                          how='outer', left_on='phase', right_index=True)
        # The subject id is made condition-specific ('<subject>_BS' / '<subject>_SR').
        merged['subject'] = subject + '_' + group
        anova_frames.append(merged)

# One concat at the end instead of re-copying the growing table on every iteration.
DF = pd.concat(anova_frames) if anova_frames else pd.DataFrame()

In [17]:
# DEG list: reuse the cached CSV when it exists, otherwise run the mixed ANOVA
# on every gene column of DF and keep the genes passing the 0.05 cut-off.
DEG_anova = []
deg_cache_path = '../data/DEGs_621.csv'
if not os.path.isfile(deg_cache_path):
    for gene in DF.columns[3:]:  # columns after the first three (subject, group, phase)
        gene_frame = DF[['subject','group','phase',gene]]
        results = pg.mixed_anova(dv=gene, between='group', within='phase', subject='subject', data=gene_frame)
        reject, corrected_pval = pg.multicomp(results['p-unc'], method='fdr_bh')
        # Entry 2 is the third row of the ANOVA table -- presumably the
        # group x phase interaction; confirm against the pingouin output.
        if corrected_pval[2] < 0.05:
            DEG_anova.append(gene)
    print(len(DEG_anova)) #621
else:
    with open(deg_cache_path,'r') as fr :
        DEG_anova.extend(csv.reader(fr))
    # The first CSV row holds the whole gene list.
    DEG_anova = DEG_anova[0]
    print(len(DEG_anova))

621


### Stationarity test

In [16]:
def fillna(data):
    """Put `data` on the fixed column grid 0, 4, ..., 20 and fill every gap.

    Parameters
    ----------
    data : pandas.DataFrame or any input accepted by the DataFrame constructor
        One series per row. When a DataFrame is passed, the columns are
        selected by label, so grid columns it lacks start out as NaN.

    Returns
    -------
    pandas.DataFrame
        Same rows as `data`, columns exactly [0, 4, 8, 12, 16, 20]. Gaps are
        filled along each row: linear interpolation first, then the previous
        value, then the next value. A row with no values at all stays NaN.
    """
    df = pd.DataFrame(data, columns=list(range(0,24,4)))
    # Work on the transpose so that filling runs down the time axis of each series.
    df = df.T.interpolate()
    # ffill()/bfill() replace fillna(method=...), which current pandas deprecates.
    df = df.ffill()  # fill with the previous time value
    df = df.bfill()  # fill with the next time value
    return df.T

In [19]:
# Stationarity test (Augmented Dickey-Fuller test)
# Reuses the cached CSV when present; otherwise tests every row of each
# subject's baseline frame and collects the labels with p-value >= 0.05.
from statsmodels.tsa.stattools import adfuller

if os.path.isfile('../data/nonstationary.csv'):
    nonstationary_list = []
    with open('../data/nonstationary.csv', 'r') as fr :
        reader = csv.reader(fr)
        for line in reader:
            nonstationary_list.append(line)
    nonstationary_list = nonstationary_list[0]  # the first CSV row holds the whole list
    print(len(nonstationary_list))
else :
    nonstationary = dict()  # subject -> labels that look non-stationary for that subject
    for subject in set(sample_info['subject']) :
        df = fillna(vars()[subject+'_BS'])
        # NOTE(review): fillna() always returns the six grid columns, so this
        # check cannot fail; if the intent is "subject has all time-points",
        # it would have to inspect the frame before filling -- confirm.
        if len(df.columns) == 6 :  # samples should have all time-points
            # Named so that it does not overwrite the seaborn alias `sns`.
            nonstationary_genes = []
            try :
                for i in df.index:
                    if adfuller(df.loc[i])[1] >= 0.05 :  # element 1 is the p-value
                        nonstationary_genes.append(i)
                nonstationary[subject] = nonstationary_genes
            # Best effort: a subject whose test fails is reported and left out.
            # `Exception` keeps KeyboardInterrupt / SystemExit from being swallowed.
            except Exception : print("X",subject, i)

    # set().union(...) also works when no subject produced a result.
    # NOTE(review): this branch yields a set while the cached branch yields a list.
    nonstationary_list = set().union(*nonstationary.values())
    print(len(nonstationary_list)) #18660

18660


### Oscillation test

In [None]:
# Cosinor analysis: fit every row of each subject's BS and SR frame and keep the
# best model per row in cosinor_dict[subject][group].
# NOTE(review): this header originally read "24 periodicity", but fit_group below
# is called with period=12 -- confirm which period is intended.
# NOTE(review): `cosinor1` is imported but not used in the code visible here.
from CosinorPy import cosinor, cosinor1

cosinor_dict = {}
for subject in set(sample_info['subject']) :
    cosinor_dict[subject] = {}

    for group in ["BS","SR"]:
        # Starts as an empty frame and stays empty when the subject has no columns for this group.
        cosinor_dict[subject][group] = pd.DataFrame()
        if len(vars()[subject+"_"+group].columns) == 0 : pass
        else :
            df = fillna(vars()[subject+"_"+group])
            print(subject, group)
            for i in vars()[subject+"_"+group].index :
                # x is the grid 0, 4, ..., 20 (linspace end point dropped); 'test' is a
                # placeholder label that is overwritten with the row label `i` below.
                cosinor_df = pd.DataFrame({'x':np.linspace(0,24,7)[:-1], 'y':df.loc[i].values, 'test':['test1']*6})
                results = cosinor.fit_group(cosinor_df, period=12, plot=False) # period=12
                df_best_models = cosinor.get_best_models(cosinor_df, results)
                df_best_models['test'] = i
                # NOTE(review): concatenating once per row re-copies the growing frame each
                # time; collecting the pieces in a list and concatenating once would be cheaper.
                cosinor_dict[subject][group] = pd.concat([cosinor_dict[subject][group], df_best_models])