In [217]:
from itertools import product,combinations
import re
import pandas as pd
class GLM_prepare():
    """Build FreeSurfer group-analysis inputs from a pandas DataFrame.

    Workflow: call ``fit(...)`` to store the configuration, then
    ``Df2FSGD()`` to write the FSGD file and derive the regressor names,
    then ``BuildContrastMatrix()`` and/or ``BuildDesignMatrix(...)``.

    NOTE: the DataFrame given to ``fit`` is modified in place: ``Df2FSGD``
    adds a ``'Class'`` column and ``BuildDesignMatrix`` adds one
    ``'<variable>_Zscore'`` column per continuous variable.
    """

    def fit(self,Df,SubjectCol,Title='Haha',ClassColList=None,VariableColList=None,ResultPath='./FSGD_freesurfer.fsgd',Regressors='DODS'):
        '''
        Store the configuration; nothing is computed or written here.

        Df               DataFrame with one row per subject.
        SubjectCol       column holding the subject name (the file name in $SUBJECT_DIR).
        Title            text written on the FSGD ``Title`` line.
        ClassColList     list of categorical columns; every observed combination of
                         their values becomes one class.  None -> single class 'Main'.
        VariableColList  list of continuous covariate columns, or None.
        ResultPath       path of the FSGD file written by Df2FSGD().
        Regressors       'DODS' or 'DOSS',
                         see https://surfer.nmr.mgh.harvard.edu/fswiki/DodsDoss

        DODS (one covariate slope per class)
            example:
                Regressor1: ones for subjects in Group 1, 0 otherwise. Codes intercept/mean for Group 1
                Regressor2: ones for subjects in Group 2, 0 otherwise. Codes intercept/mean for Group 2
                Regressor3: age for subjects in Group 1, 0 otherwise. Codes age slope for Group 1
                Regressor4: age for subjects in Group 2, 0 otherwise. Codes age slope for Group 2
                Regressor5: weight for subjects in Group 1, 0 otherwise. Codes weight slope for Group 1
                Regressor6: weight for subjects in Group 2, 0 otherwise. Codes weight slope for Group 2
        DOSS (one covariate slope shared by all classes)
            example:
                Regressor1: ones for subjects in Group 1, 0 otherwise. Codes intercept/mean for Group 1
                Regressor2: ones for subjects in Group 2, 0 otherwise. Codes intercept/mean for Group 2
                Regressor3: age for each subject. Codes age slope for all subjects.
                Regressor4: weight for each subject. Codes weight slope for all subjects.
        '''
        self.Df = Df
        self.SubCol = SubjectCol
        self.ClassCol = ClassColList
        self.VariableCol = VariableColList
        self.ResultPath = ResultPath
        self.Regressors = Regressors
        self.VariableList = None
        self.Title = Title

    def Df2FSGD(self):
        """Write the FSGD file and derive class / regressor names.

        Sets ``ClassList`` (sorted class labels), ``ClassLevels`` (class label
        -> {factor: 'factor.level'}), ``VariableList``, ``ContinuousCovs``,
        ``RegressorList`` and ``FSGD`` (the text written to ``ResultPath``).

        Raises ValueError when covariates are given and ``Regressors`` is
        neither 'DODS' nor 'DOSS'.
        """
        has_variables = self.VariableCol is not None
        if has_variables and self.Regressors not in ('DODS', 'DOSS'):
            raise ValueError("Regressors must be 'DODS' or 'DOSS', got %r"
                             % (self.Regressors,))

        level_table = {}
        if self.ClassCol is not None:
            factors = list(self.ClassCol)

            def label_row(row):
                # 'factor.value' per factor, joined with '-', e.g. 'group.1-Gene.0'
                parts = ['.'.join([f, str(v)]) for f, v in zip(factors, row)]
                label = '-'.join(parts)
                level_table[label] = dict(zip(factors, parts))
                return label

            self.Df['Class'] = self.Df.loc[:, factors].apply(label_row, axis=1)
            # Sorted so the regressor order is deterministic and shared by the
            # FSGD file, the contrast matrix and the design matrix.
            self.ClassList = sorted(set(self.Df['Class']))
        else:
            self.Df['Class'] = 'Main'
            self.ClassList = ['Main']
        self.ClassLevels = level_table

        lines = ['GroupDescriptorFile 1', 'Title ' + self.Title]
        lines.extend('Class ' + c for c in self.ClassList)
        input_cols = [self.SubCol, 'Class']
        if has_variables:
            variables = list(self.VariableCol)
            lines.append(' '.join(['Variables'] + variables))
            self.VariableList = self.VariableCol
            input_cols = input_cols + variables
        # Iterate by position so a non-unique DataFrame index cannot corrupt rows.
        for record in self.Df.loc[:, input_cols].itertuples(index=False, name=None):
            lines.append(' '.join(['Input'] + [str(v) for v in record]))

        text = ''.join(line + '\n' for line in lines)
        with open(self.ResultPath, 'w') as result:
            result.write(text)
        self.FSGD = text

        if has_variables:
            variables = list(self.VariableList)
            if self.Regressors == 'DODS':
                # variable-major order: v1_c1, v1_c2, ..., v2_c1, ...
                self.ContinuousCovs = ['_'.join([v, c])
                                       for v, c in product(variables, self.ClassList)]
            else:
                self.ContinuousCovs = variables
            self.RegressorList = self.ClassList + self.ContinuousCovs
        else:
            self.ContinuousCovs = None
            self.RegressorList = list(self.ClassList)

    def findlevel(self,sign,classname):
        """Return 'sign.level' for factor ``sign`` found in ``classname``.

        Returns None when the factor does not occur in the class label or
        when the arguments are not strings.
        """
        try:
            match = re.search(re.escape(sign) + r'\.(\w+)', classname)
        except TypeError:
            # Best effort, as before: non-string input yields None.
            return None
        if match is None:
            return None
        return '.'.join([sign, match.group(1)])

    def _level_of(self, factor, class_name):
        """Return the 'factor.level' label of ``class_name`` for ``factor`` (or None)."""
        table = getattr(self, 'ClassLevels', None) or {}
        if class_name in table:
            return table[class_name].get(factor)
        # Fallback for class labels that were not produced by Df2FSGD().
        return self.findlevel(sign=factor, classname=class_name)

    def BuildContrastMatrix(self,SimpleVersion=False):
        """Build ``self.ContrastMatrix`` (rows: contrasts, columns: RegressorList).

        Rows, in order:
          * '<factor>_Effect'          mean over all classes (one per class
                                       factor, or 'Main_Effect' without factors)
          * '<class>_Effect'           single-class indicator (skipped when
                                       SimpleVersion is True)
          * '<X>-<Y>'                  mean of classes at level X minus mean of
                                       classes at level Y, for each level pair
          * '<X>-<Y>_<var>.SlopeChanging'  same difference on the slopes of
                                       every covariate (DODS only)
          * '<var>_Effect'             mean slope of the covariate
          * '<cov regressor>_Effect'   single covariate-regressor indicator

        Requires Df2FSGD() to have been called.  Returns the matrix.
        """
        columns = list(self.RegressorList)
        classes = list(self.ClassList)
        variables = list(self.VariableList) if self.VariableList is not None else []
        covs = list(self.ContinuousCovs or []) if variables else []
        dods = self.Regressors == 'DODS'
        factors = list(self.ClassCol) if self.ClassCol is not None else []

        names = []
        weights_by_name = {}

        def add(name, weights):
            # Later definitions overwrite earlier ones but keep their position.
            if name not in weights_by_name:
                names.append(name)
            weights_by_name[name] = [float(weights.get(col, 0.0)) for col in columns]

        # Overall mean across classes, reported once per factor name.
        overall = dict((c, 1.0 / len(classes)) for c in classes)
        for sign in (factors if self.ClassCol is not None else classes):
            add(sign + '_Effect', overall)

        if not SimpleVersion:
            for c in classes:
                add(c + '_Effect', {c: 1.0})

        for factor in factors:
            by_level = {}
            for c in classes:
                level = self._level_of(factor, c)
                if level is not None:
                    by_level.setdefault(level, []).append(c)
            for X, Y in combinations(sorted(by_level), 2):
                pos, neg = by_level[X], by_level[Y]
                # Each side is averaged over its own classes so the contrast
                # sums to zero even when the design is unbalanced.
                contrast = dict((c, 1.0 / len(pos)) for c in pos)
                contrast.update((c, -1.0 / len(neg)) for c in neg)
                add('-'.join([X, Y]), contrast)
                if dods:
                    for v in variables:
                        slope = dict(('_'.join([v, c]), 1.0 / len(pos)) for c in pos)
                        slope.update(('_'.join([v, c]), -1.0 / len(neg)) for c in neg)
                        add('-'.join([X, Y]) + '_' + v + '.SlopeChanging', slope)

        for v in variables:
            members = ['_'.join([v, c]) for c in classes] if dods else [v]
            add(v + '_Effect', dict((m, 1.0 / len(members)) for m in members))
        for cov in covs:
            add(cov + '_Effect', {cov: 1.0})

        self.ContrastMatrix = pd.DataFrame([weights_by_name[n] for n in names],
                                           index=names, columns=columns)
        return self.ContrastMatrix

    def BuildDesignMatrix(self,PatientList_WithOrder):
        """Build ``self.DesignMatrix`` (rows: subjects, columns: RegressorList).

        Class columns hold 1 for members and 0 otherwise.  Covariates are
        z-scored over all subjects (scipy.stats.zscore); with DODS each
        covariate is split into one column per class, with DOSS it is a
        single column.  Undefined z-scores (NaN) become 0.

        PatientList_WithOrder  subject names selecting and ordering the rows.

        Requires Df2FSGD() to have been called.  Raises ValueError on
        duplicated subject names and KeyError on unknown subjects.
        Returns the matrix.
        """
        frame = self.Df
        classes = list(self.ClassList)
        variables = list(self.VariableList) if self.VariableList is not None else []
        dods = self.Regressors == 'DODS'

        subjects = frame[self.SubCol]
        if subjects.duplicated().any():
            raise ValueError('duplicated subject names in column %r' % (self.SubCol,))

        membership = frame['Class']
        data = {}
        for c in classes:
            data[c] = (membership == c).astype(float).values

        if variables:
            from scipy import stats
            for v in variables:
                z = stats.zscore(frame[v].astype(float).values)
                frame[v + '_Zscore'] = z
                if dods:
                    for c in classes:
                        data['_'.join([v, c])] = z * data[c]
                else:
                    data[v] = z

        design = pd.DataFrame(data,
                              index=pd.Index(subjects.values, name=self.SubCol),
                              columns=list(self.RegressorList)).fillna(0)

        wanted = list(PatientList_WithOrder)
        missing = [p for p in wanted if p not in design.index]
        if missing:
            raise KeyError('subjects not found in %r: %r' % (self.SubCol, missing))
        self.DesignMatrix = design.loc[wanted, :]
        return self.DesignMatrix

In [2]:
# test: load the example spreadsheet into `test`, the DataFrame used by the cells below
import pandas as pd
test=pd.read_excel('./test.xlsx')

In [3]:
# Name the three columns of the loaded sheet: subject ID, covariate 'age', class factor 'group'.
test.columns=['studyID', u'age', u'group']

In [17]:
# Add a second class factor ('Gene') and a second covariate ('PP'); four values each,
# so this assumes the sheet has exactly four rows.
test.loc[:,'Gene']=[1,0,0,1]
test.loc[:,'PP']=[1.5,0.5,3,10]

In [218]:
# Instantiate the helper class defined in this notebook (it is named GLM_prepare;
# the previous name `FreesurferPre` is not defined anywhere in this file).
aa=GLM_prepare()

# Two class factors ('group', 'Gene') and two covariates ('age', 'PP'); default DODS layout.
aa.fit(ClassColList=['group','Gene'],Df=test,SubjectCol='studyID',VariableColList=['age','PP'])

# Write the FSGD file and derive the class / regressor names.
aa.Df2FSGD()

In [116]:
# Show the FSGD text stored by Df2FSGD() (Python 2 print statement).
print aa.FSGD

GroupDescriptorFile 1
Title Haha
Class group.1-Gene.0
Class group.1-Gene.1
Class group.3-Gene.0
Class group.3-Gene.1
Variables age PP
Input re_COH001a group.1-Gene.1 69 1.5
Input re_COH002a group.1-Gene.0 66 0.5
Input re_COH010a group.3-Gene.0 62 3.0
Input re_COH011a group.3-Gene.1 67 10.0



In [201]:
# Build the contrast matrix; the result is stored on aa.ContrastMatrix.
aa.BuildContrastMatrix(SimpleVersion=True)


In [219]:
# Build the design matrix with rows in the given subject order; the result is stored on aa.DesignMatrix.
b=aa.BuildDesignMatrix(PatientList_WithOrder=['re_COH001a','re_COH011a','re_COH010a','re_COH002a'])

In [221]:
# Display the design matrix built by the previous cell.
aa.DesignMatrix

Unnamed: 0_level_0,group.1-Gene.0,group.1-Gene.1,group.3-Gene.0,group.3-Gene.1,age_group.1-Gene.0,age_group.1-Gene.1,age_group.3-Gene.0,age_group.3-Gene.1,PP_group.1-Gene.0,PP_group.1-Gene.1,PP_group.3-Gene.0,PP_group.3-Gene.1
studyID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
re_COH001a,0.0,1.0,0.0,0.0,0.0,1.176697,0.0,0.0,0.0,-0.605406,0.0,0.0
re_COH011a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.392232,0.0,0.0,0.0,1.681682
re_COH010a,0.0,0.0,1.0,0.0,0.0,0.0,-1.568929,0.0,0.0,0.0,-0.201802,0.0
re_COH002a,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.874475,0.0,0.0,0.0


In [209]:
# Name of the subject-ID column that was passed to fit().
aa.SubCol

'studyID'

In [148]:
# Column (regressor) labels of the contrast matrix.
aa.ContrastMatrix.columns

Index([u'group.1-Gene.0', u'group.1-Gene.1', u'group.3-Gene.0',
       u'group.3-Gene.1', u'age_group.1-Gene.0', u'age_group.1-Gene.1',
       u'age_group.3-Gene.0', u'age_group.3-Gene.1', u'PP_group.1-Gene.0',
       u'PP_group.1-Gene.1', u'PP_group.3-Gene.0', u'PP_group.3-Gene.1'],
      dtype='object')

In [142]:
# Map each contrast-matrix column label to its position ...
temp={s:i for i,s in enumerate(list(aa.ContrastMatrix.columns))}
# ... and list those positions in RegressorList order.
Ordered=[temp[i] for i in aa.RegressorList]

# Show the contrast matrix with its columns rearranged into RegressorList order.
aa.ContrastMatrix.iloc[:,Ordered]

In [37]:
# Scratch setup: copy the attributes of `aa` into module-level names so the
# contrast-building loop can be run step by step outside the class.
ColumnList=aa.RegressorList
Groups=aa.ClassList
Regressors='DODS'
# Groups (subgroups) are the combined class labels, e.g. FemaleGroup1, FemaleGroup2, MaleGroup1, MaleGroup2

# Covariate regressor names and the plain covariate names they derive from.
ContinuousCovs=aa.ContinuousCovs
ContinuousCovSigns=aa.VariableList


# Names of the class factors (the columns the class labels were built from).
GroupSigns=aa.ClassCol


# Terminology used below, with gender as the example factor:
#   group sign   = the factor name (gender)
#   group levels = its values (Female, Male)

# Contrast name -> {regressor: weight}; filled by the loop cell.
Result={}

In [94]:
# Scratch version of the contrast-building loop, run on the module-level names
# set up in the previous cell.  Only the 'Group' pass is active here; the
# covariate pass is commented out in the loop header.
# NOTE(review): Python 2 only -- len(Grouped) and map(...)+[...] rely on
# filter()/map() returning lists.
Result={}
for Id,i,iS in [('Group',Groups,GroupSigns)]:#,('Cove',ContinuousCovs,ContinuousCovSigns)]:
    if i==None or iS==None:
        pass
    else:
        for s in iS:
            # Labels in `i` that contain the sign `s` as a substring.
            Grouped=filter(lambda x: s in x, i)
            print Grouped
        # '<sign>_Effect': weight 1/len(Grouped) on each matching column, 0 elsewhere
            Result[s+'_Effect']=dict(zip(ColumnList,
                                         [int(any([G==c for G in Grouped]))/float(len(Grouped)) for c in ColumnList ]))

        if Id=='Group':
            # pairwise contrasts between the levels of each factor
            for Sign in iS:
                # Helper redefined on every iteration: returns 'sign.level' parsed
                # from the class label, or None when anything goes wrong (bare except).
                def findlevel(sign,classname):
                    import re
                    try:
                        result=re.search(sign+'\.(\w+)',classname).group(1)
                        return '.'.join([sign,result])
                    except:
                        pass
                # Distinct levels of this factor; a set, so the pair order below is arbitrary.
                GroupLevels=set([findlevel(sign=Sign,classname=g) for g in i if Sign in g])
                for X,Y in combinations(GroupLevels,2):
                    # +1 for labels containing X, -1 for labels containing Y (substring tests)
                    list1=[int(X in c) for c in i]
                    list2=[-int(Y in c) for c in i]
                    # Both signs are scaled by the number of X labels; covariate columns get 0.
                    listUse1=map(lambda x:sum(x)/float(sum(list1)),zip(list1,list2))+[0]*len(ContinuousCovs)
                    Result['-'.join([X,Y])]=dict(zip(ColumnList,listUse1))
                    # same X-minus-Y difference applied to the covariate slope columns
                    if Regressors=='DODS':
                        for CoS in ContinuousCovSigns:
                            list3=[int(all([(X in c),(CoS in c)])) for c in ContinuousCovs]
                            list4=[-int(all([(Y in c),(CoS in c)])) for c in ContinuousCovs]
                            listUse2=[0]*len(i)+map(lambda x:sum(x)/float(sum(list3)),zip(list3,list4))
                        # NOTE(review): this line is outside the `for CoS` loop, so only the
                        # last covariate gets a '.SlopeChanging' entry.
                        Result['-'.join([X,Y])+'_'+CoS+'.SlopeChanging']=dict(zip(ColumnList,listUse2))
                    elif Regressors=='DOSS':
                        pass
            # per-label indicator contrasts
        for g in i:
            Result[g+'_Effect']=dict(zip(ColumnList,[int(c==g) for c in ColumnList]))

['group.1-Gene.0', 'group.1-Gene.1', 'group.3-Gene.0', 'group.3-Gene.1']
['group.1-Gene.0', 'group.1-Gene.1', 'group.3-Gene.0', 'group.3-Gene.1']


In [61]:
# Echo `i` (the recorded output is the list of class labels).
i

['group.1-Gene.0', 'group.1-Gene.1', 'group.3-Gene.0', 'group.3-Gene.1']

In [64]:
# Echo the set of factor levels collected by the loop cell.
GroupLevels

{None}

In [66]:
# Print every entry of `i`, one per line (Python 2 print statement).
for g in i:
    print g

group.1-Gene.0
group.1-Gene.1
group.3-Gene.0
group.3-Gene.1


In [67]:
# Echo the factor name left in `Sign`.
Sign

'Gene'

In [68]:
# Try the module-level findlevel helper on a single class label `g`.
findlevel(sign=Sign,classname=g)

'1'

In [65]:
# NOTE(review): this passes the whole list `i` as classname instead of the loop
# variable `g`; presumably that is why every entry of the recorded output is None.
[findlevel(sign=Sign,classname=i) for g in i if Sign in g]

[None, None, None, None]

In [83]:
# Weights of one '<sign>_Effect' row: 1/len(Grouped) on columns listed in Grouped, 0 elsewhere.
[int(any([G==c for G in Grouped]))/float(len(Grouped)) for c in ColumnList ]

[0.25, 0.25, 0.25, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [81]:
# Echo the sign left in `s`.
s

'Gene'

In [82]:
# Echo `i` again (list of class labels in the recorded output).
i

['group.1-Gene.0', 'group.1-Gene.1', 'group.3-Gene.0', 'group.3-Gene.1']

In [91]:
# Echo `i` once more (list of class labels in the recorded output).
i

['group.1-Gene.0', 'group.1-Gene.1', 'group.3-Gene.0', 'group.3-Gene.1']

In [90]:
# 1 for every label in `i` that contains X as a substring, else 0.
# NOTE(review): because this is a substring test, a short X can match labels it
# was not meant to -- compare the recorded output with the label list.
[int(X in c) for c in i]

[1, 1, 0, 1]

In [95]:
# Tabulate the contrasts: one column per contrast name, one row per regressor.
pd.DataFrame(Result)

Unnamed: 0,Gene.1-Gene.0,Gene.1-Gene.0_PP.SlopeChanging,Gene_Effect,group.1-Gene.0_Effect,group.1-Gene.1_Effect,group.1-group.3,group.1-group.3_PP.SlopeChanging,group.3-Gene.0_Effect,group.3-Gene.1_Effect,group_Effect
group.1-Gene.0,-0.5,0.0,0.25,1,0,0.5,0.0,0,0,0.25
group.1-Gene.0_PP,0.0,-0.5,0.0,0,0,0.0,0.5,0,0,0.0
group.1-Gene.0_age,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0.0
group.1-Gene.1,0.5,0.0,0.25,0,1,0.5,0.0,0,0,0.25
group.1-Gene.1_PP,0.0,0.5,0.0,0,0,0.0,0.5,0,0,0.0
group.1-Gene.1_age,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0.0
group.3-Gene.0,-0.5,0.0,0.25,0,0,-0.5,0.0,1,0,0.25
group.3-Gene.0_PP,0.0,-0.5,0.0,0,0,0.0,-0.5,0,0,0.0
group.3-Gene.0_age,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0.0
group.3-Gene.1,0.5,0.0,0.25,0,0,-0.5,0.0,0,1,0.25
