In [1]:
import pandas as pd
import numpy as np
import os
from Bio import Entrez,SeqIO
Entrez.email = "huangyan8@genomics.cn"  # 设置你的邮箱地址
from ete3 import NCBITaxa, PhyloTree
ncbi = NCBITaxa()
from collections import Counter
# 提供一个物种Taxonomy ID的列表
table_path='C:\\Users\\huangyan8\\Desktop\\work\\2024-09-03 TE_Evolution_Draft\\Tables\\'
fig_path='C:\\Users\\huangyan8\\Desktop\\work\\2024-09-03 TE_Evolution_Draft\\Figures\\'
samples=pd.read_excel("D:\\18.TE_Evolution\\00.Sample_Information\\Plant_558_Genome_Infor_this_project_2024-08-16.xlsx").fillna("")
#samples.to_excel("./Tables/Table S1/Genome_Information_Plant294.xlsx",index=False)
samples.to_excel(table_path+'Table S1\\Table S1.TE_Evo_558_Species.xlsx',index=False)
species_list =samples['Taxonomy_ID'].unique() # 人类、小鼠、大鼠的示例
samples['CODE_ID']=samples['Specie_ID']
taxid_dic={}
for i in range(samples.shape[0]):
    taxid_dic[samples.loc[i,'CODE_ID']]=samples.loc[i,'Taxonomy_ID']
samples.head(1)

Unnamed: 0,Name,Specie_ID,Taxonomy_ID,Assembly_Accession,Assembly,Genome_Source,CODE_ID
0,Acer negundo,PBXX,4023,GCA_025594385.1,ASM2559438v1,NCBI,PBXX


### Prepare Plant Groups

In [4]:
def get_specie_type1(taxid):
    lineage = ncbi.get_lineage(taxid)
    #藻类 - Algae
    #苔藓类 - Mosses
    #蕨类 - Ferns
    #裸子植物 - Gymnosperms
    #被子植物 - Angiosperms
    #for tax in [4410,4418,4405,13714]:
    #    if tax in lineage:
    #        return 'Other Eudicotyledons'
    if 4447 in lineage:
        return 'Angiosperms' #Monocots 
    elif 3398 in lineage:
        return 'Angiosperms' #Dicots
    elif 71240 in lineage:
        return 'Dicots' #'Angiosperms'
    elif 1437183 in lineage:
        return 'Dicots' #'Angiosperms'
    for tax in [131209,2763,1035538,3041,304574] :
        if tax in lineage:
            return 'Algae'# 藻类
    for tax in [3195,3208]:
        if tax in lineage:
            return 'Mosses' #苔藓
    for tax in [3244,3249,241806]:
        if tax in lineage:
            return 'Ferns' #蕨类
    if 58019 in lineage:
        return 'Gymnosperms' #裸子植物
samples['Group']=samples['Taxonomy_ID'].apply(get_specie_type1)
samples['Group'].value_counts()

Group
Angiosperms    517
Algae           25
Mosses           8
Ferns            5
Gymnosperms      3
Name: count, dtype: int64

### Prepare Features

In [24]:
save_path='D:\\18.TE_Evolution\\12.Analysis\\2024-08-16 TE Evolution\\Part3.Feature_Selection\\'
anno_path='D:\\18.TE_Evolution\\03.Protein_Annotation\\'
kegg_path=anno_path+"KEGG.ko\\"
go_path=anno_path+"GO\\"
PlantTF_path=anno_path+'PlantTF_Family\\'
FLOR_path=anno_path+'FLOR_Family\\'
terms_dic={}
gene_set={}

In [25]:
# FLOR
for i in range(samples.shape[0]):
    specie_id=samples.loc[i,'CODE_ID']
    df=pd.read_csv(FLOR_path+specie_id+"_FLOR.csv",sep='\t').dropna()
    df['Terms']=df['FLOR_Family'].apply(lambda x:x.split(":")[0])
    if specie_id not in terms_dic:
        terms_dic[specie_id]={}
        gene_set[specie_id]={}
    for j in range(df.shape[0]):
        term=df.loc[j,'Terms']
        if term in terms_dic[specie_id]:
            terms_dic[specie_id][term]+=1
            #gene_set[specie_id][term].append(df.loc[j,'Gene'])
        else:
            terms_dic[specie_id][term]=1
            #gene_set[specie_id][term]=[df.loc[j,'Gene']]

In [26]:
#PlantTF
for i in range(samples.shape[0]):
    specie_id=samples.loc[i,'CODE_ID']
    df=pd.read_csv(PlantTF_path+specie_id+"_PlantTF.csv",sep='\t').dropna()
    df['Terms']=df['TF_Family'].apply(lambda x:x.split(":")[0])
    if specie_id not in terms_dic:
        terms_dic[specie_id]={}
        gene_set[specie_id]={}
    for j in range(df.shape[0]):
        term=df.loc[j,'Terms']
        if term in terms_dic[specie_id]:
            terms_dic[specie_id][term]+=1
            #gene_set[specie_id][term].append(df.loc[j,'Gene'])
        else:
            terms_dic[specie_id][term]=1
            #gene_set[specie_id][term]=[df.loc[j,'Gene']]

In [27]:
for i in range(samples.shape[0]):
    specie_id=samples.loc[i,'CODE_ID']
    df=pd.read_csv(kegg_path+specie_id+".ko.csv").dropna().reset_index(drop=True)
    #df=pd.read_csv(go_path+specie_id+".wego.csv").dropna()
    if specie_id not in terms_dic:
        terms_dic[specie_id]={}
    for j in range(df.shape[0]):
        terms=df.loc[j,'KEGG_Terms']
        if '\t' in terms:
            terms=terms.split("\t")
        elif ' 'in terms:
            terms=terms.split(" ")
        else:
            terms=[terms]
        for term in terms:
            if term in terms_dic[specie_id]:
                terms_dic[specie_id][term]+=1
                #gene_set[specie_id][term].append(df.loc[j,'Gene'])
            else:
                terms_dic[specie_id][term]=1
                #gene_set[specie_id][term]=[df.loc[j,'Gene']]

In [28]:
for i in range(samples.shape[0]):
    if i%50==0:
        print(i)
    specie_id=samples.loc[i,'CODE_ID']
    df=pd.read_csv(go_path+specie_id+".wego.csv").dropna().reset_index(drop=True)
    if specie_id not in terms_dic:
        terms_dic[specie_id]={}
    for j in range(df.shape[0]):
        terms=df.loc[j,'Go_Terms']
        if '\t' in terms:
            terms=terms.split("\t")
        elif ' 'in terms:
            terms=terms.split(" ")
        else:
            terms=[terms]
        for term in terms:
            if term in terms_dic[specie_id]:
                terms_dic[specie_id][term]+=1
            else:
                terms_dic[specie_id][term]=1
            #if term in gene_set[specie_id]:
            #    gene_set[specie_id][term].append(df.loc[j,'Gene'])
            #else:
            #    gene_set[specie_id][term]=[df.loc[j,'Gene']]

0
50
100
150
200
250
300
350
400
450
500
550


In [29]:
#ncRNA
ncRNA_path='D:\\18.TE_Evolution\\06.Bed\\ncRNA_Bed\\'
for i in range(samples.shape[0]):
    if i%50==0:
        print(i)
    specie_id=samples.loc[i,'CODE_ID']
    if specie_id+"_ncRNA.bed" in os.listdir(ncRNA_path):
        df=pd.read_csv(ncRNA_path+specie_id+"_ncRNA.bed",sep='\t').dropna().reset_index(drop=True)
        if specie_id not in terms_dic:
            terms_dic[specie_id]={}
        for f,f_df in df.groupby('ncRNA_Family'):
            terms_dic[specie_id][f]=f_df.shape[0]

0
50
100
150
200
250
300
350
400
450
500
550


In [30]:
Feature=pd.DataFrame.from_dict(terms_dic,orient='index').fillna(0)#.reset_index()
Feature.to_csv("D:\\18.TE_Evolution\\12.Analysis\\2024-08-16 TE Evolution\\Part3.Feature_Selection\\Plant558_Features.csv")
print(Feature.shape)
Feature.head()

(558, 72537)


Unnamed: 0,FLOR_0,FLOR_1,FLOR_2,FLOR_3,FLOR_5,FLOR_7,FLOR_39,FLOR_10,FLOR_11,FLOR_12,...,K48044,K48045,K48046,K48047,K48048,K48049,K48050,K48051,RF02020,RF02869
PBXX,99,40,1141,157.0,183.0,160.0,67.0,9.0,122,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
QIBM,118,42,1214,147.0,208.0,212.0,78.0,9.0,135,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BUKK,209,52,1769,177.0,308.0,147.0,114.0,15.0,197,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GJNO,110,27,988,86.0,157.0,92.0,58.0,8.0,119,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
WWFJ,199,63,1673,160.0,382.0,158.0,133.0,20.0,222,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
Feature=pd.read_csv("D:\\18.TE_Evolution\\12.Analysis\\2024-08-16 TE Evolution\\Part3.Feature_Selection\\Plant558_Features.csv")
Feature=Feature.set_index("Unnamed: 0")
print(Feature.shape)
Feature.head()

(558, 72538)


Unnamed: 0.1,Unnamed: 0,FLOR_0,FLOR_1,FLOR_2,FLOR_3,FLOR_5,FLOR_7,FLOR_39,FLOR_10,FLOR_11,...,K48044,K48045,K48046,K48047,K48048,K48049,K48050,K48051,RF02020,RF02869
0,PBXX,99,40,1141,157.0,183.0,160.0,67.0,9.0,122,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,QIBM,118,42,1214,147.0,208.0,212.0,78.0,9.0,135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BUKK,209,52,1769,177.0,308.0,147.0,114.0,15.0,197,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,GJNO,110,27,988,86.0,157.0,92.0,58.0,8.0,119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,WWFJ,199,63,1673,160.0,382.0,158.0,133.0,20.0,222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
sig_terms=[]
title_dic={}
for f_type in ['FLOR','GO',"KEGG","PlantTF",'ncRNA']:#
    df=pd.read_excel("D:\\18.TE_Evolution\\12.Analysis\\2024-08-16 TE Evolution\\Part1.Family_Enrichment\\"+f_type+"_Dom_and_TE_DupGenes_Enrichment.xlsx")
    #df['Term']=df['index']
    df['LogP']=df['Adjusted P-value'].apply(lambda x:-np.log10(x))
    df=df[df['LogP']>=2].reset_index(drop=True)
    print(f_type,df.shape)
    for i in range(df.shape[0]):
        title_dic[df.loc[i,'Term']]=df.loc[i,'Title']
    sig_terms+=df['Term'].tolist()
len(sig_terms)

FLOR (21, 10)
GO (1110, 12)
KEGG (1565, 10)
PlantTF (73, 10)
ncRNA (36, 10)


2805

In [35]:
samples['Group'].value_counts().index

Index(['Angiosperms', 'Algae', 'Mosses', 'Ferns', 'Gymnosperms'], dtype='object', name='Group')

In [33]:
Feature.head(1)

Unnamed: 0,FLOR_0,FLOR_1,FLOR_2,FLOR_3,FLOR_5,FLOR_7,FLOR_39,FLOR_10,FLOR_11,FLOR_12,...,K48044,K48045,K48046,K48047,K48048,K48049,K48050,K48051,RF02020,RF02869
PBXX,99,40,1141,157.0,183.0,160.0,67.0,9.0,122,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Get Groups

In [None]:
def get_specie_type1(taxid):
    #'Algae', 'Mosses', 'Ferns', 'Gymnosperms','Angiosperms'
    lineage = ncbi.get_lineage(taxid)
    #藻类 - Algae
    #苔藓类 - Mosses
    #蕨类 - Ferns
    #裸子植物 - Gymnosperms
    #被子植物 - Angiosperms
    #for tax in [4410,4418,4405,13714]:
    #    if tax in lineage:
    #        return 'Other Eudicotyledons'
    if 4447 in lineage:
        return 'Angiosperms' #Monocots 
    elif 3398 in lineage:
        return 'Angiosperms' #Dicots
    elif 71240 in lineage:
        return 'Dicots' #'Angiosperms'
    elif 1437183 in lineage:
        return 'Dicots' #'Angiosperms'
    for tax in [131209,2763,1035538,3041,304574] :
        if tax in lineage:
            return 'Algae'# 藻类
    for tax in [3195,3208]:
        if tax in lineage:
            return 'Mosses' #苔藓
    for tax in [3244,3249,241806]:
        if tax in lineage:
            return 'Ferns' #蕨类
    if 58019 in lineage:
        return 'Gymnosperms' #裸子植物
samples['Group']=samples['Taxonomy_ID'].apply(get_specie_type1)
samples['Group'].value_counts()

In [119]:
samples=samples[samples['Group']=='Angiosperms']

In [121]:
def get_specie_type2(taxid):
    # 'Dicots','Monocots'
    lineage = ncbi.get_lineage(taxid)
    #藻类 - Algae
    #苔藓类 - Mosses
    #蕨类 - Ferns
    #裸子植物 - Gymnosperms
    #被子植物 - Angiosperms
    #for tax in [4410,4418,4405,13714]:
    #    if tax in lineage:
    #        return 'Other Eudicotyledons'
    if 4447 in lineage:
        return 'Monocots'
    elif 3398 in lineage:
        return 'Dicots' #
    else:
        return 'Other'
samples['Group']=samples['Taxonomy_ID'].apply(get_specie_type2)
samples['Group'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples['Group']=samples['Taxonomy_ID'].apply(get_specie_type2)


Group
Dicots      387
Monocots    130
Name: count, dtype: int64

In [206]:
samples=pd.read_excel("D:\\18.TE_Evolution\\00.Sample_Information\\Plant_558_Genome_Infor_this_project_2024-08-16.xlsx").fillna("")
#samples.to_excel("./Tables/Table S1/Genome_Information_Plant294.xlsx",index=False)
samples.to_excel(table_path+'Table S1\\Table S1.TE_Evo_558_Species.xlsx',index=False)
species_list =samples['Taxonomy_ID'].unique() # 人类、小鼠、大鼠的示例
samples['CODE_ID']=samples['Specie_ID']
samples['Group']=samples['Taxonomy_ID'].apply(get_specie_type2)
samples=samples[samples['Group']=='Monocots']
samples.shape

(130, 8)

In [134]:
tax_path='D:\\18.TE_Evolution\\00.Sample_Information\\Genomes_Info\\taxdmp\\'
tax=pd.read_csv(tax_path+"nodes.dmp",sep='\t').fillna("")
use_col=[]

for col in tax.columns:
    #print(col,tax.loc[0,col])
    if tax.loc[0,col]!="|":
        use_col.append(col)
tax=tax[use_col]
tax.columns=["tax_id",'parent tax_id','rank','embl code','division id','inherited div flag','genetic code id','inherited GC flag',
            'mitochondrial genetic code id','inherited MGC flag','GenBank hidden flag','hidden subtree root flag','comments']
names=pd.read_csv(tax_path+"names.dmp",sep='\t').fillna("")
use_col=[]
for col in names.columns:
    if names.loc[0,col]!="|":
        use_col.append(col)
names=names[use_col]
names.columns=['tax_id','name_txt','unique_name','name class']
print(names.shape)
taxid_dic={}
for taxon_id in names["tax_id"].unique():
    taxid_dic[taxon_id]={}
    for c in ['scientific name', 'blast name', 'genbank common name', 'in-part',
       'authority', 'type material', 'equivalent name', 'includes',
       'synonym', 'common name', 'acronym', 'genbank acronym']:
        taxid_dic[taxon_id][c]=""
for i in range(names.shape[0]):
    taxon_id=names.loc[i,"tax_id"]
    taxid_dic[taxon_id][names.loc[i,'name class']]=names.loc[i,"name_txt"]
Plant=tax[tax['division id']==4].reset_index(drop=True)
Rank_dic={}
for i in range(Plant.shape[0]):
    Rank_dic[Plant.loc[i,"tax_id"]]=Plant.loc[i,'rank']

(3962317, 4)


In [140]:
def get_tree(tax_id):
    find=True
    Parents=[]
    while find==True:
        df0=Plant[Plant['tax_id']==tax_id]
        if df0.shape[0]>0:
            tax_id=df0['parent tax_id'].tolist()[0]
            if tax_id in Rank_dic:
                Parents.append(str(tax_id)+":"+Rank_dic[tax_id]+":"+taxid_dic[tax_id]['scientific name'])
        else:
            find=False
    return " | ".join(Parents[::-1])


In [208]:
def get_specie_type3(taxid):
    # 'Dicots','Monocots'
    lineage = get_tree(taxid)#ncbi.get_lineage(taxid)
    def get_family(x):
        if x =="":
            return ""
        values=x.split(" | ")
        Keywords=[]
        for value in values:
            kw=value.split(":")[1]
            if kw in ["family"]:
                Keywords.append(value)
        return " | ".join(Keywords)
    family=get_family(lineage)
    return family
samples['Family']=samples['Taxonomy_ID'].apply(get_specie_type3)
samples['Family'].value_counts()

Family
4479:family:Poaceae          100
4747:family:Orchidaceae        5
4609:family:Cyperaceae         4
4710:family:Arecaceae          3
4671:family:Dioscoreaceae      3
4637:family:Musaceae           3
42228:family:Acoraceae         2
4613:family:Bromeliaceae       2
4454:family:Araceae            2
27254:family:Zosteraceae       2
4626:family:Cannaceae          1
14101:family:Juncaceae         1
4642:family:Zingiberaceae      1
40552:family:Asparagaceae      1
Name: count, dtype: int64

In [209]:
samples['Group']=samples['Family'].apply(lambda x:x.split(":")[-1].split(" ")[0])
samples['Group'].value_counts()

Group
Poaceae          100
Orchidaceae        5
Cyperaceae         4
Arecaceae          3
Dioscoreaceae      3
Musaceae           3
Acoraceae          2
Bromeliaceae       2
Araceae            2
Zosteraceae        2
Cannaceae          1
Juncaceae          1
Zingiberaceae      1
Asparagaceae       1
Name: count, dtype: int64

In [192]:
df2=pd.DataFrame(samples['Group'].value_counts())
df2.head(10).index

Index(['Fabaceae', 'Brassicaceae', 'Solanaceae', 'Rosaceae', 'Asteraceae',
       'Salicaceae', 'Cucurbitaceae', 'Malvaceae', 'Euphorbiaceae',
       'Sapindaceae'],
      dtype='object', name='Group')

In [151]:
samples=samples[samples['Group']=='Poaceae']

In [159]:
def get_specie_type4(taxid):
    # 'Dicots','Monocots'
    lineage = get_tree(taxid)#ncbi.get_lineage(taxid)
    def get_subfamily(x):
        if x =="":
            return ""
        values=x.split(" | ")
        Keywords=[]
        for value in values:
            kw=value.split(":")[1]
            if kw in ["subfamily"]:
                Keywords.append(value)
        return " | ".join(Keywords)
    family=get_subfamily(lineage)
    return family
samples['Sub_Family']=samples['Taxonomy_ID'].apply(get_specie_type4)
samples['Sub_Family'].value_counts()
samples['Group']=samples['Sub_Family'].apply(lambda x:x.split(":")[2].split(" ")[0])
samples['Group'].value_counts()

Group
Oryzoideae       42
Panicoideae      40
Pooideae          9
Chloridoideae     7
Bambusoideae      1
Arundinoideae     1
Name: count, dtype: int64

In [174]:
samples=pd.read_excel("D:\\18.TE_Evolution\\00.Sample_Information\\Plant_558_Genome_Infor_this_project_2024-08-16.xlsx").fillna("")
#samples.to_excel("./Tables/Table S1/Genome_Information_Plant294.xlsx",index=False)
samples.to_excel(table_path+'Table S1\\Table S1.TE_Evo_558_Species.xlsx',index=False)
species_list =samples['Taxonomy_ID'].unique() # 人类、小鼠、大鼠的示例
samples['CODE_ID']=samples['Specie_ID']
samples['Group']=samples['Name'].apply(lambda x: 'Oryza' if 'Oryza' in x else 'Other')
samples['Group'].value_counts()

Group
Other    517
Oryza     41
Name: count, dtype: int64

In [176]:
samples=samples[samples['Group']=='Oryza']
samples[samples['Taxonomy_ID']==4530].head()

Unnamed: 0,Name,Specie_ID,Taxonomy_ID,Assembly_Accession,Assembly,Genome_Source,CODE_ID,Group
347,Oryza sativa 02428,ILWU,4530,Oryza sativa 02428,02428,Rice Resource Center,ILWU,Oryza
348,Oryza sativa 9311,OGXT,4530,Oryza sativa 9311,9311,Rice Resource Center,OGXT,Oryza
349,Oryza sativa Basmati1,MPZB,4530,Oryza sativa Basmati1,Basmati1,Rice Resource Center,MPZB,Oryza
351,Oryza sativa CN1,LMQO,4530,Oryza sativa CN1,CN1,Rice Resource Center,LMQO,Oryza
352,Oryza sativa D62,BZJI,4530,Oryza sativa D62,D62,Rice Resource Center,BZJI,Oryza


In [177]:
samples['Group']=samples['Taxonomy_ID'].apply(lambda x: 'Oryza sativa' if x==4530 else 'Other Oryza')
samples['Group'].value_counts()

Group
Oryza sativa    30
Other Oryza     11
Name: count, dtype: int64

### Group Feature Selection

In [146]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import pandas as pd
import scipy.stats as stats
scaler = StandardScaler()

In [205]:
families=['Brassicaceae','Fabaceae', 'Solanaceae', 'Rosaceae', 'Asteraceae',
       'Salicaceae', 'Cucurbitaceae']
for group in families:
    other_groups=[]
    for group2 in families:
        if group2!=group:
            other_groups.append(group2)
    R=[]
    for group2 in other_groups:
        two_groups=[group,group2]
        group_idxs=samples[samples['Group']==group]['CODE_ID'].tolist()
        other_idxs=samples[samples['Group']==group2]['CODE_ID'].tolist()
        data=Feature.loc[group_idxs+other_idxs,:]
        X=data[sig_terms].values 
        #X=scaler.fit_transform(X)
        y=[]
        for idx in group_idxs:
            y.append(1)
        for idx in other_idxs:
            y.append(0)
        y=pd.Series(y)
        clf = RandomForestClassifier(n_estimators=1000)
        clf.fit(X, y)
        importances = clf.feature_importances_
        feature_importance_df = pd.DataFrame({'Feature': sig_terms, "Importance": importances})
        feature_importance_df['Group_Pair']=group+":"+group2
        R.append(feature_importance_df)
    R=pd.concat(R)
    R=R[R['Importance']>0]
    c=Counter(R['Feature'].tolist())
    R['Sig_Count']=R['Feature'].apply(lambda x:c[x])
    
    df2=pd.DataFrame(R.groupby("Feature")['Importance'].sum()).reset_index()
    df2=df2.sort_values('Importance',ascending=False)
    df2['Title']=df2['Feature'].apply(lambda x:title_dic[x])
    df2=df2.reset_index(drop=True)
    df2[group+"_Score"]=0
    df2[group+"_SigScore"]=0
    for i in range(df2.shape[0]):
        feature=df2.loc[i,'Feature']
        group_values=Feature.loc[group_idxs,feature].values
        df2.loc[i,group]=round(Feature.loc[group_idxs,feature].values.mean(),1)
        score=0
        combine_score=0
        for other in other_groups:
            other_idxs=samples[samples['Group']==other]['CODE_ID'].tolist()
            other_values=Feature.loc[other_idxs,feature].values
            df2.loc[i,other]=round(Feature.loc[other_idxs,feature].values.mean(),1)
            t_stat, p_value = stats.ttest_ind(group_values,other_values, equal_var=False)
            if (t_stat>0)&(p_value<0.01):
                combine_score+=-np.log10(p_value)
                score+=1
                #df2.loc[i,other+"_Score"]=-np.log10(p_value)
        df2.loc[i,group+"_Score"]=score
        df2.loc[i,group+"_SigScore"]=combine_score
    df2=df2[df2[group+'_Score']==len(other_groups)]
    #df2=df2.sort_values(group+"_SigScore",ascending=False).reset_index(drop=True)
    df2=df2.sort_values('Importance',ascending=False).reset_index(drop=True)
    print(group, df2.shape)
    df2.to_excel("D:\\18.TE_Evolution\\12.Analysis\\2024-08-16 TE Evolution\\Part3.Feature_Selection\\"+group+"_Features.xlsx",index=False)

Brassicaceae (203, 12)
Fabaceae (32, 12)
Solanaceae (49, 12)
Rosaceae (17, 12)
Asteraceae (16, 12)
Salicaceae (25, 12)
Cucurbitaceae (2, 12)
