### 使用顺序
### 1.tofloat()；2.log2fc_pretreatment()；3.log2foldchange()

### 函数log2foldchange()
### 计算两两组间log2FoldChange

In [4]:
def log2foldchange(df):
    """
    Append the log2 fold change of every pair of columns to ``df``.

    For each pair of original columns ``(a, b)`` where ``a`` is positioned
    before ``b``, a new column labelled ``"b/a"`` holding
    ``log2(df[b] / df[a])`` is added. Result columns are appended in the order
    of the outer column ``a``, then the inner column ``b``.

    :param df: table to compute on, dtype: DataFrame. Apart from the row and
        column index, every element must be a non-zero number of dtype
        "float" (zeros lead to infinite or NaN results).
    :return: the same ``df`` object that was passed in, with the result
        columns added in place.

    Requires ``import numpy as np`` in the enclosing module/notebook.
    """
    # Snapshot the labels first: df gains columns while we iterate, and only
    # the original columns may take part in the pairwise comparison.
    labels = list(df.columns)
    for position, denominator in enumerate(labels):
        for numerator in labels[position + 1:]:
            # The f-string also accepts non-string labels (e.g. ints), which
            # plain ``label + "/" + label`` concatenation would reject.
            df[f"{numerator}/{denominator}"] = np.log2(df[numerator] / df[denominator])
    return df


### 函数 tofloat()
### 把df中数值的数据类型转换为 "float"；结果返回一个df

In [5]:
def tofloat(df,col="GroupByTimeDiet"):
    """
    Cast every column of ``df`` except the grouping column to "float".

    Only one column (``col``) is allowed to hold non-numeric text.

    :param df: table whose values should be converted. Apart from the row and
        column index and the single grouping column, every element must be
        numeric.
    :param col: label of the grouping column, dtype: str
    :return: the same ``df`` object, with the columns converted in place
    """
    # Collect the labels up front, then convert each column; a value that
    # cannot be converted raises (errors='raise').
    numeric_labels = [label for label in df.columns if label != col]
    for label in numeric_labels:
        converted = df[label].astype(float, errors='raise')
        df[label] = converted
    return df

### 函数 log2fc_pretreatment()
### 用于log2FoldChange计算的前处理 包括：1.删除包含字符串的列；2.每个数值元素加0.0001；3.分组求均值；4.返回新的df
### 结果返回一个df，可直接用于计算log2FoldChange

In [6]:
def log2fc_pretreatment(df,col,pseudocount=0.0001):
    """
    Prepare an abundance table for the log2FoldChange calculation.

    Steps: 1. set the grouping column aside; 2. add ``pseudocount`` to every
    numeric element (so that zeros do not break the later division / log2);
    3. compute the mean of every entry within each group; 4. return the
    result as a new DataFrame. The input ``df`` is not modified.

    :param df: input table, dtype: DataFrame. All columns other than ``col``
        must already be numeric.
    :param col: label of the grouping column, dtype: str
    :param pseudocount: constant added to every numeric element before
        averaging; defaults to 0.0001.
    :return: new DataFrame with one row per original numeric column and one
        column per group, holding the group means.

    Requires ``import pandas as pd`` in the enclosing module/notebook.
    """
    # Vectorised addition replaces the element-wise DataFrame.applymap call,
    # which is deprecated in recent pandas releases.
    shifted = df.drop(columns=[col]) + pseudocount
    # Put the grouping column back as the first column.
    shifted.insert(0, col, df[col])

    # Group once and reuse the grouper for every value column.
    grouped = shifted.groupby(col)
    group_means = []  # one Series of per-group means for each value column
    for label in shifted.columns:
        if label != col:
            group_means.append(grouped[label].mean())
        else:
            print("分组列，不计算")

    # Combine the per-column means and transpose: rows = entries, columns = groups.
    return pd.concat(group_means, axis=1).T

### 函数测试

In [2]:
### Imports
import pandas as pd
import numpy as np

### Read the input file: row 0 is used as the header and column 0 as the index
df1 = pd.read_excel(r'D:\Work\分析项目\wxy\宏基因组与16S丰度计算\ASV丰度(加测数据)\ASV_relative_depoint.xlsx'
                    , header=0, index_col=0)
# Drop the "Taxonomy" column in place
df1.drop(["Taxonomy"],axis=1,inplace=True)
# Bare expression at the end of the cell: the notebook displays the table
df1

Unnamed: 0_level_0,XG.0w.6,XG.0w.7,XG.0w.8,ND.0w.5,ND.0w.7,ND.0w.8,XG.2w.1,XG.2w.3,XG.2w.3.2,XG.2w.5,...,ND.2w.5,ND.2w.6,ND.2w.7,ND.4w.1,ND.7w.1,ND.7w.10,ND.7w.3,ND.7w.7,ND.7w.8,ND.7w.9
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GroupByTimeDiet,ND0,ND0,ND0,ND0,ND0,ND0,XG2,XG2,XG2,XG2,...,ND2,ND2,ND2,ND2,ND7,ND7,ND7,ND7,ND7,ND7
ASV0,0.082868,0.122091,0.130767,0.03806,0.166999,0.053085,0.037429,0.034803,0.034404,0.050459,...,0.02513,0.00964,0.034005,0.058204,0.153138,0.159354,0.176473,0.210045,0.214699,0.127443
ASV1,0.002194,0.000499,0.000432,0.019113,0.000299,0.005551,0.000997,0.006515,0.002493,0,...,0.214932,0.035201,0.21194,0.091078,0.00246,0.012465,0.008476,0.014061,0.004388,0.001529
ASV2,0.057772,0.042813,0.031412,0.064187,0.028553,0.016853,0.044675,0.046736,0.055977,0.052786,...,0.036232,0.008044,0.013196,0.010039,0.024698,0.017717,0.017052,0.008044,0.014692,0.011335
ASV3,0.029351,0.047434,0.046603,0.012166,0.068076,0.022171,0.007811,0.010271,0.009839,0.017684,...,0.013861,0.006149,0.010936,0.019379,0.100352,0.052486,0.049196,0.083067,0.05488,0.041018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ASV3562,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ASV3563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ASV3564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ASV3565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Transpose the table so that the former row labels (including
# "GroupByTimeDiet") become columns and the former columns become rows.
df1 = df1.T
# Bare expression at the end of the cell: the notebook displays the table
df1

In [None]:

# Run the three functions in their intended order:
# 1. tofloat() 2. log2fc_pretreatment() 3. log2foldchange()
dfflo = tofloat(df=df1,col="GroupByTimeDiet")
dfpre = log2fc_pretreatment(df=dfflo,col="GroupByTimeDiet")
# Last expression of the cell: the returned DataFrame is displayed, not stored
log2foldchange(df=dfpre)

### 函数未封装前

In [165]:
### Imports
import pandas as pd
import numpy as np

### Read the input file
df1 = pd.read_excel(r'D:\Work\分析项目\wxy\宏基因组与16S丰度计算\ASV丰度(加测数据)\ASV_relative_depoint.xlsx'
                    , header=0, index_col=0)
df1.drop(["Taxonomy"],axis=1,inplace=True)
df1 = df1.T

# The bare string below is a note, not code; it says: "the data to process is stored in df1"
"""要处理的数据存在df1里"""

### Group the data and turn the group names into a list. This step ended up unused.
dfGroup = df1.groupby('GroupByTimeDiet')  # group by 'GroupByTimeDiet'
KeyList = list(dfGroup.groups.keys())  # group names converted to a list
len(KeyList)

# Convert the data type: every column except the grouping column becomes float
for i in df1.columns:
    if i != "GroupByTimeDiet":
        df1[i] = df1[i].astype(float, errors = 'raise')


# 1. Find the smallest value in the relative-abundance table
# The bare string below is a note recording that this attempt failed: the zeros
# have different numbers of decimal places, so they could not be replaced with
# nulls, and the minimum found this way can only be 0.
"""失败 0值小数点后的位数不一样，没法替换成空值。这样求出来的最小值只能是0
"""
# 2. Add that smallest value to every value in the relative-abundance table
#    (to handle zeros). Step 1 failed, so 0.0001 is added to every value instead.

### Drop the column that contains strings
df_de = df1.drop(["GroupByTimeDiet"],axis=1,inplace=False)

### Add 0.0001 to every element of df_de
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map; confirm against the pandas version in use.
df_de = df_de.applymap(lambda x: x + 0.0001)

### Insert df1["GroupByTimeDiet"] as the first column of df_de
df_de.insert(0,"GroupByTimeDiet",df1["GroupByTimeDiet"])

### df1_AddNum = the dataframe after adding 0.0001
df1_AddNum = df_de

# 3. Compute the mean of every entry of the abundance table within each group,
#    e.g. the mean of ASV0 in group ND0

### Group the data by the "GroupByTimeDiet" column
# NOTE(review): the result of this call is discarded; the grouping that is
# actually used happens inside the loop below.
df1_AddNum.groupby("GroupByTimeDiet")

### Then take each column and compute the within-group mean
SeriesList = [] # list that stores the resulting Series
for i in df1_AddNum.columns:
    if i != "GroupByTimeDiet":
        SeriesList.append(df1_AddNum.groupby("GroupByTimeDiet")[i].mean())
    else:
        print("分组列，不计算")

# 4. Store the result as a new df
# NOTE(review): the first concat result is discarded; only the assignment on
# the following line is used.
pd.concat(SeriesList, axis=1).T
dfSeriesConcat = pd.concat(SeriesList, axis=1).T
dfSeriesConcat
import numpy as np

# 5. Use the computed means to calculate the between-group log2FC,
#    e.g. log2(ASV0 of ND2 / ASV0 of ND0)

# dfSeriesConcat['ND2/ND0']  = dfSeriesConcat.apply(lambda f: log2fc(f['ND2'],f['ND0']), axis=1)
# Number of columns before any result column is added; it bounds both loops,
# so the newly added ratio columns are never used as inputs.
counter = len(dfSeriesConcat.columns)
i = 0

while i < counter:
    data1 = dfSeriesConcat[dfSeriesConcat.columns[i]] # data1 = one column of dfSeriesConcat (the denominator)
    p = i + 1
    while p < counter:
        data2 = dfSeriesConcat[dfSeriesConcat.columns[p]] # data2 = one column of dfSeriesConcat (the numerator)
        ColName = dfSeriesConcat.columns[p] + "/" + dfSeriesConcat.columns[i]
        dfSeriesConcat[ColName]=np.log2(data2/data1)
        p += 1
    i += 1

# dfSeriesConcat.to_excel(r'D:\Work\分析项目\wxy\宏基因组与16S丰度计算\ASV丰度(加测数据)\ASV_relative_depoint_log2FC.xlsx')
