In [1]:
# 载入包
import os
import re
import time

import numpy as np
import pandas as pd
import requests

In [4]:
# 定义函数
# 1. Clean the UniRef lookup result: drop rows without a KEGG mapping and
#    split entries that carry several KEGG IDs into one row per ID.
def process_uniref(df):
    """Expand a UniRef -> KEGG lookup table to one (Uniref_ID, KEGG_ID) pair per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'From' (the UniRef identifier) and 'KEGG'
        (a ';'-separated string of KEGG IDs, or NaN when there is no mapping).

    Returns
    -------
    pandas.DataFrame
        Columns 'Uniref_ID' and 'KEGG_ID'. Rows whose 'KEGG' value is NaN are
        dropped and empty fragments produced by a trailing ';' are skipped.
        The two columns are present even when no row survives, so callers can
        always index the result by column name.
    """
    # Keep only rows that actually have a KEGG mapping.
    with_kegg = df.loc[df['KEGG'].notna(), ['From', 'KEGG']]

    # One record per (UniRef ID, KEGG ID) pair; `if kegg_id` drops the empty
    # string left behind by a trailing ';'.
    records = [
        {'Uniref_ID': uniref_id, 'KEGG_ID': kegg_id}
        for uniref_id, kegg_field in zip(with_kegg['From'], with_kegg['KEGG'])
        for kegg_id in kegg_field.split(';')
        if kegg_id
    ]

    # Passing `columns` keeps the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=['Uniref_ID', 'KEGG_ID'])

# 2. Convert tab-separated response text into a DataFrame.
def webtext_to_df(x, cols):
    """Parse newline-delimited, tab-separated text into a DataFrame.

    Parameters
    ----------
    x : str
        Text with one record per line and fields separated by tabs.
    cols : list of str
        Column names for the resulting frame.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line of `x`, with columns `cols`. Blank lines
        (including the one produced by a trailing newline) are ignored, so
        empty text yields an empty frame that still has the requested columns.
    """
    # Filter blank lines instead of unconditionally dropping the last row:
    # this keeps the final record when the text has no trailing newline and
    # avoids rows of missing values when bodies are concatenated.
    records = [line.split('\t') for line in x.split('\n') if line.strip()]
    return pd.DataFrame(records, columns=cols)

# 3. Merge the mapping tables into a UniRef ID -> gene symbol table.
def merge_df(uniref2kegg, org2ko, ko2hsa, hsa):
    """Chain the mapping tables into a de-duplicated UniRef ID -> gene symbol table.

    Parameters
    ----------
    uniref2kegg : pandas.DataFrame
        Needs 'Uniref_ID' plus the KEGG gene ID, either as 'org:gene' or as
        'KEGG_ID' (the name used by `process_uniref`).
    org2ko : pandas.DataFrame
        Columns 'org:gene' and 'ko'.
    ko2hsa : pandas.DataFrame
        Columns 'ko' and 'hsa:gene'.
    hsa : pandas.DataFrame
        Needs the columns 'hsa:gene' and 'info'. The gene symbol is taken as
        the text of 'info' before the first ',' and before the first ';'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Uniref_ID' and 'gene_symbol', without duplicate rows. Rows
        for which either value is missing after the merges are dropped.
    """
    # Align the key name with the other tables so the final merge can use
    # 'org:gene' regardless of which name the caller's frame carries.
    if 'org:gene' not in uniref2kegg.columns and 'KEGG_ID' in uniref2kegg.columns:
        uniref2kegg = uniref2kegg.rename(columns={'KEGG_ID': 'org:gene'})

    # ko -> hsa gene -> info, then walk back to org:gene and finally Uniref_ID.
    ko_hsa_info = pd.merge(ko2hsa, hsa[['hsa:gene', 'info']], on='hsa:gene', how='left')
    org_ko_hsa = pd.merge(org2ko, ko_hsa_info, on='ko', how='right')
    uniref_full = pd.merge(uniref2kegg, org_ko_hsa, on='org:gene', how='right')

    mapping = uniref_full[['Uniref_ID', 'info']].rename(columns={'info': 'gene_symbol'})
    # The left/right merges can leave unmatched rows with missing values;
    # they cannot contribute to the mapping, so drop them before string ops.
    mapping = mapping.dropna(subset=['Uniref_ID', 'gene_symbol'])

    # Keep only the first symbol: text before the first ',' and the first ';'.
    symbols = (
        mapping['gene_symbol']
        .str.split(',').str[0]
        .str.split(';').str[0]
        .str.strip()
    )
    mapping = mapping.assign(gene_symbol=symbols)

    # De-duplicate after the symbol has been reduced to its final form.
    return mapping.drop_duplicates().reset_index(drop=True)

# 4. Reshape the data into the layout used as scFoundation input.
def transform_to_scfoundation(gene_family_to_symbol, gene_family):
    """Average abundances per gene symbol and return them with genes as columns.

    Parameters
    ----------
    gene_family_to_symbol : pandas.DataFrame
        Mapping table containing 'Uniref_ID' and 'gene_symbol'.
    gene_family : pandas.DataFrame
        Table containing 'Uniref_ID' plus the value columns to average.

    Returns
    -------
    pandas.DataFrame
        Transposed table: one row per value column of `gene_family`, one
        column per gene symbol, each cell the mean over the rows that share
        that symbol.
    """
    # Left join keeps every row of the mapping table.
    joined = gene_family_to_symbol.merge(gene_family, on='Uniref_ID', how='left')

    # Drop the first column of the joined frame (the UniRef ID).
    values_by_symbol = joined.drop(columns=joined.columns[0])

    # Mean over rows sharing a gene symbol; the symbol becomes the index.
    per_gene_mean = values_by_symbol.groupby('gene_symbol').mean()

    # Transpose so gene symbols become the columns.
    return per_gene_mean.T

In [5]:
# Read the UniRef lookup result file.
tmp_uniref_to_kegg = pd.read_csv("./idmapping_100000.tsv", sep="\t")

# Drop unmapped entries and expand to one (Uniref_ID, KEGG_ID) pair per row.
uniref_to_kegg = process_uniref(tmp_uniref_to_kegg)

# Create the output directory if needed so the write below does not fail
# when the notebook is run in a fresh working directory.
os.makedirs("./results", exist_ok=True)

# Save the UniRef -> KEGG table.
uniref_to_kegg.to_csv("./results/uniref_to_kegg.csv", index=False)

In [6]:
# Number of KEGG gene IDs sent per request.
chunk_size = 100

# Response bodies of all successful requests.
all_results = []

# Query each distinct KEGG gene ID only once; repeated IDs would only return
# rows that are removed again by the de-duplication below.
kegg_ids = uniref_to_kegg['KEGG_ID'].drop_duplicates().tolist()

# Request the gene -> KO links chunk by chunk.
for i in range(0, len(kegg_ids), chunk_size):
    current_chunk = kegg_ids[i:i + chunk_size]

    # IDs are joined with '+' to form a single request URL.
    chunk_str = '+'.join(current_chunk)
    url = f'https://rest.kegg.jp/link/ko/{chunk_str}'

    try:
        # The timeout keeps one stalled connection from hanging the notebook;
        # a timeout is reported by the RequestException handler below.
        response = requests.get(url, timeout=60)

        if response.status_code == 200:
            all_results.append(response.text)
        else:
            print(f"请求失败！ HTTP 状态码: {response.status_code}")

    except requests.exceptions.RequestException as e:
        print(f"请求发生错误: {e}")

    # Pause one second between requests.
    time.sleep(1)

# Join the non-empty bodies with exactly one newline between them, so an empty
# body or a body without a trailing newline cannot create blank or fused rows.
non_empty_bodies = [text.strip('\n') for text in all_results if text.strip()]
final_result = '\n'.join(non_empty_bodies) + '\n' if non_empty_bodies else ''

# Convert the text to a table: org:gene -> ko.
if final_result:
    org_to_ko = webtext_to_df(final_result, cols=['org:gene', 'ko'])
else:
    # Nothing was retrieved: keep the expected columns so later cells still work.
    org_to_ko = pd.DataFrame(columns=['org:gene', 'ko'])

# Remove rows with a missing field and duplicate rows, then save the table.
nr_org_to_ko = org_to_ko.dropna().drop_duplicates()

nr_org_to_ko.to_csv("./results/nr_org_to_ko.csv", index=False)

In [None]:
# Number of KO IDs sent per request.
chunk_size = 100

# Response bodies of all successful requests.
all_results = []

# Query each distinct KO only once and skip missing values, which would make
# the '+'.join below raise a TypeError.
ko_ids = nr_org_to_ko['ko'].dropna().drop_duplicates().tolist()

# Request the KO -> human gene links chunk by chunk.
for i in range(0, len(ko_ids), chunk_size):
    current_chunk = ko_ids[i:i + chunk_size]

    # IDs are joined with '+' to form a single request URL.
    chunk_str = '+'.join(current_chunk)
    url = f'https://rest.kegg.jp/link/hsa/{chunk_str}'

    try:
        # The timeout keeps one stalled connection from hanging the notebook;
        # a timeout is reported by the RequestException handler below.
        response = requests.get(url, timeout=60)

        if response.status_code == 200:
            all_results.append(response.text)
        else:
            print(f"请求失败！ HTTP 状态码: {response.status_code}")

    except requests.exceptions.RequestException as e:
        print(f"请求发生错误: {e}")

    # Pause one second between requests.
    time.sleep(1)

# Join the non-empty bodies with exactly one newline between them, so an empty
# body or a body without a trailing newline cannot create blank or fused rows.
non_empty_bodies = [text.strip('\n') for text in all_results if text.strip()]
final_result = '\n'.join(non_empty_bodies) + '\n' if non_empty_bodies else ''

# Convert the text to a table: ko -> hsa:gene.
if final_result:
    ko_to_hsa = webtext_to_df(final_result, cols = ['ko','hsa:gene'])
else:
    # Nothing was retrieved: keep the expected columns so later cells still work.
    ko_to_hsa = pd.DataFrame(columns=['ko', 'hsa:gene'])

In [None]:
# Column names are supplied explicitly, so the first line of the file is read as data.
hsa_df = pd.read_csv('./hsa.txt', sep='\t', names=['hsa:gene','type','sequence','info'])
# Merge the mapping tables into a UniRef ID -> gene symbol table.
# The result must be bound to `gene_family_to_symbol`, the name used by the
# following statements and cells.
gene_family_to_symbol = merge_df(uniref_to_kegg, nr_org_to_ko, ko_to_hsa, hsa_df)
# Prefix every ID with 'UniRef90_' (vectorized string concatenation).
gene_family_to_symbol['Uniref_ID'] = 'UniRef90_' + gene_family_to_symbol['Uniref_ID']

In [None]:
# Read the gene-family file. Only the first 100000 data rows are used, so let
# the parser stop there instead of loading the whole file and slicing it.
gene_family = pd.read_csv(
    '2021-03-31.TettAJ_2016.gene_families.txt',
    sep='\t',
    index_col=0,
    nrows=100000,
)
# Turn the index into a regular column named 'Uniref_ID'. Naming the index
# first works whether or not the first column has a header in the file;
# renaming a column called 'index' only works when it has none.
gene_family = gene_family.rename_axis('Uniref_ID').reset_index()

In [None]:
# Build the per-gene-symbol table from the UniRef -> symbol mapping and the
# gene-family table (see transform_to_scfoundation for the exact steps).
bulk_data = transform_to_scfoundation(gene_family_to_symbol, gene_family)

# Write the result to CSV; the index is written as well (to_csv default).
bulk_data.to_csv('results/bulk_data.csv')