In [1]:
import pandas as pd
import re
import requests
import time
import numpy as np

In [4]:
nr_org_to_ko = pd.read_csv('./new_results/nr_org_to_ko_9744.csv')

In [5]:
nr_org_to_ko.head()

Unnamed: 0,org:gene,ko
0,shh:ShL2_00101,ko:K00852
1,sxl:SXYLSMQ121_1778,ko:K07816
2,sxo:SXYL_01926,ko:K07816
3,sxy:BE24_02375,ko:K07816
4,sxl:SXYLSMQ121_1567,ko:K02959


In [6]:
def webtext2df_ko2gene(x):
    import pandas as pd
    rows = x.split('\n')
    data = [row.split('\t') for row in rows]
    df = pd.DataFrame(data, columns=['ko','hsa:gene'])
    df.drop(df.index[-1],inplace=True)
    return df

# 将 keggid_list 按 100 行分块
chunk_size = 100

# 初始化存储所有请求结果的变量
all_results = []

# 循环每个分块进行请求
for i in range(0, len(nr_org_to_ko['ko']), chunk_size):
    # 获取当前的 100 行数据
    current_chunk = nr_org_to_ko['ko'][i:i + chunk_size]
    
    # 将列表转换为以加号（+）连接的字符串，适合 URL 请求格式
    chunk_str = '+'.join(current_chunk)
    
    # 构建请求 URL
    url = f'https://rest.kegg.jp/link/hsa/{chunk_str}'
    
    try:
        # 发送 GET 请求
        response = requests.get(url)
        
        if response.status_code == 200:
            # print(f"请求成功！ 第 {i//chunk_size + 1} 批次")
            # 将请求的文本结果添加到总结果中
            all_results.append(response.text)
        else:
            print(f"请求失败！ HTTP 状态码: {response.status_code}")
        
    except requests.exceptions.RequestException as e:
        print(f"请求发生错误: {e}")
    
    # 每次请求后睡眠1秒
    time.sleep(1)

# 将所有结果汇总为一个字符串
final_result = ''.join(all_results)

ko_to_hsa = webtext2df_ko2gene(final_result)

In [7]:
print(ko_to_hsa.shape)
ko_to_hsa.head(10)

(11632, 2)


Unnamed: 0,ko,hsa:gene
0,ko:K00852,hsa:64080
1,ko:K02959,hsa:51021
2,ko:K02959,hsa:51021
3,ko:K02959,hsa:51021
4,ko:K02980,hsa:6235
5,ko:K03009,hsa:5440
6,ko:K02147,hsa:525
7,ko:K02147,hsa:526
8,ko:K02976,hsa:6231
9,ko:K10573,hsa:7319


In [8]:
hsa_df = pd.read_csv('./hsa.txt', sep='\t', names=['hsa:gene','type','sequence','info'])
hsa_df.head(10)

Unnamed: 0,hsa:gene,type,sequence,info
0,hsa:102466751,miRNA,1:complement(17369..17436),"MIR6859-1, hsa-mir-6859-1; microRNA 6859-1"
1,hsa:100302278,miRNA,1:30366..30503,"MIR1302-2, MIRN1302-2, hsa-mir-1302-2; microRN..."
2,hsa:79501,CDS,1:65419..71585,OR4F5; olfactory receptor family 4 subfamily F...
3,hsa:102465909,miRNA,1:complement(187891..187958),"MIR6859-2, hsa-mir-6859-2; microRNA 6859-2"
4,hsa:112268260,CDS,1:complement(365134..382235),uncharacterized LOC112268260
5,hsa:729759,CDS,1:complement(450740..451678),"OR4F29, OR7-21; olfactory receptor family 4 su..."
6,hsa:124904706,ncRNA,1:complement(516376..516479),U6 spliceosomal RNA
7,hsa:105378947,CDS,1:complement(586287..611297),proline-rich extensin-like protein EPR1
8,hsa:113219467,miRNA,1:complement(632615..632685),MIR12136; microRNA 12136
9,hsa:81399,CDS,1:complement(676076..720101),"OR4F16, OR1-1, OR7-21; olfactory receptor fami..."


In [30]:
merge_df = pd.merge(ko_to_hsa, hsa_df[['hsa:gene','info']], on='hsa:gene', how='left')
merge_df['info'] = merge_df['info'].apply(lambda x: x.split(',')[0])
print(merge_df.shape)
merge_df.head(10)

(11632, 3)


Unnamed: 0,ko,hsa:gene,info
0,ko:K00852,hsa:64080,RBKS
1,ko:K02959,hsa:51021,MRPS16
2,ko:K02959,hsa:51021,MRPS16
3,ko:K02959,hsa:51021,MRPS16
4,ko:K02980,hsa:6235,RPS29
5,ko:K03009,hsa:5440,POLR2K
6,ko:K02147,hsa:525,ATP6V1B1
7,ko:K02147,hsa:526,ATP6V1B2
8,ko:K02976,hsa:6231,RPS26
9,ko:K10573,hsa:7319,UBE2A


In [31]:
merge_df = merge_df[~merge_df.duplicated()]
print(merge_df.shape)
merge_df.head()

(3000, 3)


Unnamed: 0,ko,hsa:gene,info
0,ko:K00852,hsa:64080,RBKS
1,ko:K02959,hsa:51021,MRPS16
4,ko:K02980,hsa:6235,RPS29
5,ko:K03009,hsa:5440,POLR2K
6,ko:K02147,hsa:525,ATP6V1B1


In [34]:
merge_df2 = pd.merge(nr_org_to_ko, merge_df, on='ko', how='right')
print(merge_df2.shape)
merge_df2.head(10)

(11632, 4)


Unnamed: 0,org:gene,ko,hsa:gene,info
0,shh:ShL2_00101,ko:K00852,hsa:64080,RBKS
1,mgl:MGL_4124,ko:K00852,hsa:64080,RBKS
2,ckp:ckrop_1856,ko:K00852,hsa:64080,RBKS
3,ser:SERP2100,ko:K00852,hsa:64080,RBKS
4,sxl:SXYLSMQ121_1567,ko:K02959,hsa:51021,MRPS16
5,sxo:SXYL_01645,ko:K02959,hsa:51021,MRPS16
6,sxy:BE24_03470,ko:K02959,hsa:51021,MRPS16
7,sgo:SGO_1323,ko:K02959,hsa:51021,MRPS16
8,mgl:MGL_1982,ko:K02959,hsa:51021,MRPS16
9,nmn:NMCC_0538,ko:K02959,hsa:51021,MRPS16


In [38]:
merge_df2.to_csv('./merge_df2.csv', index=False)

In [40]:
uniref_to_kegg = pd.read_csv('./new_results/uniref_to_kegg_100000.csv')
uniref_to_kegg.rename(columns={'KEGG_ID': 'org:gene'}, inplace=True)
print(uniref_to_kegg.shape)
uniref_to_kegg.head(10)

(15148, 2)


Unnamed: 0,Uniref_ID,org:gene
0,A0A023UEV9,shh:ShL2_00055
1,A0A023UGA4,shh:ShL2_00101
2,A0A028ZLY2,shh:ShL2_00073
3,A0A031FNR5,moo:BWL13_02343
4,A0A060AB44,vg:19685786
5,A0A060AB89,vg:19685835
6,A0A060ABD0,vg:19685880
7,A0A060AET7,vg:19685756
8,A0A060AF10,vg:19685841
9,A0A060AF31,vg:19685866


In [41]:
merge_df3 = pd.merge(uniref_to_kegg, merge_df2, on='org:gene', how='right')
print(merge_df3.shape)
merge_df3.head(10)

(11662, 5)


Unnamed: 0,Uniref_ID,org:gene,ko,hsa:gene,info
0,A0A023UGA4,shh:ShL2_00101,ko:K00852,hsa:64080,RBKS
1,A8QD56,mgl:MGL_4124,ko:K00852,hsa:64080,RBKS
2,C4LL69,ckp:ckrop_1856,ko:K00852,hsa:64080,RBKS
3,Q5HL87,ser:SERP2100,ko:K00852,hsa:64080,RBKS
4,A0A060MLB4,sxl:SXYLSMQ121_1567,ko:K02959,hsa:51021,MRPS16
5,A0A060MLB4,sxo:SXYL_01645,ko:K02959,hsa:51021,MRPS16
6,A0A060MLB4,sxy:BE24_03470,ko:K02959,hsa:51021,MRPS16
7,A8AXU4,sgo:SGO_1323,ko:K02959,hsa:51021,MRPS16
8,A8Q049,mgl:MGL_1982,ko:K02959,hsa:51021,MRPS16
9,A9M2D3,nmn:NMCC_0538,ko:K02959,hsa:51021,MRPS16


In [42]:
merge_df3.to_csv('./new_results/uniref_to_symbol_2.csv', index=False)

In [43]:
merge_df4 = merge_df3[['Uniref_ID','info']]
merge_df4 = merge_df4[~merge_df4.duplicated()]
merge_df4.rename(columns = {'info':'gene_symbol'}, inplace = True)

In [44]:
print(merge_df4.shape)
merge_df4.head(10)

(7934, 2)


Unnamed: 0,Uniref_ID,gene_symbol
0,A0A023UGA4,RBKS
1,A8QD56,RBKS
2,C4LL69,RBKS
3,Q5HL87,RBKS
4,A0A060MLB4,MRPS16
7,A8AXU4,MRPS16
8,A8Q049,MRPS16
9,A9M2D3,MRPS16
10,C4LJC4,MRPS16
11,C5CAG7,MRPS16


In [46]:
merge_df4.to_csv('./new_results/uniref_to_symbol_3.csv.csv', index=False)

In [47]:
merge_df4['gene_symbol'] = merge_df4['gene_symbol'].apply(lambda x: x.split(';')[0] if ';' in x else x)

In [48]:
merge_df4.to_csv('./new_results/uniref_to_symbol_4.csv.csv', index=False)

In [50]:
print("the unique Uniref_ID number: ", merge_df4['Uniref_ID'].nunique())
print("the unique gene_symbol number: ", merge_df4['gene_symbol'].nunique())

the unique Uniref_ID number:  4425
the unique gene_symbol number:  2992


In [51]:
gene_family_to_symbol = merge_df4
gene_family_to_symbol['Uniref_ID'] = gene_family_to_symbol['Uniref_ID'].apply(lambda x: 'UniRef90_' + x)
gene_family_to_symbol.head()

Unnamed: 0,Uniref_ID,gene_symbol
0,UniRef90_A0A023UGA4,RBKS
1,UniRef90_A8QD56,RBKS
2,UniRef90_C4LL69,RBKS
3,UniRef90_Q5HL87,RBKS
4,UniRef90_A0A060MLB4,MRPS16


In [59]:
# 读取基因家族文件
gene_family = pd.read_csv('2021-03-31.TettAJ_2016.gene_families.txt', sep='\t', index_col=0)[:100000]
# 重置索引，使 Uniref_ID 变为一列
gene_family.reset_index(inplace=True)
gene_family.rename(columns={'index': 'Uniref_ID'}, inplace=True)
gene_family.head()

Unnamed: 0,Uniref_ID,SK_CT101OSL_t1M14,SK_CT101RCR_t1M14,SK_CT102OSL_t1M14,SK_CT102RCL_t1M14,SK_CT102RCR_t1M14,SK_CT104OSL_t1M14,SK_CT104OSR_t1M14,SK_CT104RCL_t1M14,SK_CT104RCR_t1M14,...,Steph_b2_64,Steph_b2_65,Steph_b2_66,Steph_b2_67,Steph_b2_68,Steph_b2_70,Steph_b2_71,Steph_b2_72,Steph_b2_74,Steph_b2_9
0,UNMAPPED,0.336727,0.859273,0.374585,0.217949,0.291308,0.312706,0.266681,0.826913,0.822014,...,0.578803,0.420364,0.186414,0.573109,0.834071,0.18572,0.817462,0.575684,0.436962,0.327175
1,UniRef90_C4PUD2,0.003075,6e-05,0.0,5.41045e-07,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,UniRef90_C4PUD0,0.002848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,UniRef90_C4PUC9,0.0027,0.0,0.0,2.82204e-07,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,UniRef90_C4PUC8,0.002671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# 1. 合并两个数据框，基于 uniref_ID 列
merged_gene_symbol = pd.merge(gene_family_to_symbol, gene_family, on='Uniref_ID', how='left')

# 2. 按 gene_symbol 分组，并对每个细胞类型的表达值取平均
# result_df = merged_gene_symbol.groupby('gene_symbol').mean().reset_index()

merged_gene_symbol.head()

Unnamed: 0,Uniref_ID,gene_symbol,SK_CT101OSL_t1M14,SK_CT101RCR_t1M14,SK_CT102OSL_t1M14,SK_CT102RCL_t1M14,SK_CT102RCR_t1M14,SK_CT104OSL_t1M14,SK_CT104OSR_t1M14,SK_CT104RCL_t1M14,...,Steph_b2_64,Steph_b2_65,Steph_b2_66,Steph_b2_67,Steph_b2_68,Steph_b2_70,Steph_b2_71,Steph_b2_72,Steph_b2_74,Steph_b2_9
0,UniRef90_A0A023UGA4,RBKS,0.0,0.0,0.0,3.50682e-07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,UniRef90_A8QD56,RBKS,0.0,0.0,0.0,3.06859e-06,0.0,0.0,0.0,0.0,...,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,UniRef90_C4LL69,RBKS,1.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,UniRef90_Q5HL87,RBKS,4.4e-05,0.0,1.7e-05,7.44143e-05,6.4e-05,7.4e-05,5.4e-05,0.0,...,0.0,9.9e-05,2.4e-05,2.9e-05,0.0,4.9e-05,0.0,0.0,0.0,0.000102
4,UniRef90_A0A060MLB4,MRPS16,0.0,0.0,1e-05,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5e-05,0.0,0.0


In [67]:
# 假设 df 是你的数据框
merged_gene_symbol.drop(merged_gene_symbol.columns[0], axis=1, inplace=True)

merged_gene_symbol.head()

Unnamed: 0,gene_symbol,SK_CT101OSL_t1M14,SK_CT101RCR_t1M14,SK_CT102OSL_t1M14,SK_CT102RCL_t1M14,SK_CT102RCR_t1M14,SK_CT104OSL_t1M14,SK_CT104OSR_t1M14,SK_CT104RCL_t1M14,SK_CT104RCR_t1M14,...,Steph_b2_64,Steph_b2_65,Steph_b2_66,Steph_b2_67,Steph_b2_68,Steph_b2_70,Steph_b2_71,Steph_b2_72,Steph_b2_74,Steph_b2_9
0,RBKS,0.0,0.0,0.0,3.50682e-07,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,RBKS,0.0,0.0,0.0,3.06859e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,RBKS,1.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,RBKS,4.4e-05,0.0,1.7e-05,7.44143e-05,6.4e-05,7.4e-05,5.4e-05,0.0,0.0,...,0.0,9.9e-05,2.4e-05,2.9e-05,0.0,4.9e-05,0.0,0.0,0.0,0.000102
4,MRPS16,0.0,0.0,1e-05,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5e-05,0.0,0.0


In [70]:
avg_gene_expression = merged_gene_symbol.groupby('gene_symbol').mean().reset_index()
print(avg_gene_expression.shape)
avg_gene_expression.head()

(2992, 90)


Unnamed: 0,gene_symbol,SK_CT101OSL_t1M14,SK_CT101RCR_t1M14,SK_CT102OSL_t1M14,SK_CT102RCL_t1M14,SK_CT102RCR_t1M14,SK_CT104OSL_t1M14,SK_CT104OSR_t1M14,SK_CT104RCL_t1M14,SK_CT104RCR_t1M14,...,Steph_b2_64,Steph_b2_65,Steph_b2_66,Steph_b2_67,Steph_b2_68,Steph_b2_70,Steph_b2_71,Steph_b2_72,Steph_b2_74,Steph_b2_9
0,40S ribosomal protein S8-like,0.0,0.0,0.0,9e-06,5e-06,0.0,0.0,0.0,0.0,...,0.0,0.0,8.31535e-06,0.0,0.0,0.0,0.0,0.0,0.0,3e-06
1,AACS,0.0,0.0,0.0,3e-06,1e-06,0.0,0.0,0.0,0.0,...,0.0,0.0,3.026485e-06,0.0,0.0,0.0,0.0,0.0,0.0,2e-06
2,AAK1,0.0,0.0,0.0,3e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.32304e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AAR2,0.0,0.0,0.0,4e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,7.61763e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AARS1,1.3e-05,0.0,4e-06,2.8e-05,9e-06,2.9e-05,3.1e-05,0.0,0.0,...,1.9e-05,1.9e-05,3.920903e-05,1.3e-05,0.0,2.9e-05,0.0,2e-06,0.0,1.8e-05


In [71]:
avg_gene_expression.set_index(avg_gene_expression.columns[0], inplace=True)

avg_gene_expression_T =avg_gene_expression.transpose()

print(avg_gene_expression_T.shape)
avg_gene_expression_T.head()

(89, 2992)


gene_symbol,40S ribosomal protein S8-like,AACS,AAK1,AAR2,AARS1,AARS2,AASDHPPT,AATF,ABCB1,ABCB10,...,double homeobox protein 4,double homeobox protein 4-like,double homeobox protein 4-like protein 4,eukaryotic translation initiation factor 1A,glycine cleavage system H protein,histone H2B type F-S-like,periodic tryptophan protein 2 homolog,putative exonuclease GOR,putative protein N-methyltransferase FAM86B2,tubulin beta 8B
SK_CT101OSL_t1M14,0.0,0.0,0.0,0.0,1.3e-05,1.3e-05,0.0,0.0,0.0,0.0,...,4.83855e-07,4.83855e-07,4.83855e-07,1e-06,7e-06,4.441356e-08,0.0,0.0,0.0,5.421489e-07
SK_CT101RCR_t1M14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.536333e-05,1.536333e-05,1.536333e-05,0.0,8e-06,0.0,0.0,0.0,0.0,0.0
SK_CT102OSL_t1M14,0.0,0.0,0.0,0.0,4e-06,4e-06,0.0,0.0,0.0,0.0,...,8.463327e-06,8.463327e-06,8.463327e-06,0.0,3.6e-05,0.0,0.0,0.0,0.0,1.247258e-06
SK_CT102RCL_t1M14,9e-06,3e-06,3e-06,4e-06,2.8e-05,2.8e-05,1.884205e-06,2e-06,5e-06,3e-06,...,1.017857e-07,1.017857e-07,1.017857e-07,8e-06,1.6e-05,3.6307e-07,6e-06,4e-06,4e-06,1.381661e-06
SK_CT102RCR_t1M14,5e-06,1e-06,0.0,0.0,9e-06,9e-06,7.82665e-07,0.0,1e-06,0.0,...,2.73576e-07,2.73576e-07,2.73576e-07,3e-06,2.3e-05,1.061075e-07,3e-06,0.0,0.0,1.189011e-06


In [64]:
merged_gene_symbol.to_csv('./merged_gene_symbol.csv', index=False)

In [73]:
avg_gene_expression_T.to_csv('./new_results/avg_gene_expression_T.csv')