In [90]:
import os
import pandas as pd
import numpy as np

In [61]:
new_directory = r"F:\project"  
os.chdir(new_directory)

In [62]:
# 读取data文件夹中的三个文件
data_folder = "data"
file_paths = [
    os.path.join(data_folder, "9606.protein.info.v12.0.txt"),
    os.path.join(data_folder, "9606.protein.links.full.v12.0.txt"),
    os.path.join(data_folder, "9606.protein.links.v12.0.txt")
]

In [63]:
# 读取protein_info和protein_links表格
protein_info = pd.read_csv(file_paths[0], sep="\t")
protein_links = pd.read_csv(file_paths[2], sep=" ")

In [64]:
protein_info.head()

Unnamed: 0,#string_protein_id,preferred_name,protein_size,annotation
0,9606.ENSP00000000233,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...
1,9606.ENSP00000000412,M6PR,277,Cation-dependent mannose-6-phosphate receptor;...
2,9606.ENSP00000001008,FKBP4,459,"Peptidyl-prolyl cis-trans isomerase FKBP4, N-t..."
3,9606.ENSP00000001146,CYP26B1,512,Cytochrome P450 26B1; Involved in the metaboli...
4,9606.ENSP00000002125,NDUFAF7,441,"Protein arginine methyltransferase NDUFAF7, mi..."


In [65]:
protein_links.head()

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000356607,173
1,9606.ENSP00000000233,9606.ENSP00000427567,154
2,9606.ENSP00000000233,9606.ENSP00000253413,151
3,9606.ENSP00000000233,9606.ENSP00000493357,471
4,9606.ENSP00000000233,9606.ENSP00000324127,201


In [66]:
# 在protein_links中添加name1和name2列
protein_links['name1'] = protein_links['protein1'].map(protein_info.set_index('#string_protein_id')['preferred_name'])
protein_links['name2'] = protein_links['protein2'].map(protein_info.set_index('#string_protein_id')['preferred_name'])

# 将combined_score列移动到最后一列
combined_score = protein_links.pop('combined_score')
protein_links['combined_score'] = combined_score

In [67]:
protein_links.head()

Unnamed: 0,protein1,protein2,name1,name2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000356607,ARF5,RALGPS2,173
1,9606.ENSP00000000233,9606.ENSP00000427567,ARF5,FHDC1,154
2,9606.ENSP00000000233,9606.ENSP00000253413,ARF5,ATP6V1E1,151
3,9606.ENSP00000000233,9606.ENSP00000493357,ARF5,CYTH2,471
4,9606.ENSP00000000233,9606.ENSP00000324127,ARF5,PSD3,201


In [68]:
protein_links = protein_links[protein_links["combined_score"] > 700]

In [69]:
protein_links.reset_index()

Unnamed: 0,index,protein1,protein2,name1,name2,combined_score
0,85,9606.ENSP00000000233,9606.ENSP00000158762,ARF5,ACAP1,825
1,130,9606.ENSP00000000233,9606.ENSP00000357048,ARF5,COPA,718
2,160,9606.ENSP00000000233,9606.ENSP00000262305,ARF5,RAB11FIP3,952
3,197,9606.ENSP00000000233,9606.ENSP00000329419,ARF5,COPB2,752
4,268,9606.ENSP00000000233,9606.ENSP00000469035,ARF5,COPE,795
...,...,...,...,...,...,...
471995,13715019,9606.ENSP00000501277,9606.ENSP00000326630,LDB1,ZFPM1,942
471996,13715034,9606.ENSP00000501277,9606.ENSP00000263726,LDB1,LHX4,944
471997,13715072,9606.ENSP00000501317,9606.ENSP00000290524,RFX7,RFX5,780
471998,13715299,9606.ENSP00000501317,9606.ENSP00000305071,RFX7,RFXANK,978


In [70]:
# 统计所有互作次数
all_proteins = pd.concat([protein_links['name1'], protein_links['name2']])
protein_counts = all_proteins.value_counts().reset_index()
protein_counts.columns = ['protein', 'interaction_count']

In [71]:
protein_counts.head()

Unnamed: 0,protein,interaction_count
0,TP53,1522
1,RPS27A,1178
2,UBA52,1042
3,EGFR,1006
4,CTNNB1,920


In [72]:
top_1000_proteins = protein_counts.head(1000)['protein'].tolist()

In [73]:
# 创建掩码：筛选包含前1000蛋白的行
mask = protein_links['name1'].isin(top_1000_proteins) | protein_links['name2'].isin(top_1000_proteins)

# 收集互作伙伴（从匹配的行中提取另一个蛋白）
interacting_partners = protein_links[mask].apply(
    lambda row: row['name2'] if row['name1'] in top_1000_proteins else row['name1'], 
    axis=1
)

# 去重并转换为唯一列表
unique_partners = interacting_partners.drop_duplicates().tolist()

In [78]:
from Bio import Entrez

Entrez.email = "lx20040519@gmail.com"

def gene_name_to_entrez(gene_name, species="Homo sapiens"):
    handle = Entrez.esearch(db="gene", term=f"{gene_name}[Gene Name] AND {species}[Organism]")
    record = Entrez.read(handle)
    if record["IdList"]:
        return record["IdList"][0]  # 返回第一个匹配的 Entrez ID
    else:
        return None

# 示例
gene_name = "TP53"
entrez_id = gene_name_to_entrez(gene_name)
print(f"Gene Name: {gene_name} → Entrez ID: {entrez_id}")

Gene Name: TP53 → Entrez ID: 7157


In [83]:
entrez_id = [gene_name_to_entrez(gene_name) for gene_name in unique_partners]

In [84]:
name_to_id = {name: id for name, id in zip(unique_partners, entrez_id)}

In [103]:
gene_info = pd.read_csv(r"C:\Users\32418\Desktop\SynergyX-main\data\raw_data\gene_4079.csv",  index_col=0)
gene_info

Unnamed: 0,gene_name,entrez_id,ensembl_id
0,ABCF1,23,ENSG00000204574
1,ABL1,25,ENSG00000097007
2,ABL2,27,ENSG00000143322
3,ACAA1,30,ENSG00000060971
4,ACACB,32,ENSG00000076555
...,...,...,...
4074,POTEB3,102724631,ENSG00000278522
4075,PNMA6F,105373377,ENSG00000225110
4076,KLF18,105378952,ENSG00000283039
4077,EEF1AKMT4-ECE2,110599583,ENSG00000284917


In [104]:
for gene in unique_partners:
    if gene not in gene_info['gene_name'].values:
        # 如果不存在，创建新行
        new_row = pd.DataFrame({
            'gene_name': [gene],
            'entrez_id': [name_to_id.get(gene, np.nan)],  # 如果gene不在name_to_id中，使用NaN
            'ensembl_id': [np.nan]  # 设置为NA
        })
        # 使用pd.concat添加新行
        gene_info = pd.concat([gene_info, new_row], ignore_index=True)

In [106]:
gene_info['entrez_id'] = pd.to_numeric(gene_info['entrez_id'], errors='coerce')
gene_info = gene_info.dropna(subset=['entrez_id'])
gene_info['entrez_id'] = gene_info['entrez_id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_info['entrez_id'] = gene_info['entrez_id'].astype(int)


In [108]:
gene_info = gene_info.sort_values(by='entrez_id', ascending=True)

In [116]:
gene_info

Unnamed: 0,gene_name,entrez_id,ensembl_id
0,A1BG,1,
1,A2M,2,
2,NAT2,10,
3,SERPINA3,12,
4,AAMP,14,
...,...,...,...
11582,TAF11L9,112488745,
11583,TAF11L11,112488746,
11584,TAF11L13,112488747,
11585,LOC112694756,112694756,


In [None]:
gene_info.reset_index(inplace=True)
gene_info.drop(columns=['index'],inplace=True)

In [117]:
gene_info.to_csv('gene_11587.csv')