In [1]:
import pandas as pd
from Bio import SeqIO
import os

def restore_to_tsv(csv_path, fasta_path, output_tsv_path):
    """
    Rebuild the flat TSV dataset from a pairing CSV and a FASTA sequence file.

    Each CSV row names a light-chain, heavy-chain and antigen sequence ID plus
    a ``delta_g`` value. The IDs are resolved against the FASTA records and one
    TSV row is written for every CSV row whose three IDs all resolve; rows with
    an unresolved (or blank) ID are reported and skipped.

    Args:
        csv_path: Path to the pairing CSV; must contain the columns
            ``light``, ``heavy``, ``antigen`` and ``delta_g``.
        fasta_path: Path to the FASTA file holding the sequences.
        output_tsv_path: Destination path of the tab-separated output.

    Returns:
        None. The result is written to ``output_tsv_path``; a summary and a
        preview of the first rows are printed.

    Raises:
        KeyError: If the CSV lacks one of the required columns.
    """
    # Fixed output schema, so the header is written even when no row resolves.
    output_columns = [
        'pdb_id', 'heavy_sequence', 'light_sequence', 'antigen_sequence', 'delta_g',
    ]

    print(f"读取文件 {csv_path} 和 {fasta_path}...")

    # Map record id -> sequence string. If an id occurs more than once in the
    # FASTA file, the last record silently wins.
    sequences = {
        record.id: str(record.seq)
        for record in SeqIO.parse(fasta_path, "fasta")
    }
    print(f"从FASTA文件加载了 {len(sequences)} 个序列")

    pairs_df = pd.read_csv(csv_path)
    print(f"从CSV文件加载了 {len(pairs_df)} 条配对记录")

    restored_data = []
    pair_columns = zip(
        pairs_df['light'], pairs_df['heavy'], pairs_df['antigen'], pairs_df['delta_g']
    )
    for light_id, heavy_id, antigen_id, delta_g in pair_columns:
        try:
            light_sequence = sequences[light_id]
            heavy_sequence = sequences[heavy_id]
            antigen_sequence = sequences[antigen_id]
        except KeyError as e:
            # Covers unknown IDs as well as blank CSV cells (read as NaN).
            print(f"警告: 找不到ID {e}的序列，跳过该记录")
            continue

        restored_data.append({
            # The lookup above succeeded, so light_id is a FASTA id string and
            # .split is safe. Assumes IDs look like "<pdb_id>_<type>".
            'pdb_id': light_id.split('_')[0],
            'heavy_sequence': heavy_sequence,
            'light_sequence': light_sequence,
            'antigen_sequence': antigen_sequence,
            'delta_g': delta_g
        })

    # Build the frame once from the collected records and save it as TSV.
    restored_df = pd.DataFrame(restored_data, columns=output_columns)
    restored_df.to_csv(output_tsv_path, sep='\t', index=False)

    print(f"成功将数据还原并保存到 {output_tsv_path}")
    print(f"还原的TSV文件包含 {len(restored_df)} 条记录")

    # Preview of the first rows
    print("\n===== 还原的TSV文件预览 =====")
    print(restored_df.head().to_string())

# Run the restoration for the sabdab pairing file
csv_path, fasta_path, output_tsv_path = (
    "datasets/pairs_sabdab.csv",
    "datasets/seq_natural.fasta",
    "datasets/sabdab_dataset.tsv",
)

restore_to_tsv(
    csv_path=csv_path,
    fasta_path=fasta_path,
    output_tsv_path=output_tsv_path,
)

读取文件 datasets/pairs_sabdab.csv 和 datasets/seq_natural.fasta...
从FASTA文件加载了 2084 个序列
从CSV文件加载了 578 条配对记录
成功将数据还原并保存到 datasets/sabdab_dataset.tsv
还原的TSV文件包含 578 条记录

===== 还原的TSV文件预览 =====
  pdb_id                                                                                                                                                                                                                               heavy_sequence                                                                                                                                                                                                                light_sequence                                                                                                                                                                                                                                                                                      antigen_sequence    delta_g
0   4UU9                                

In [3]:
import pandas as pd
from Bio import SeqIO
import os

def restore_to_tsv(csv_path, fasta_path, output_tsv_path):
    """
    Rebuild the flat TSV dataset from a pairing CSV and a FASTA sequence file.

    Each CSV row names a light-chain, heavy-chain and antigen sequence ID plus
    a ``delta_g`` value. The IDs are resolved against the FASTA records and one
    TSV row is written for every CSV row whose three IDs all resolve; rows with
    an unresolved (or blank) ID are reported and skipped.

    Args:
        csv_path: Path to the pairing CSV; must contain the columns
            ``light``, ``heavy``, ``antigen`` and ``delta_g``.
        fasta_path: Path to the FASTA file holding the sequences.
        output_tsv_path: Destination path of the tab-separated output.

    Returns:
        None. The result is written to ``output_tsv_path``; a summary and a
        preview of the first rows are printed.

    Raises:
        KeyError: If the CSV lacks one of the required columns.
    """
    # Fixed output schema, so the header is written even when no row resolves.
    output_columns = [
        'pdb_id', 'heavy_sequence', 'light_sequence', 'antigen_sequence', 'delta_g',
    ]

    print(f"读取文件 {csv_path} 和 {fasta_path}...")

    # Map record id -> sequence string. If an id occurs more than once in the
    # FASTA file, the last record silently wins.
    sequences = {
        record.id: str(record.seq)
        for record in SeqIO.parse(fasta_path, "fasta")
    }
    print(f"从FASTA文件加载了 {len(sequences)} 个序列")

    pairs_df = pd.read_csv(csv_path)
    print(f"从CSV文件加载了 {len(pairs_df)} 条配对记录")

    restored_data = []
    pair_columns = zip(
        pairs_df['light'], pairs_df['heavy'], pairs_df['antigen'], pairs_df['delta_g']
    )
    for light_id, heavy_id, antigen_id, delta_g in pair_columns:
        try:
            light_sequence = sequences[light_id]
            heavy_sequence = sequences[heavy_id]
            antigen_sequence = sequences[antigen_id]
        except KeyError as e:
            # Covers unknown IDs as well as blank CSV cells (read as NaN).
            print(f"警告: 找不到ID {e}的序列，跳过该记录")
            continue

        restored_data.append({
            # The lookup above succeeded, so light_id is a FASTA id string and
            # .split is safe. Assumes IDs look like "<pdb_id>_<type>".
            'pdb_id': light_id.split('_')[0],
            'heavy_sequence': heavy_sequence,
            'light_sequence': light_sequence,
            'antigen_sequence': antigen_sequence,
            'delta_g': delta_g
        })

    # Build the frame once from the collected records and save it as TSV.
    restored_df = pd.DataFrame(restored_data, columns=output_columns)
    restored_df.to_csv(output_tsv_path, sep='\t', index=False)

    print(f"成功将数据还原并保存到 {output_tsv_path}")
    print(f"还原的TSV文件包含 {len(restored_df)} 条记录")

    # Preview of the first rows
    print("\n===== 还原的TSV文件预览 =====")
    print(restored_df.head().to_string())

# Run the restoration for the skempi pairing file
csv_path, fasta_path, output_tsv_path = (
    "datasets/pairs_skempi.csv",
    "datasets/seq.fasta",
    "datasets/skempi_dataset.tsv",
)

restore_to_tsv(
    csv_path=csv_path,
    fasta_path=fasta_path,
    output_tsv_path=output_tsv_path,
)

读取文件 datasets/pairs_skempi.csv 和 datasets/seq.fasta...
从FASTA文件加载了 6352 个序列
从CSV文件加载了 387 条配对记录
成功将数据还原并保存到 datasets/skempi_dataset.tsv
还原的TSV文件包含 387 条记录

===== 还原的TSV文件预览 =====
  pdb_id                                                                                                                                                                                                          heavy_sequence                                                                                                                                                                                                          light_sequence                                                                                                                                                                                                             antigen_sequence  delta_g
0   1AHW  EIQLQQSGAELVRPGALVKLSCKASGFNIKDYYMHWVKQRPEQGLEWIGLIDPENGNTIYDPKFQGKASITADTSSNTAYLQLSSLTSEDTAVYYCARDNSYYFDYWGQGTTLTVSSAKTTPPSVYPLAPGSAAQTNSMV

In [5]:
import pandas as pd
from Bio import SeqIO
import os

def restore_to_tsv(csv_path, fasta_path, output_tsv_path):
    """
    Rebuild the flat TSV dataset from a pairing CSV and a FASTA sequence file.

    Each CSV row names a light-chain, heavy-chain and antigen sequence ID plus
    a ``delta_g`` value. The IDs are resolved against the FASTA records and one
    TSV row is written for every CSV row whose three IDs all resolve; rows with
    an unresolved (or blank) ID are reported and skipped.

    Args:
        csv_path: Path to the pairing CSV; must contain the columns
            ``light``, ``heavy``, ``antigen`` and ``delta_g``.
        fasta_path: Path to the FASTA file holding the sequences.
        output_tsv_path: Destination path of the tab-separated output.

    Returns:
        None. The result is written to ``output_tsv_path``; a summary and a
        preview of the first rows are printed.

    Raises:
        KeyError: If the CSV lacks one of the required columns.
    """
    # Fixed output schema, so the header is written even when no row resolves.
    output_columns = [
        'pdb_id', 'heavy_sequence', 'light_sequence', 'antigen_sequence', 'delta_g',
    ]

    print(f"读取文件 {csv_path} 和 {fasta_path}...")

    # Map record id -> sequence string. If an id occurs more than once in the
    # FASTA file, the last record silently wins.
    sequences = {
        record.id: str(record.seq)
        for record in SeqIO.parse(fasta_path, "fasta")
    }
    print(f"从FASTA文件加载了 {len(sequences)} 个序列")

    pairs_df = pd.read_csv(csv_path)
    print(f"从CSV文件加载了 {len(pairs_df)} 条配对记录")

    restored_data = []
    pair_columns = zip(
        pairs_df['light'], pairs_df['heavy'], pairs_df['antigen'], pairs_df['delta_g']
    )
    for light_id, heavy_id, antigen_id, delta_g in pair_columns:
        try:
            light_sequence = sequences[light_id]
            heavy_sequence = sequences[heavy_id]
            antigen_sequence = sequences[antigen_id]
        except KeyError as e:
            # Covers unknown IDs as well as blank CSV cells (read as NaN).
            print(f"警告: 找不到ID {e}的序列，跳过该记录")
            continue

        restored_data.append({
            # The lookup above succeeded, so light_id is a FASTA id string and
            # .split is safe. Assumes IDs look like "<pdb_id>_<type>".
            'pdb_id': light_id.split('_')[0],
            'heavy_sequence': heavy_sequence,
            'light_sequence': light_sequence,
            'antigen_sequence': antigen_sequence,
            'delta_g': delta_g
        })

    # Build the frame once from the collected records and save it as TSV.
    restored_df = pd.DataFrame(restored_data, columns=output_columns)
    restored_df.to_csv(output_tsv_path, sep='\t', index=False)

    print(f"成功将数据还原并保存到 {output_tsv_path}")
    print(f"还原的TSV文件包含 {len(restored_df)} 条记录")

    # Preview of the first rows
    print("\n===== 还原的TSV文件预览 =====")
    print(restored_df.head().to_string())

# Run the restoration for the abbind pairing file
csv_path, fasta_path, output_tsv_path = (
    "datasets/pairs_abbind.csv",
    "datasets/seq.fasta",
    "datasets/abbind_dataset.tsv",
)

restore_to_tsv(
    csv_path=csv_path,
    fasta_path=fasta_path,
    output_tsv_path=output_tsv_path,
)

读取文件 datasets/pairs_abbind.csv 和 datasets/seq.fasta...
从FASTA文件加载了 6352 个序列
从CSV文件加载了 1089 条配对记录
成功将数据还原并保存到 datasets/abbind_dataset.tsv
还原的TSV文件包含 1089 条记录

===== 还原的TSV文件预览 =====
  pdb_id heavy_sequence                                                                                                                                                         light_sequence                                                                                                                                   antigen_sequence  delta_g
0   1AK4              P  MVNPTVFFDIAVDGEPLGRVSFELFADKVPKTAENFRALSTGEKGFGYKGSCFHRIIPGFMCQGGDFTRHNGTGGKSIYGEKFEDENFILKHTGPGILSMANAGPNTNGSQFFICTAKTEWLDGKHVVFGKVKEGMNIVEAMERFGSRNGKTSKKITIADCGQLE  PIVQNLQGQMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAAEWDRLHAVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTHNPPIPVGEIYKRWIILGLNKIVRMY    -7.15
1   1AK4              P  MVNPTVFFDIAVDGEPLGRVSFELFADKVPKTAENFRALSTGEKGFGYKGSCFHRIIPGFMCQGGDFTRHNGTGGKSIYGEKFEDENFILKHTGPGILSM