In [1]:
import pandas as pd
import os

# Merge each protein's RSA table with its secondary-structure (SS) table and
# write one combined CSV per protein into `output_dir`.
#
# Every problem with an individual protein (missing SS file, unreadable or
# malformed table, missing column) is recorded in `failed_files` together with
# a reason in `failure_reasons`, so one bad input never aborts the whole batch.

# Directories
rsa_dir = 'data/rsa'
ss_dir = 'data/ss'
output_dir = 'data'

# Join keys shared by both tables, and the column order of the output CSV
merge_keys = ['dssp_index', 'aa']
output_columns = ['dssp_index', 'Protein_id', 'chain', 'aa', 'eight_hot_ss',
                  'three_hot_ss', 'rsa', 'phi', 'psi']
ss_suffix = '_ss.tsv'

# Lists to store successful and failed files
successful_files = []
failed_files = []
# Maps each entry of `failed_files` to a short description of what went wrong
failure_reasons = {}


def _find_ss_file(prefix, ss_names):
    """Return the SS file name in `ss_names` that belongs to `prefix`.

    An exact ``<prefix>_ss.tsv`` match always wins. Otherwise a file that
    starts with `prefix` and ends with ``_ss.tsv`` is accepted only if it is
    the single such candidate; a bare ``startswith`` test would otherwise let
    e.g. ``1G0U_1_7`` pick up ``1G0U_1_70_ss.tsv``.

    Raises:
        FileNotFoundError: no candidate SS file exists.
        ValueError: several candidates exist and none is an exact match.
    """
    exact_name = prefix + ss_suffix
    if exact_name in ss_names:
        return exact_name

    candidates = sorted(
        name for name in ss_names
        if name.startswith(prefix) and name.endswith(ss_suffix)
    )
    if not candidates:
        raise FileNotFoundError(f"no SS file found for prefix '{prefix}'")
    if len(candidates) > 1:
        raise ValueError(f"ambiguous SS files for prefix '{prefix}': {candidates}")
    return candidates[0]


# Make sure the destination exists before writing anything into it
os.makedirs(output_dir, exist_ok=True)

# List the SS directory once instead of once per RSA file. A missing SS
# directory simply means no RSA file can be matched (each is reported as failed).
ss_names = set(os.listdir(ss_dir)) if os.path.isdir(ss_dir) else set()

# Loop through RSA files; sorted because os.listdir order is arbitrary
for rsa_file in sorted(os.listdir(rsa_dir)):
    rsa_path = os.path.join(rsa_dir, rsa_file)

    # Skip hidden entries and sub-directories (e.g. .ipynb_checkpoints, .DS_Store)
    if rsa_file.startswith('.') or not os.path.isfile(rsa_path):
        continue

    # Use the full file name as the prefix, excluding the extension
    prefix = rsa_file.rsplit('.', 1)[0]

    try:
        # Load RSA data
        rsa_df = pd.read_csv(rsa_path, sep='\t')

        # Find and load the corresponding SS file
        ss_file = _find_ss_file(prefix, ss_names)
        ss_df = pd.read_csv(os.path.join(ss_dir, ss_file), sep='\t')

        # Merge dataframes. This is an inner join (pandas default): rows whose
        # (dssp_index, aa) pair is absent from either table are dropped.
        merged_df = pd.merge(rsa_df, ss_df, on=merge_keys)

        # Add protein name column
        merged_df['Protein_id'] = prefix

        # Reorder columns (raises KeyError if an expected column is missing)
        merged_df = merged_df[output_columns]

        # Save to CSV
        output_file = prefix + '_dssp.csv'
        merged_df.to_csv(os.path.join(output_dir, output_file), index=False)

        # Add to successful files list
        successful_files.append(output_file)

    except (OSError, KeyError, ValueError) as exc:
        # OSError covers missing/unreadable files; KeyError covers missing
        # columns; ValueError covers pandas parser/empty-data/merge errors and
        # the ambiguous-match case raised by _find_ss_file.
        failed_files.append(rsa_file)
        failure_reasons[rsa_file] = f"{type(exc).__name__}: {exc}"

# Print successful files
print("Successfully merged files:")
for file in successful_files:
    print(file)

# Print failed files along with the reason each one failed
print("\nFailed to merge files:")
for file in failed_files:
    print(f"{file} ({failure_reasons[file]})")


Successfully merged files:
1HQ3_1_2_dssp.csv
1HLU_1_2_dssp.csv
1HLE_1_2_dssp.csv
1FYT_2_4_dssp.csv
1FYT_1_4_dssp.csv
1A0R_2_3_dssp.csv
2WII_1_3_dssp.csv
1FZC_1_2_dssp.csv
2WII_2_3_dssp.csv
1G0U_10_7_dssp.csv

Failed to merge files:
