Modify sample DSSP files to have a placeholder interaction score for now

In [10]:
import os
import pandas as pd

def modify_dssp_files(data_dir, threshold=0.25):
    """Add a binary placeholder ``test_interaction_score`` column to DSSP CSVs.

    Every file directly inside ``data_dir`` whose name ends with
    ``_dssp.csv`` is read, given a ``test_interaction_score`` column that is
    1 where ``rsa`` is strictly greater than ``threshold`` and 0 otherwise,
    and then written back to the same path (the file is overwritten).

    Args:
        data_dir: Directory to scan; sub-directories are not searched.
        threshold: Cut-off applied to the ``rsa`` column. Defaults to 0.25.

    Raises:
        KeyError: If a matching file has no ``rsa`` column.
    """
    for file in os.listdir(data_dir):
        if not file.endswith('_dssp.csv'):
            continue
        file_path = os.path.join(data_dir, file)
        df = pd.read_csv(file_path)
        # Vectorised comparison instead of a per-row lambda; a missing rsa
        # value compares False and therefore scores 0.
        df['test_interaction_score'] = (df['rsa'] > threshold).astype(int)
        df.to_csv(file_path, index=False)

# The data folder is a sibling of the current working directory
data_dir = os.path.join(os.getcwd(), '..', 'data')

# Rewrite every DSSP CSV found there with the placeholder score column
modify_dssp_files(data_dir=data_dir)

In [11]:
import os
import pandas as pd
import numpy as np

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of ``x``.

    ``x`` is passed straight to ``np.exp``, so scalars and array-likes that
    NumPy can negate and exponentiate are both accepted.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def modify_dssp_files(data_dir):
    """Add a smooth placeholder ``test_interaction_score`` column to DSSP CSVs.

    Each file directly inside ``data_dir`` whose name ends with ``_dssp.csv``
    is read, given a ``test_interaction_score`` column equal to
    ``sigmoid(rsa * 10 - 5)`` for every row, and written back to the same
    path (the file is overwritten).
    """
    def score_from_rsa(rsa_value):
        # Logistic curve that yields 0.5 at rsa == 0.5
        return sigmoid(rsa_value * 10 - 5)

    dssp_names = [name for name in os.listdir(data_dir) if name.endswith('_dssp.csv')]
    for name in dssp_names:
        csv_path = os.path.join(data_dir, name)
        table = pd.read_csv(csv_path)
        table['test_interaction_score'] = table['rsa'].apply(score_from_rsa)
        table.to_csv(csv_path, index=False)

# The data folder is a sibling of the current working directory
data_dir = os.path.join(os.getcwd(), '..', 'data')

# Rewrite every DSSP CSV found there with the sigmoid-based placeholder score
modify_dssp_files(data_dir=data_dir)

To REMOVE the PLACEHOLDER, run the cell below; it deletes the interaction score column from the DSSP files.

In [8]:
import os
import pandas as pd

def print_and_remove_columns(data_dir, columns_to_remove):
    """Print the columns of each DSSP CSV, then drop the requested ones.

    For every file directly inside ``data_dir`` whose name ends with
    ``_dssp.csv``: the column list is printed as it was read, any names from
    ``columns_to_remove`` that are present are dropped, and the result is
    written back to the same path. Names that are not present are ignored.
    The file is rewritten even when nothing was dropped.
    """
    for entry in os.listdir(data_dir):
        if not entry.endswith('_dssp.csv'):
            continue
        target = os.path.join(data_dir, entry)
        frame = pd.read_csv(target)
        print(f"Columns in {entry}: {frame.columns.tolist()}")

        # Only ask pandas to drop what is actually there
        present = [name for name in columns_to_remove if name in frame.columns]
        frame.drop(columns=present).to_csv(target, index=False)

# The data folder is a sibling of the current working directory
data_dir = os.path.join(os.getcwd(), '..', 'data')

# Columns to strip from each DSSP CSV; extend this list as required
columns_to_remove = ['test_interaction_score']
print_and_remove_columns(data_dir=data_dir, columns_to_remove=columns_to_remove)

Columns in 1B41_dssp.csv: ['dssp_index', 'Protein_id', 'chain', 'aa', 'eight_hot_ss', 'three_hot_ss', 'rsa', 'phi', 'psi', 'test_interaction_score']
Columns in 2ACH_dssp.csv: ['dssp_index', 'Protein_id', 'chain', 'aa', 'eight_hot_ss', 'three_hot_ss', 'rsa', 'phi', 'psi', 'test_interaction_score']
Columns in 2OYI_dssp.csv: ['dssp_index', 'Protein_id', 'chain', 'aa', 'eight_hot_ss', 'three_hot_ss', 'rsa', 'phi', 'psi', 'test_interaction_score']
Columns in 2WII_dssp.csv: ['dssp_index', 'Protein_id', 'chain', 'aa', 'eight_hot_ss', 'three_hot_ss', 'rsa', 'phi', 'psi', 'test_interaction_score']
Columns in 2XWT_dssp.csv: ['dssp_index', 'Protein_id', 'chain', 'aa', 'eight_hot_ss', 'three_hot_ss', 'rsa', 'phi', 'psi', 'test_interaction_score']
Columns in 2A74_dssp.csv: ['dssp_index', 'Protein_id', 'chain', 'aa', 'eight_hot_ss', 'three_hot_ss', 'rsa', 'phi', 'psi', 'test_interaction_score']
Columns in 2B4X_dssp.csv: ['dssp_index', 'Protein_id', 'chain', 'aa', 'eight_hot_ss', 'three_hot_ss', 'rsa

Code to ADD physicochemical properties to the DSSP files


In [27]:
import os
import pandas as pd

def load_physicochemical_properties(file_path):
    """Read the properties CSV at ``file_path``, indexed by its ``Index`` column.

    Returns the resulting DataFrame; the ``Index`` column becomes the row
    index and is no longer a regular column.
    """
    return pd.read_csv(file_path).set_index('Index')

def modify_dssp_files(data_dir, physicochemical_properties):
    """Append one column per physicochemical property to every DSSP CSV.

    For each file directly inside ``data_dir`` whose name ends with
    ``_dssp.csv``, a column is added (or overwritten) for every label in
    ``physicochemical_properties.index``. The value in each row is
    ``physicochemical_properties.loc[property_name, aa]`` for that row's
    ``aa`` entry. The file is then written back to the same path.

    Args:
        data_dir: Directory to scan; sub-directories are not searched.
        physicochemical_properties: DataFrame whose index holds property
            names and whose columns are looked up by the ``aa`` values.

    Raises:
        KeyError: If a file has no ``aa`` column, or contains an ``aa``
            value that is not a column of ``physicochemical_properties``.
    """
    for file in os.listdir(data_dir):
        if not file.endswith('_dssp.csv'):
            continue
        file_path = os.path.join(data_dir, file)
        df = pd.read_csv(file_path)

        # Look each distinct residue up once per property rather than once
        # per row, then broadcast the values with a vectorised map.
        residues = df['aa'].unique()
        for property_name in physicochemical_properties.index:
            value_by_residue = {
                aa: physicochemical_properties.loc[property_name, aa]
                for aa in residues
            }
            df[property_name] = df['aa'].map(value_by_residue)

        df.to_csv(file_path, index=False)

# The data folder is a sibling of the current working directory
data_dir = os.path.join(os.getcwd(), '..', 'data')

# The property table lives in the same folder as the DSSP CSVs
physicochemical_properties_file = os.path.join(data_dir, 'physicochemical_properties.csv')
physicochemical_properties = load_physicochemical_properties(
    physicochemical_properties_file
)

# Append the property columns to every DSSP CSV in the data folder
modify_dssp_files(
    data_dir=data_dir,
    physicochemical_properties=physicochemical_properties,
)

Code to REMOVE physicochemical properties from the DSSP files

In [26]:
import os
import pandas as pd

def load_physicochemical_properties(file_path):
    """Read the properties CSV at ``file_path``, indexed by its ``Index`` column.

    Returns the resulting DataFrame; the ``Index`` column becomes the row
    index and is no longer a regular column.
    """
    raw_table = pd.read_csv(file_path)
    indexed_table = raw_table.set_index('Index')
    return indexed_table

def remove_physicochemical_columns(data_dir, physicochemical_properties):
    """Drop the physicochemical property columns from every DSSP CSV.

    For each file directly inside ``data_dir`` whose name ends with
    ``_dssp.csv``, every column whose name appears in
    ``physicochemical_properties.index`` is removed and the file is written
    back to the same path. Property names with no matching column are
    ignored; the file is rewritten even when nothing was dropped.
    """
    property_names = list(physicochemical_properties.index)
    for entry in os.listdir(data_dir):
        if not entry.endswith('_dssp.csv'):
            continue
        target = os.path.join(data_dir, entry)
        table = pd.read_csv(target)

        # Restrict the drop to property columns that this file really has
        doomed = [name for name in property_names if name in table.columns]
        trimmed = table.drop(columns=doomed)

        trimmed.to_csv(target, index=False)

# The data folder is a sibling of the current working directory
data_dir = os.path.join(os.getcwd(), '..', 'data')

# The property table lives in the same folder as the DSSP CSVs
physicochemical_properties_file = os.path.join(data_dir, 'physicochemical_properties.csv')
physicochemical_properties = load_physicochemical_properties(
    physicochemical_properties_file
)

# Strip the property columns back out of every DSSP CSV in the data folder
remove_physicochemical_columns(
    data_dir=data_dir,
    physicochemical_properties=physicochemical_properties,
)