normalize the distribution of the RSA as compared to Test_interaction_score


In [1]:
import os
import pandas as pd

def create_rsa_distribution_table(data_dir):
    # Define the bins for rsa
    rsa_bins = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    rsa_labels = [f'{rsa_bins[i]}-{rsa_bins[i+1]}' for i in range(len(rsa_bins)-1)]

    # Initialize an empty DataFrame to store the distribution data
    distribution_df = pd.DataFrame(columns=['rsa_bin', 'count'])

    for file in os.listdir(data_dir):
        if file.endswith('_dssp.csv'):
            file_path = os.path.join(data_dir, file)
            df = pd.read_csv(file_path)

            # Bin the rsa values
            df['rsa_bin'] = pd.cut(df['rsa'], bins=rsa_bins, labels=rsa_labels, right=False)

            # Group by rsa_bin and count the occurrences
            grouped = df['rsa_bin'].value_counts().reset_index()
            grouped.columns = ['rsa_bin', 'count']

            # Append the grouped data to the distribution DataFrame
            distribution_df = pd.concat([distribution_df, grouped], ignore_index=True)

    # Aggregate the counts from all files
    distribution_df = distribution_df.groupby('rsa_bin').sum().reset_index()

    return distribution_df

# Construct the relative path to the data directory
data_dir = os.path.join(os.getcwd(), '..', 'data')

# Create the distribution table
distribution_table = create_rsa_distribution_table(data_dir)

# Print the distribution table
print(distribution_table)

     rsa_bin count
0     0-0.05  3484
1   0.05-0.1  1037
2   0.1-0.15   820
3   0.15-0.2   691
4   0.2-0.25   668
5   0.25-0.3   639
6   0.3-0.35   628
7   0.35-0.4   552
8   0.4-0.45   560
9   0.45-0.5   457
10  0.5-0.55   466
11  0.55-0.6   371
12  0.6-0.65   338
13  0.65-0.7   263
14  0.7-0.75   233
15  0.75-0.8   185
16  0.8-0.85   140
17  0.85-0.9   107
18  0.9-0.95    80
19    0.95-1    66


  distribution_df = pd.concat([distribution_df, grouped], ignore_index=True)
  distribution_df = distribution_df.groupby('rsa_bin').sum().reset_index()


In [2]:
import os
import pandas as pd

def create_distribution_table(data_dir, column_name, bins, labels):
    # Initialize an empty DataFrame to store the distribution data
    distribution_df = pd.DataFrame(columns=[f'{column_name}_bin', 'count'])

    for file in os.listdir(data_dir):
        if file.endswith('_dssp.csv'):
            file_path = os.path.join(data_dir, file)
            df = pd.read_csv(file_path)

            # Bin the values
            df[f'{column_name}_bin'] = pd.cut(df[column_name], bins=bins, labels=labels, right=False)

            # Group by bin and count the occurrences
            grouped = df[f'{column_name}_bin'].value_counts().reset_index()
            grouped.columns = [f'{column_name}_bin', 'count']

            # Append the grouped data to the distribution DataFrame
            distribution_df = pd.concat([distribution_df, grouped], ignore_index=True)

    # Aggregate the counts from all files
    distribution_df = distribution_df.groupby(f'{column_name}_bin').sum().reset_index()

    # Add a row for the total count
    total_count = distribution_df['count'].sum()
    total_row = pd.DataFrame({f'{column_name}_bin': ['Total'], 'count': [total_count]})
    distribution_df = pd.concat([distribution_df, total_row], ignore_index=True)

    return distribution_df

# Define the bins and labels for rsa
rsa_bins = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1, 1.1]
rsa_labels = [f'{rsa_bins[i]}-{rsa_bins[i+1]}' for i in range(len(rsa_bins)-1)]

# Define the bins and labels for test_interaction_score
tis_bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1]
tis_labels = [f'{tis_bins[i]}-{tis_bins[i+1]}' for i in range(len(tis_bins)-1)]

# Construct the relative path to the data directory
data_dir = os.path.join(os.getcwd(), '..', 'data')

# Create the distribution tables
rsa_distribution_table = create_distribution_table(data_dir, 'rsa', rsa_bins, rsa_labels)
tis_distribution_table = create_distribution_table(data_dir, 'test_interaction_score', tis_bins, tis_labels)

# Print the distribution tables
print("RSA Distribution Table:")
print(rsa_distribution_table)
print("\nTest Interaction Score Distribution Table:")
print(tis_distribution_table)


RSA Distribution Table:
     rsa_bin  count
0     0-0.05   3484
1   0.05-0.1   1037
2   0.1-0.15    820
3   0.15-0.2    691
4   0.2-0.25    668
5   0.25-0.3    639
6   0.3-0.35    628
7   0.35-0.4    552
8   0.4-0.45    560
9   0.45-0.5    457
10  0.5-0.55    466
11  0.55-0.6    371
12  0.6-0.65    338
13  0.65-0.7    263
14  0.7-0.75    233
15  0.75-0.8    185
16  0.8-0.85    140
17  0.85-0.9    107
18  0.9-0.95     80
19    0.95-1     66
20     1-1.1     94
21     Total  11879

Test Interaction Score Distribution Table:
   test_interaction_score_bin  count
0                       0-0.1   7089
1                     0.1-0.2   1008
2                     0.2-0.3    590
3                     0.3-0.4    485
4                     0.4-0.5    364
5                     0.5-0.6    393
6                     0.6-0.7    357
7                     0.7-0.8    343
8                     0.8-0.9    447
9                       0.9-1    803
10                      1-1.1      0
11                      Tota

  distribution_df = pd.concat([distribution_df, grouped], ignore_index=True)
  distribution_df = distribution_df.groupby(f'{column_name}_bin').sum().reset_index()
  distribution_df = pd.concat([distribution_df, grouped], ignore_index=True)
  distribution_df = distribution_df.groupby(f'{column_name}_bin').sum().reset_index()
