In [34]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
#pio.templates.default = "plotly_white"

In [22]:
# Read the QSAR training rows from the CSV file (path is relative to the
# notebook's working directory) into the DataFrame `d1`.
d1 = pd.read_csv(
    filepath_or_buffer="../../ML_datasets/QSAR_Molecular_Structure_Predictions/train_rows.csv"
)

In [20]:
# Preview the first rows of the training frame as plain text
# (DataFrame.head() returns the first 5 rows by default).
print(d1.head())

   SpMax_L  J_Dz(e)  nHM  F01[N-N]  F04[C-N]  NssssC  nCb-    C%  nCp  nO  \
0    3.919   2.6909    0         0         0       0     0  31.4    2   0   
1    4.170   2.1144    0         0         0       0     0  30.8    1   1   
2    3.932   3.2512    0         0         0       0     0  26.7    2   4   
3    3.000   2.7098    0         0         0       0     0  20.0    0   2   
4    4.236   3.3944    0         0         0       0     0  29.4    2   4   

   ...  C-026  F02[C-N]  nHDon  SpMax_B(m)  Psi_i_A  nN  SM6_B(m)  nArCOOR  \
0  ...      0         0      0       2.949    1.591   0     7.253        0   
1  ...      0         0      0       3.315    1.967   0     7.257        0   
2  ...      0         0      1       3.076    2.417   0     7.601        0   
3  ...      0         0      1       3.046    5.000   0     6.690        0   
4  ...      0         0      0       3.351    2.405   0     8.003        0   

   nX  Class  
0   0      1  
1   0      1  
2   0      1  
3   0   

In [21]:
# Keep the column labels of the training frame and print them for reference.
# Note: as the printed Index shows, this includes the 'Class' column in
# addition to the other columns, so `features` is "all columns" of d1.
features = d1.columns
print(features)

Index(['SpMax_L', 'J_Dz(e)', 'nHM', 'F01[N-N]', 'F04[C-N]', 'NssssC', 'nCb-',
       'C%', 'nCp', 'nO', 'F03[C-N]', 'SdssC', 'HyWi_B(m)', 'LOC', 'SM6_L',
       'F03[C-O]', 'Me', 'Mi', 'nN-N', 'nArNO2', 'nCRX3', 'SpPosA_B(p)',
       'nCIR', 'B01[C-Br]', 'B03[C-Cl]', 'N-073', 'SpMax_A', 'Psi_i_1d',
       'B04[C-Br]', 'SdO', 'TI2_L', 'nCrt', 'C-026', 'F02[C-N]', 'nHDon',
       'SpMax_B(m)', 'Psi_i_A', 'nN', 'SM6_B(m)', 'nArCOOR', 'nX', 'Class'],
      dtype='object')


In [46]:
# Compute the correlation matrix between every pair of columns in d1
# (DataFrame.corr uses the Pearson coefficient by default).
correlation_matrix = d1.corr()

# Flatten the square matrix into a Series keyed by (column, column) pairs.
correlations = correlation_matrix.unstack()

# Keep each unordered pair exactly once. The strict '<' comparison of the two
# label levels removes self-correlations (a, a) and the mirrored duplicate
# (b, a) of every (a, b) in a single pass, so no separate '!=' filter is
# needed. Feature1 is therefore always the label that sorts first.
first_label = correlations.index.get_level_values(0)
second_label = correlations.index.get_level_values(1)
correlations = correlations[first_label < second_label]

# Rank the pairs by the magnitude of their correlation, but keep the signed
# coefficient as the value: replacing the values with .abs() would report a
# strong negative relationship as if it were positive in everything that
# reads the 'Correlation' column afterwards.
ranked_pairs = correlations.abs().sort_values(ascending=False).index
correlations = correlations.reindex(ranked_pairs)

# Get the 50 strongest correlations (by magnitude) as a flat table with one
# row per pair and a default 0..n-1 index.
top_50 = correlations.head(50).reset_index()
top_50.columns = ['Feature1', 'Feature2', 'Correlation']


In [47]:
# Human-readable label for every ranked pair, in the same order as the rows
# of top_50.
pair_labels = [
    f"{first} vs {second}"
    for first, second in zip(top_50['Feature1'], top_50['Feature2'])
]

# Horizontal bar chart: one bar per pair, positioned by the row index of
# top_50 and coloured by the value in the 'Correlation' column.
fig = px.bar(
    top_50,
    x='Correlation',
    y=top_50.index,
    color='Correlation',
    orientation='h',
    title="Top 50 Feature Correlations",
    labels={"y": "Feature Pair Index", "Correlation": "Correlation Coefficient"},
)

# Show the pair labels at the numeric tick positions instead of the raw
# row index.
y_axis_ticks = dict(tickmode='array', tickvals=top_50.index, ticktext=pair_labels)
fig.update_layout(yaxis=y_axis_ticks)

fig.show()

In [48]:
# Titles for the 10x5 grid of panels, one per ranked pair; make_subplots
# assigns them to the cells row by row.
panel_titles = [
    f"{first} vs {second}<br>Correlation: {coefficient:.2f}"
    for first, second, coefficient in zip(
        top_50['Feature1'], top_50['Feature2'], top_50['Correlation']
    )
]
fig = make_subplots(rows=10, cols=5, subplot_titles=panel_titles)

# Add one scatter plot per pair, plotting the raw d1 columns against each other.
for idx, x_name, y_name in zip(top_50.index, top_50['Feature1'], top_50['Feature2']):
    # Map the row label of top_50 to a 1-based (row, col) grid cell,
    # five panels per row.
    row_offset, col_offset = divmod(idx, 5)
    grid_row = row_offset + 1
    grid_col = col_offset + 1

    pair_trace = go.Scatter(
        x=d1[x_name],
        y=d1[y_name],
        mode='markers',
        name=f"{x_name} vs {y_name}",
        marker=dict(size=4, opacity=0.7),
    )
    fig.add_trace(pair_trace, row=grid_row, col=grid_col)

    # Label the axes of this panel with the two column names.
    fig.update_xaxes(title_text=f"{x_name}", row=grid_row, col=grid_col)
    fig.update_yaxes(title_text=f"{y_name}", row=grid_row, col=grid_col)

# Overall figure size and title; the per-trace legend is hidden because each
# panel already carries its own title.
fig.update_layout(
    height=2500,
    width=2000,
    title_text="Scatter Plots for Top 50 Correlations",
    title_x=0.5,
    showlegend=False,
)

fig.show()