<a href="https://colab.research.google.com/github/DaniChinwendu/skin-sensitization-prediction/blob/main/Similarity__Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
OPTIMIZING SKIN SENSITIZATION PREDICTION: A COMPARATIVE ANALYSIS OF KNN VS RANDOM FOREST

Tanimoto Similarity Matrix and Pairwise Analysis - Google Colab Notebook

Daniel C. Ukaegbu 1, Karolina Kopanska 1, Peter Ranslow 2, Thomas Hartung 1, Alexandra Maertens 1*
1 Center for Alternatives to Animal Testing (CAAT), Johns Hopkins Bloomberg School of Public Health, Baltimore, Maryland 21205, United States
2 Consortium for Environmental Risk Management (CERM), Hallowell, Maine 04347, United States
*Correspondence:
Alexandra Maertens
amaerte1@jhu.edu

Computes fingerprint similarity matrices and extracts similar pairs.
Supports Morgan, RDKit, AtomPair, MACCS Keys, and PubChem fingerprints.
"""



In [None]:
# ============================================================================
# SECTION 1: Install Dependencies (Run this first in Colab)
# ============================================================================

# Uncomment and run if packages are not installed
!pip install rdkit



Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


In [None]:
# ============================================================================
# SECTION 2: Import Libraries
# ============================================================================

import os
from base64 import b64decode
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.DataStructs import ExplicitBitVect
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
sns.set_style("whitegrid")

print("✓ All libraries imported successfully!")



✓ All libraries imported successfully!


In [None]:
# ============================================================================
# SECTION 3: Load Your Data File
# ============================================================================

# Colab-only upload helper; kept in this cell because it is unavailable outside Colab.
from google.colab import files
import io

print("Upload your CSV file:")
print("Note: Your CSV must have either:")
print("  - A 'smiles' column for most fingerprint types")
print("  - A base64-encoded PubChem fingerprint column for PubChem type")
print("="*60)

# files.upload() returns a dict mapping each uploaded file name to its bytes.
uploaded = files.upload()

# Cancelling the upload dialog yields an empty dict; fail with a clear message
# instead of an opaque IndexError on the next line.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
filename = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[filename]))

print(f"\n✓ Data loaded: {len(df)} compounds")
print(f"  Columns: {list(df.columns)}")
print(f"\nFirst few rows of your data:")
display(df.head())



Upload your CSV file:
Note: Your CSV must have either:
  - A 'smiles' column for most fingerprint types
  - A base64-encoded PubChem fingerprint column for PubChem type


Saving df_test.csv to df_test.csv

✓ Data loaded: 235 compounds
  Columns: ['InChIKey', 'DATATYPE', 'VALUE', 'H317', 'smiles', 'Moleculer_weight', 'TPSA', 'CID', 'BOILING_POINT_DEGC_OPERA_PRED', 'OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED', 'MELTING_POINT_DEGC_OPERA_PRED', 'VAPOR_PRESSURE_MMHG_OPERA_PRED', 'WATER_SOLUBILITY_MOL/L_OPERA_PRED', 'IUPAC NAME', 'PUBCHEMFP', 'SUPER CLASS', 'CLASS', 'SUBCLASS', 'Alert for Acyl Transfer Agent', 'Alert For Micheal Acceptors', 'Alert for SN2', 'Alert for SNAR', 'Alert for Schiff base', 'No Skin Sensitization', 'Number of alert', 'HOMO', 'LUMO', 'Pubchem Bits', 'pubchem Bit Vector']

First few rows of your data:


Unnamed: 0,InChIKey,DATATYPE,VALUE,H317,smiles,Moleculer_weight,TPSA,CID,BOILING_POINT_DEGC_OPERA_PRED,OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED,...,Alert For Micheal Acceptors,Alert for SN2,Alert for SNAR,Alert for Schiff base,No Skin Sensitization,Number of alert,HOMO,LUMO,Pubchem Bits,pubchem Bit Vector
0,BDAGIHXWWSANSR-UHFFFAOYSA-N,in vivo,0,0,C(=O)O,46.025,37.3,284,2.00467,-0.54,...,0,0,0,0,Yes,0,-0.1469,0.0998,0000000000000000001100000000000000000000000000...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0...
1,SALNWJHEVGXTER-UHFFFAOYSA-N,in vivo,1,0,COC(=O)C(CCN1C=NC2=C1N=C(N=C2Cl)N)C(=O)OC,327.728,122.22,9949313,2.507146,0.39,...,0,0,1,0,No,1,-0.1331,0.0401,1100000001110011101110000000000000000100000000...,[1 1 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 1 1 1 0 0...
2,JQZAEUFPPSRDOP-UHFFFAOYSA-N,in vivo,1,1,C1=CC(=CC=C1CCl)Cl,161.031,0.0,7723,2.33969,3.18,...,0,1,0,0,No,1,-0.2027,0.0534,1000000001100000000000000000000000000110000000...,[1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0...
3,SATCULPHIDQDRE-UHFFFAOYSA-N,in vivo,1,1,C1OC2=C(O1)C=C(C=C2)C=O,150.133,35.53,8438,2.420045,1.05,...,0,0,0,1,No,1,-0.1345,0.0396,1000000001110000001100000000000000000000000000...,[1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0...
4,CMKBCTPCXZNQKX-UHFFFAOYSA-N,in vivo,1,1,C1CCC(CC1)S,116.229,0.0,15290,2.201036,2.6,...,0,0,0,0,Yes,0,-0.0997,0.2037,1100000001100000000000000000000001000000000000...,[1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0...


In [None]:
# ============================================================================
# SECTION 4: Configuration Parameters
# ============================================================================

# Configure your analysis here
CONFIG = {
    # Fingerprint settings
    'fp_type': 'maccs',  # Options: 'morgan', 'rdkit', 'atompair', 'maccs', 'pubchem'
    'morgan_radius': 3,   # Morgan fingerprint radius
    'n_bits': 512,        # Fingerprint size (not used for MACCS or PubChem)

    # Column names
    'id_column': 'CID',   # Column name for compound IDs
    'smiles_column': 'smiles',  # SMILES column name
    'pubchem_column': 'PUBCHEMFP',  # PubChem fingerprint column (if using)

    # Similarity settings
    'threshold': 0.7,     # Minimum similarity to extract pairs

    # Output settings
    'output_dir': 'similarity_results'
}

print("\n✓ Configuration set:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

# Fail fast on a mistyped fingerprint type rather than after SMILES parsing.
supported_fp_types = ('morgan', 'rdkit', 'atompair', 'maccs', 'pubchem')
if CONFIG['fp_type'] not in supported_fp_types:
    raise ValueError(
        f"Unknown fingerprint type: {CONFIG['fp_type']} "
        f"(expected one of {', '.join(supported_fp_types)})"
    )

# Tanimoto similarity lies in [0, 1]; a threshold outside that range is a typo.
if not 0.0 <= CONFIG['threshold'] <= 1.0:
    raise ValueError(f"Threshold must be between 0 and 1, got {CONFIG['threshold']}")

# Validate required columns: the input column depends on the fingerprint type.
if CONFIG['fp_type'] == 'pubchem':
    if CONFIG['pubchem_column'] not in df.columns:
        raise ValueError(f"PubChem fingerprint column '{CONFIG['pubchem_column']}' not found!")
    print(f"\n✓ PubChem fingerprint column found: '{CONFIG['pubchem_column']}'")
else:
    if CONFIG['smiles_column'] not in df.columns:
        raise ValueError(f"Column '{CONFIG['smiles_column']}' not found!")
    print(f"\n✓ SMILES column found: '{CONFIG['smiles_column']}'")

if CONFIG['id_column'] not in df.columns:
    raise ValueError(f"ID column '{CONFIG['id_column']}' not found!")
print(f"✓ ID column found: '{CONFIG['id_column']}'")




✓ Configuration set:
  fp_type: maccs
  morgan_radius: 3
  n_bits: 512
  id_column: CID
  smiles_column: smiles
  pubchem_column: PUBCHEMFP
  threshold: 0.7
  output_dir: similarity_results

✓ SMILES column found: 'smiles'
✓ ID column found: 'CID'


In [None]:
# ============================================================================
# SECTION 5: Core Fingerprint Functions
# ============================================================================

def decode_pubchem_fp(pcfp_base64):
    """Decode a base64-encoded PubChem fingerprint into an 881-character bit string.

    The decoded bytes are expanded to a string of '0'/'1' characters; the first
    32 bits are skipped and the following 881 bits are kept.

    Args:
        pcfp_base64: Base64 text (str or bytes-like) holding the fingerprint.

    Returns:
        A string of exactly 881 '0'/'1' characters, or None when the input is
        not valid base64 or decodes to too few bits. Returning None for short
        payloads (instead of a truncated string) keeps every fingerprint the
        same length, which pairwise similarity calculations require.
    """
    try:
        raw_bytes = b64decode(pcfp_base64)
    except (TypeError, ValueError):
        # TypeError: non-string input (e.g. None); ValueError covers
        # binascii.Error (bad padding) and non-ASCII text.
        return None

    binary_str = "".join("{:08b}".format(byte) for byte in raw_bytes)[32:913]
    if len(binary_str) != 881:
        return None
    return binary_str


def convert_bitstring_to_bitvect(bitstring):
    """Build an RDKit ExplicitBitVect from a string of '0'/'1' characters.

    The vector has one bit per character; positions holding '1' are switched
    on and all other positions stay off. None is passed through unchanged.
    """
    if bitstring is not None:
        vect = ExplicitBitVect(len(bitstring))
        on_positions = [pos for pos, char in enumerate(bitstring) if char == '1']
        for pos in on_positions:
            vect.SetBit(pos)
        return vect

    return None


def generate_fingerprints(df, fp_type='morgan', radius=3, n_bits=512, pubchem_column=None,
                          smiles_column=None):
    """Generate molecular fingerprints from SMILES or decode PubChem FPs.

    Args:
        df: Input DataFrame; it is copied, never modified in place.
        fp_type: One of 'morgan', 'rdkit', 'atompair', 'maccs', 'pubchem'.
        radius: Morgan radius (used only when fp_type == 'morgan').
        n_bits: Bit-vector length for 'morgan', 'rdkit' and 'atompair'.
        pubchem_column: Column with base64 PubChem fingerprints
            (required when fp_type == 'pubchem').
        smiles_column: Column holding SMILES strings. When omitted, the
            notebook-level CONFIG['smiles_column'] is used if CONFIG exists,
            otherwise 'smiles'.

    Returns:
        A copy of the rows for which a fingerprint could be produced, with a
        'Fingerprint' column added (plus 'Molecule' for SMILES-based types or
        'PubChem_BitString' for PubChem).

    Raises:
        ValueError: unknown fp_type, missing pubchem_column, or a required
            column that is absent from df.
    """
    # Reject a mistyped fingerprint type before doing any expensive parsing.
    supported_types = ('morgan', 'rdkit', 'atompair', 'maccs', 'pubchem')
    if fp_type not in supported_types:
        raise ValueError(f"Unknown fingerprint type: {fp_type}")

    print(f"\n{'='*60}")
    print(f"GENERATING {fp_type.upper()} FINGERPRINTS")
    print(f"{'='*60}")

    df = df.copy()

    # Handle PubChem fingerprints
    if fp_type == 'pubchem':
        if pubchem_column is None:
            raise ValueError("pubchem_column must be specified for PubChem fingerprints")

        if pubchem_column not in df.columns:
            raise ValueError(f"Column '{pubchem_column}' not found")

        print(f"Decoding PubChem fingerprints from: {pubchem_column}")

        # astype(str) turns missing values into 'nan', which fails to decode
        # and is therefore counted as invalid below.
        df['PubChem_BitString'] = df[pubchem_column].astype(str).apply(decode_pubchem_fp)
        df['Fingerprint'] = df['PubChem_BitString'].apply(convert_bitstring_to_bitvect)

        valid_count = df['Fingerprint'].notna().sum()
        invalid_count = df['Fingerprint'].isna().sum()

        print(f"  Valid fingerprints: {valid_count}")
        print(f"  Invalid fingerprints: {invalid_count}")
        print(f"  Fingerprint size: 881 bits")

        return df[df['Fingerprint'].notna()].copy()

    # Resolve the SMILES column explicitly instead of silently depending on a
    # global; the CONFIG fallback keeps existing callers working unchanged.
    if smiles_column is None:
        smiles_column = globals().get('CONFIG', {}).get('smiles_column', 'smiles')
    if smiles_column not in df.columns:
        raise ValueError(f"Column '{smiles_column}' not found")

    # Generate molecules from SMILES
    print("Parsing SMILES strings...")
    df['Molecule'] = df[smiles_column].apply(
        lambda x: Chem.MolFromSmiles(x) if pd.notna(x) else None
    )

    valid_count = df['Molecule'].notna().sum()
    invalid_count = df['Molecule'].isna().sum()
    print(f" Valid molecules: {valid_count}")
    print(f" Invalid SMILES: {invalid_count}")

    # Generate fingerprints
    print(f"Generating {fp_type} fingerprints...")

    # fp_type -> (per-molecule builder, size message). Lambdas defer the RDKit
    # name lookups until a molecule is actually processed.
    builders = {
        'morgan': (
            lambda mol: AllChem.GetMorganFingerprintAsBitVect(
                mol, nBits=n_bits, radius=radius, useFeatures=True),
            f"  Fingerprint size: {n_bits} bits (radius={radius})",
        ),
        'rdkit': (
            lambda mol: Chem.RDKFingerprint(mol, fpSize=n_bits),
            f"  Fingerprint size: {n_bits} bits",
        ),
        'atompair': (
            lambda mol: AllChem.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=n_bits),
            f"  Fingerprint size: {n_bits} bits",
        ),
        'maccs': (
            lambda mol: MACCSkeys.GenMACCSKeys(mol),
            f"  Fingerprint size: 167 bits (MACCS Keys)",
        ),
    }
    build_fp, size_message = builders[fp_type]

    # Unparseable SMILES (Molecule is None) get no fingerprint and are dropped below.
    df['Fingerprint'] = df['Molecule'].apply(
        lambda x: build_fp(x) if x is not None else None
    )
    print(size_message)

    df_valid = df[df['Fingerprint'].notna()].copy()
    print(f"   Generated: {len(df_valid)} fingerprints")

    return df_valid


def compute_tanimoto_matrix(df, id_column='CID'):
    """Build the symmetric all-vs-all Tanimoto similarity table.

    Reads fingerprints from df['Fingerprint'] and labels from df[id_column];
    returns a square DataFrame indexed and columned by those labels, with 1.0
    on the diagonal.
    """
    print(f"\n{'='*60}")
    print("COMPUTING TANIMOTO SIMILARITY MATRIX")
    print(f"{'='*60}")

    ids = df[id_column].tolist()
    fingerprints = df['Fingerprint'].tolist()
    n = len(fingerprints)

    print(f"Matrix size: {n} x {n} = {n*n:,} comparisons")

    # Identity start: self-similarity is 1.0, everything else is filled below.
    sim_matrix = np.eye(n)

    print("Computing similarities...")
    for i, query_fp in enumerate(fingerprints):
        if i > 0 and i % 100 == 0:
            progress = (i / n) * 100
            print(f"  Progress: {progress:.1f}% ({i}/{n})")

        # Compare against the later fingerprints only, then mirror the row
        # into the matching column to keep the matrix symmetric.
        row_sims = DataStructs.BulkTanimotoSimilarity(query_fp, fingerprints[i+1:])
        sim_matrix[i, i+1:] = row_sims
        sim_matrix[i+1:, i] = row_sims

    print("  Similarity matrix complete!")

    return pd.DataFrame(sim_matrix, index=ids, columns=ids)


def flatten_similarity_matrix(sim_df, threshold=0.7):
    """Extract pairwise similarities above threshold.

    Args:
        sim_df: Square similarity DataFrame whose columns are in the same
            order as its index (as produced by compute_tanimoto_matrix).
        threshold: Minimum similarity (inclusive) for a pair to be kept.

    Returns:
        DataFrame with columns 'Chemical_x', 'Chemical_y', 'Similarity', one
        row per unordered pair from the strict upper triangle, sorted by
        similarity in descending order.
    """
    print(f"\n{'='*60}")
    print(f"EXTRACTING PAIRS (threshold >= {threshold})")
    print(f"{'='*60}")

    # Work positionally on the underlying array: this avoids one label lookup
    # per pair and stays correct when the same ID labels more than one row
    # (label-based .loc would return a frame instead of a scalar there).
    values = sim_df.to_numpy()
    labels = sim_df.index.to_numpy()

    # Upper triangle only (k=1 skips the diagonal), in row-major order.
    row_pos, col_pos = np.triu_indices(len(labels), k=1)
    similarities = values[row_pos, col_pos]
    keep = similarities >= threshold

    pair_df = pd.DataFrame({
        'Chemical_x': labels[row_pos[keep]],
        'Chemical_y': labels[col_pos[keep]],
        'Similarity': similarities[keep],
    })
    pair_df = pair_df.sort_values('Similarity', ascending=False).reset_index(drop=True)

    print(f"  Found {len(pair_df)} pairs above threshold")

    return pair_df


# Last statement of the cell: confirms the definitions above were executed.
print(" All fingerprint functions defined!")



✓ All fingerprint functions defined!


In [None]:
# ============================================================================
# SECTION 6: Generate Fingerprints
# ============================================================================

# Make sure the results folder exists before any later cell writes into it
os.makedirs(CONFIG['output_dir'], exist_ok=True)

# Collect the fingerprint options from CONFIG, then build the fingerprints
fp_options = dict(
    fp_type=CONFIG['fp_type'],
    radius=CONFIG['morgan_radius'],
    n_bits=CONFIG['n_bits'],
    pubchem_column=CONFIG['pubchem_column'],
)
df_fp = generate_fingerprints(df, **fp_options)

print(f"\n Ready for similarity analysis: {len(df_fp)} compounds")




GENERATING MACCS FINGERPRINTS
Parsing SMILES strings...
  ✓ Valid molecules: 235
  ⚠️  Invalid SMILES: 0
Generating maccs fingerprints...
  Fingerprint size: 167 bits (MACCS Keys)
  ✓ Generated: 235 fingerprints

✓ Ready for similarity analysis: 235 compounds


In [None]:
# ============================================================================
# SECTION 7: Compute Similarity Matrix
# ============================================================================

sim_df = compute_tanimoto_matrix(df_fp, id_column=CONFIG['id_column'])

# Write the full matrix to the output folder, named after the fingerprint type
matrix_name = f"similarity_matrix_{CONFIG['fp_type']}.csv"
matrix_file = os.path.join(CONFIG['output_dir'], matrix_name)
sim_df.to_csv(matrix_file)
print(f"\n✓ Similarity matrix saved to: {matrix_file}")

# Preview only the top-left 10 x 10 corner
print("\nSample of similarity matrix:")
matrix_preview = sim_df.iloc[:10, :10]
display(matrix_preview)




COMPUTING TANIMOTO SIMILARITY MATRIX
Matrix size: 235 x 235 = 55,225 comparisons
Computing similarities...
  Progress: 42.6% (100/235)
  Progress: 85.1% (200/235)
  ✓ Similarity matrix complete!

✓ Similarity matrix saved to: similarity_results/similarity_matrix_maccs.csv

Sample of similarity matrix:


Unnamed: 0,284,9949313,7723,8438,15290,16299,164362,3024003,4075,7704
284,1.0,0.089286,0.0,0.208333,0.0,0.2,0.107143,0.08,0.230769,0.208333
9949313,0.089286,1.0,0.163636,0.258065,0.063492,0.25,0.15942,0.169231,0.265625,0.2
7723,0.0,0.163636,1.0,0.142857,0.105263,0.147059,0.133333,0.153846,0.09375,0.066667
8438,0.208333,0.258065,0.142857,1.0,0.060606,0.358974,0.2,0.294118,0.324324,0.314286
15290,0.0,0.063492,0.105263,0.060606,1.0,0.135135,0.233333,0.222222,0.055556,0.166667
16299,0.2,0.25,0.147059,0.358974,0.135135,1.0,0.617647,0.416667,0.435897,0.514286
164362,0.107143,0.15942,0.133333,0.2,0.233333,0.617647,1.0,0.393939,0.307692,0.454545
3024003,0.08,0.169231,0.153846,0.294118,0.222222,0.416667,0.393939,1.0,0.146341,0.466667
4075,0.230769,0.265625,0.09375,0.324324,0.055556,0.435897,0.307692,0.146341,1.0,0.225
7704,0.208333,0.2,0.066667,0.314286,0.166667,0.514286,0.454545,0.466667,0.225,1.0


In [None]:
# ============================================================================
# SECTION 8: Extract Similar Pairs
# ============================================================================

pair_df = flatten_similarity_matrix(sim_df, threshold=CONFIG['threshold'])

if pair_df.empty:
    # Nothing reached the threshold, so no pairs file is written.
    print(f"\n⚠️  No pairs found with similarity >= {CONFIG['threshold']}")
else:
    # Persist the pairs next to the similarity matrix
    pairs_name = f"similarity_pairs_{CONFIG['fp_type']}.csv"
    pairs_file = os.path.join(CONFIG['output_dir'], pairs_name)
    pair_df.to_csv(pairs_file, index=False)
    print(f"\n✓ Similar pairs saved to: {pairs_file}")

    # pair_df is already sorted by descending similarity, so head() is the top
    print(f"\n{'='*60}")
    print("TOP 20 MOST SIMILAR PAIRS")
    print(f"{'='*60}")
    display(pair_df.head(20))




EXTRACTING PAIRS (threshold >= 0.7)
  ✓ Found 201 pairs above threshold

✓ Similar pairs saved to: similarity_results/similarity_pairs_maccs.csv

TOP 20 MOST SIMILAR PAIRS


Unnamed: 0,Chemical_x,Chemical_y,Similarity
0,8048,8918,1.0
1,8048,8164,1.0
2,60985,9017,1.0
3,8918,8164,1.0
4,7803,7762,1.0
5,20959,8710,1.0
6,7704,16821,0.958333
7,85335,169361,0.956522
8,8048,7803,0.95
9,7803,8164,0.95


In [None]:
# ============================================================================
# SECTION 9: Similarity Distribution Visualization
# ============================================================================

print(f"\n{'='*60}")
print("GENERATING VISUALIZATIONS")
print(f"{'='*60}")

# Every unordered compound pair appears once in the strict upper triangle
matrix_values = sim_df.values
sim_values = matrix_values[np.triu_indices_from(matrix_values, k=1)]
mean_sim = np.mean(sim_values)
median_sim = np.median(sim_values)

# 2 x 2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax1, ax2), (ax3, ax4) = axes

# Top-left: histogram with mean and median markers
ax1.hist(sim_values, bins=50, edgecolor='k', alpha=0.7, color='steelblue')
ax1.axvline(mean_sim, color='r', linestyle='--', linewidth=2, label=f'Mean: {mean_sim:.3f}')
ax1.axvline(median_sim, color='g', linestyle='--', linewidth=2, label=f'Median: {median_sim:.3f}')
ax1.set_title('Distribution of Similarity Scores', fontsize=12, fontweight='bold')
ax1.set_xlabel('Tanimoto Similarity', fontsize=11)
ax1.set_ylabel('Count', fontsize=11)
ax1.grid(True, alpha=0.3)
ax1.legend()

# Top-right: empirical cumulative distribution, in percent
sorted_sims = np.sort(sim_values)
n_pairs = len(sorted_sims)
cumulative = np.arange(1, n_pairs + 1) / n_pairs * 100
ax2.plot(sorted_sims, cumulative, linewidth=2, color='darkblue')
ax2.axvline(CONFIG['threshold'], color='r', linestyle='--', linewidth=2,
            label=f"Threshold: {CONFIG['threshold']}")
ax2.set_title('Cumulative Distribution', fontsize=12, fontweight='bold')
ax2.set_xlabel('Tanimoto Similarity', fontsize=11)
ax2.set_ylabel('Cumulative Percentage (%)', fontsize=11)
ax2.grid(True, alpha=0.3)
ax2.legend()

# Bottom-left: box plot of all pairwise similarities
box_style = dict(facecolor='lightblue', alpha=0.7)
median_style = dict(color='red', linewidth=2)
ax3.boxplot(sim_values, vert=True, patch_artist=True,
            boxprops=box_style, medianprops=median_style)
ax3.set_title('Similarity Score Distribution', fontsize=12, fontweight='bold')
ax3.set_ylabel('Tanimoto Similarity', fontsize=11)
ax3.grid(True, alpha=0.3, axis='y')

# Bottom-right: number of pairs retained at each candidate threshold
thresholds = np.arange(0.5, 1.0, 0.05)
pair_counts = [np.sum(sim_values >= thresh) for thresh in thresholds]
ax4.plot(thresholds, pair_counts, marker='o', linewidth=2, markersize=6, color='darkgreen')
ax4.axvline(CONFIG['threshold'], color='r', linestyle='--', linewidth=2,
            label=f"Selected: {CONFIG['threshold']}")
ax4.set_title('Pairs Above Threshold', fontsize=12, fontweight='bold')
ax4.set_xlabel('Similarity Threshold', fontsize=11)
ax4.set_ylabel('Number of Pairs', fontsize=11)
ax4.grid(True, alpha=0.3)
ax4.legend()

plt.tight_layout()

# Write the figure to the output folder before showing it
plot_name = f"similarity_distribution_{CONFIG['fp_type']}.png"
plot_file = os.path.join(CONFIG['output_dir'], plot_name)
fig.savefig(plot_file, dpi=300, bbox_inches='tight')
print(f"\n Visualization saved to: {plot_file}")
plt.show()



In [None]:
# ============================================================================
# SECTION 10: Heatmap Visualization (for smaller datasets)
# ============================================================================

# A heatmap is only drawn for matrices of at most 50 compounds
if len(sim_df) > 50:
    print(f"\n Dataset too large for heatmap ({len(sim_df)} compounds)")
    print("   Heatmap visualization skipped (recommended for ≤50 compounds)")
else:
    print(f"\n{'='*60}")
    print("GENERATING SIMILARITY HEATMAP")
    print(f"{'='*60}")

    # Fixed 0-1 colour scale so the colours mean the same across runs
    heatmap_options = dict(
        cmap='RdYlBu_r',
        vmin=0,
        vmax=1,
        square=True,
        cbar_kws={'label': 'Tanimoto Similarity'},
    )

    plt.figure(figsize=(12, 10))
    sns.heatmap(sim_df, **heatmap_options)
    plt.title(f'Tanimoto Similarity Matrix ({CONFIG["fp_type"]})',
              fontsize=14, fontweight='bold')
    plt.xlabel('Compound ID', fontsize=11)
    plt.ylabel('Compound ID', fontsize=11)

    heatmap_name = f"similarity_heatmap_{CONFIG['fp_type']}.png"
    heatmap_file = os.path.join(CONFIG['output_dir'], heatmap_name)
    plt.savefig(heatmap_file, dpi=300, bbox_inches='tight')
    print(f"✓ Heatmap saved to: {heatmap_file}")
    plt.show()




⚠️  Dataset too large for heatmap (235 compounds)
   Heatmap visualization skipped (recommended for ≤50 compounds)


In [None]:
# ============================================================================
# SECTION 11: Summary Statistics
# ============================================================================

print(f"\n{'='*60}")
print("SUMMARY STATISTICS")
print(f"{'='*60}")

# One (metric, value) row per line; counts stay integers, while similarity
# statistics are pre-formatted to four decimal places.
summary_rows = [
    ('Total Compounds', len(df)),
    ('Valid Fingerprints', len(df_fp)),
    ('Pairwise Comparisons', len(sim_values)),
    ('Mean Similarity', f"{np.mean(sim_values):.4f}"),
    ('Median Similarity', f"{np.median(sim_values):.4f}"),
    ('Min Similarity', f"{np.min(sim_values):.4f}"),
    ('Max Similarity', f"{np.max(sim_values):.4f}"),
    ('Std Deviation', f"{np.std(sim_values):.4f}"),
    (f'Pairs ≥ {CONFIG["threshold"]}', len(pair_df)),
    ('25th Percentile', f"{np.percentile(sim_values, 25):.4f}"),
    ('75th Percentile', f"{np.percentile(sim_values, 75):.4f}"),
]

stats_df = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])
display(stats_df)

# Store the table alongside the other results
summary_file = os.path.join(CONFIG['output_dir'], 'summary_statistics.csv')
stats_df.to_csv(summary_file, index=False)
print(f"\n✓ Summary statistics saved to: {summary_file}")




SUMMARY STATISTICS


Unnamed: 0,Metric,Value
0,Total Compounds,235.0
1,Valid Fingerprints,235.0
2,Pairwise Comparisons,27495.0
3,Mean Similarity,0.1169
4,Median Similarity,0.1026
5,Min Similarity,0.0
6,Max Similarity,1.0
7,Std Deviation,0.0777
8,Pairs ≥ 0.7,14.0
9,25th Percentile,0.0645



✓ Summary statistics saved to: similarity_results/summary_statistics.csv


In [None]:
# ============================================================================
# SECTION 12: Download Results
# ============================================================================

print(f"\n{'='*60}")
print("DOWNLOAD YOUR RESULTS")
print(f"{'='*60}")

# Create zip file containing everything in the output directory
import shutil
zip_filename = 'similarity_analysis_results'
shutil.make_archive(zip_filename, 'zip', CONFIG['output_dir'])

print(f"\n✓ All results compressed to: {zip_filename}.zip")
print(f"\nFiles included:")
# List what is actually in the directory: the archive holds the whole folder,
# including files left over from earlier runs with other fingerprint types.
for result_name in sorted(os.listdir(CONFIG['output_dir'])):
    print(f"  - {result_name}")

print(f"\nTo download, uncomment and run:")
print(f"# from google.colab import files")
print(f"# files.download('{zip_filename}.zip')")

# Uncomment to automatically download.
# (Kept as comments rather than a bare string literal, which would be echoed
# as this cell's output because it was the cell's last expression.)
# from google.colab import files
# files.download(f'{zip_filename}.zip')



In [None]:
# ============================================================================
# SECTION 13: Final Summary
# ============================================================================

# Headline numbers gathered from the earlier cells
key_results = [
    f"  • Analyzed {len(df_fp)} compounds",
    f"  • Fingerprint type: {CONFIG['fp_type'].upper()}",
    f"  • Mean similarity: {np.mean(sim_values):.3f}",
    f"  • Pairs above threshold: {len(pair_df)}",
    f"  • Results saved to: {CONFIG['output_dir']}/",
]

print(f"\n{'='*60}")
print("ANALYSIS COMPLETE! 🎉")
print(f"{'='*60}")

print(f"\n📊 Key Results:")
for result_line in key_results:
    print(result_line)

print(f"\n{'='*60}")