<a href="https://colab.research.google.com/github/DaniChinwendu/skin-sensitization-prediction/blob/main/Molecular_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
OPTIMIZING SKIN SENSITIZATION PREDICTION: A COMPARATIVE ANALYSIS OF KNN VS RANDOM FOREST

Conjugation and Aromaticity Analysis - Google Colab Notebook

Daniel C. Ukaegbu 1, Karolina Kopanska 1, Peter Ranslow 2, Thomas Hartung 1, Alexandra Maertens 1*
1 Center for Alternatives to Animal Testing (CAAT), Johns Hopkins Bloomberg School of Public Health, Baltimore, Maryland 21205, United States
2 Consortium for Environmental Risk Management (CERM), Hallowell, Maine 04347, United States
*Correspondence:
Alexandra Maertens
amaerte1@jhu.edu

Input:  CSV with a 'smiles' column
Output: 'conjugation_analysis_results.csv', 'molecular_analysis_results.csv',
        'conjugation_analysis_plots.png' and 'summary_statistics.csv'
"""

In [None]:

# ============================================================================
# SECTION 1: Install Dependencies (Run this first in Colab)
# ============================================================================

# Uncomment and run if packages are not installed
!pip install rdkit



Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


In [None]:
# ============================================================================
# SECTION 2: Import Libraries
# ============================================================================

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
import warnings
# NOTE(review): with no category or module argument this filter silences every
# Python warning for the rest of the session; narrow it if warnings are needed.
warnings.filterwarnings('ignore')

# Confirmation message shown once the imports above have succeeded.
print("✓ All libraries imported successfully!")



✓ All libraries imported successfully!


In [None]:
# ============================================================================
# SECTION 3: Load Your Data File
# ============================================================================

from google.colab import files
import io

print("Upload your CSV file containing SMILES strings:")
print("Note: Your CSV must have a 'smiles' column")
print("="*60)

# The upload result is used below as a mapping of filename -> raw file bytes.
uploaded = files.upload()
if not uploaded:
    # Cancelling the dialog yields an empty mapping; fail with a clear message
    # rather than an IndexError on the first-key lookup.
    raise ValueError("❌ Error: No file was uploaded. Re-run this cell and select a CSV file.")

# Only the first uploaded file is analysed.
filename = next(iter(uploaded))
if len(uploaded) > 1:
    print(f"\n⚠ {len(uploaded)} files uploaded; only '{filename}' will be analysed")
df = pd.read_csv(io.BytesIO(uploaded[filename]))

print(f"\n✓ Data loaded: {len(df)} compounds")
print(f"  Columns: {list(df.columns)}")

# Validate SMILES column: every later section reads df["smiles"].
if 'smiles' not in df.columns:
    raise ValueError("❌ Error: Input CSV must contain a 'smiles' column")
else:
    print("✓ 'smiles' column found")

print(f"\nFirst few rows of your data:")
display(df.head())



In [None]:
# ============================================================================
# SECTION 4: Core Analysis Functions
# ============================================================================

def find_conjugation_chains(mol):
    """Collect the indices of every aromatic or double bond in a molecule.

    This is a heuristic: no path tracing is performed, so all qualifying bonds
    are reported together as one group, whether or not they are connected.

    Args:
        mol: Molecule object exposing ``GetBonds()``, or ``None``.

    Returns:
        ``[]`` when ``mol`` is ``None`` or no bond qualifies; otherwise a
        one-element list whose only entry is the list of qualifying bond
        indices, in bond iteration order.
    """
    if mol is None:
        return []
    pi_bond_ids = [
        bond.GetIdx()
        for bond in mol.GetBonds()
        if bond.GetIsAromatic() or bond.GetBondType() == Chem.BondType.DOUBLE
    ]
    if not pi_bond_ids:
        return []
    return [pi_bond_ids]


def _empty_conjugation_features():
    """Return the feature record reported when no molecule could be built."""
    return {
        "valid_mol": False,
        "has_conjugated_system": False,
        "aromatic_conjugation": False,
        "linear_conjugation": False,
        "extended_conjugation": False,
        "conjugated_double_bonds": 0,
        "aromatic_rings": 0,
        "total_rings": 0,
        "conjugation_bond_count": 0,
        "aromatic_atoms_count": 0,
        "non_aromatic_double_bonds": 0,
    }


def _double_bonds_are_conjugated(mol, bond_a, bond_b):
    """Return True when two double bonds are joined by exactly one single bond."""
    ends_a = (bond_a.GetBeginAtomIdx(), bond_a.GetEndAtomIdx())
    ends_b = (bond_b.GetBeginAtomIdx(), bond_b.GetEndAtomIdx())
    if set(ends_a) & set(ends_b):
        # Double bonds sharing an atom are cumulated/geminal (C=C=C, S(=O)=O,
        # N=C=O); they are not a conjugated X=X-X=X arrangement.
        return False
    for atom_a in ends_a:
        for atom_b in ends_b:
            linker = mol.GetBondBetweenAtoms(atom_a, atom_b)
            if linker is not None and linker.GetBondType() == Chem.BondType.SINGLE:
                return True
    return False


def detect_conjugated_systems(smiles):
    """Detect conjugation patterns for a single SMILES string.

    These are heuristic structural features for ML/prediction, not rigorous
    quantum chemical conjugation definitions.

    Args:
        smiles: SMILES string. Any non-string value (for example the NaN that
            pandas produces for a blank CSV cell) is treated as an invalid
            molecule instead of being passed to the parser.

    Returns:
        dict with the keys ``valid_mol``, ``has_conjugated_system``,
        ``aromatic_conjugation``, ``linear_conjugation``,
        ``extended_conjugation``, ``conjugated_double_bonds``,
        ``aromatic_rings``, ``total_rings``, ``conjugation_bond_count``,
        ``aromatic_atoms_count`` and ``non_aromatic_double_bonds``.
        For input that cannot be parsed, ``valid_mol`` is False and every other
        entry is False/0.
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return _empty_conjugation_features()

    aromatic_rings = Descriptors.NumAromaticRings(mol)
    total_rings = Descriptors.RingCount(mol)

    # Any aromatic atom is taken as evidence of aromatic conjugation.
    aromatic_atoms = [atom for atom in mol.GetAtoms() if atom.GetIsAromatic()]
    aromatic_conjugation = len(aromatic_atoms) > 0

    # Non-aromatic double bonds
    double_bonds = [
        bond for bond in mol.GetBonds()
        if bond.GetBondType() == Chem.BondType.DOUBLE and not bond.GetIsAromatic()
    ]

    # Linear conjugation: count pairs of non-aromatic double bonds separated by
    # exactly one single bond (e.g. C=C-C=C or C=C-C=O). Pairs that merely share
    # an atom are cumulated, not conjugated, and are therefore not counted.
    conjugated_double_bonds = 0
    for position, first in enumerate(double_bonds):
        for second in double_bonds[position + 1:]:
            if _double_bonds_are_conjugated(mol, first, second):
                conjugated_double_bonds += 1
    linear_conjugation = conjugated_double_bonds > 0

    # "Extended" feature: global count of aromatic + double bonds (>= 3).
    # This is a whole-molecule count, not the length of a connected path.
    chains = find_conjugation_chains(mol)
    conjugation_bond_count = max((len(chain) for chain in chains), default=0)
    extended_conjugation = conjugation_bond_count >= 3

    has_conjugated_system = aromatic_conjugation or linear_conjugation

    return {
        "valid_mol": True,
        "has_conjugated_system": has_conjugated_system,
        "aromatic_conjugation": aromatic_conjugation,
        "linear_conjugation": linear_conjugation,
        "extended_conjugation": extended_conjugation,
        "conjugated_double_bonds": conjugated_double_bonds,
        "aromatic_rings": aromatic_rings,
        "total_rings": total_rings,
        "conjugation_bond_count": conjugation_bond_count,
        "aromatic_atoms_count": len(aromatic_atoms),
        "non_aromatic_double_bonds": len(double_bonds),
    }


def classify_conjugation_type(row):
    """Map a row of conjugation flags to a human-readable category.

    Args:
        row: Mapping-like object supporting ``.get`` (a dict or a pandas row)
            with the optional flags ``has_conjugated_system``,
            ``aromatic_conjugation`` and ``linear_conjugation``; missing flags
            count as False.

    Returns:
        One of "No conjugation", "Both aromatic and linear", "Aromatic only",
        "Linear only" or "Other".
    """
    if not row.get("has_conjugated_system", False):
        return "No conjugation"

    is_aromatic = bool(row.get("aromatic_conjugation", False))
    is_linear = bool(row.get("linear_conjugation", False))
    labels = {
        (True, True): "Both aromatic and linear",
        (True, False): "Aromatic only",
        (False, True): "Linear only",
        (False, False): "Other",
    }
    return labels[(is_aromatic, is_linear)]


def analyze_molecular_features(smiles):
    """Compute basic aromaticity/ring features for a SMILES string.

    Args:
        smiles: SMILES string. Any non-string value (for example the NaN that
            pandas produces for a blank CSV cell) is treated as an invalid
            molecule instead of being passed to the parser.

    Returns:
        dict with the keys ``valid_mol``, ``has_aromatic_atoms``,
        ``num_aromatic_rings``, ``total_rings``, ``has_any_rings``,
        ``aromatic_atoms_count`` and ``total_atoms``. For input that cannot be
        parsed, ``valid_mol`` is False and every other entry is False/0.
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return {
            "valid_mol": False,
            "has_aromatic_atoms": False,
            "num_aromatic_rings": 0,
            "total_rings": 0,
            "has_any_rings": False,
            "aromatic_atoms_count": 0,
            "total_atoms": 0,
        }
    aromatic_atoms = [atom for atom in mol.GetAtoms() if atom.GetIsAromatic()]
    total_rings = Descriptors.RingCount(mol)
    return {
        "valid_mol": True,
        "has_aromatic_atoms": len(aromatic_atoms) > 0,
        "num_aromatic_rings": Descriptors.NumAromaticRings(mol),
        "total_rings": total_rings,
        "has_any_rings": total_rings > 0,
        "aromatic_atoms_count": len(aromatic_atoms),
        "total_atoms": mol.GetNumAtoms(),
    }


print("✓ All analysis functions defined successfully!")





✓ All analysis functions defined successfully!


In [None]:
# ============================================================================
# SECTION 5: Conjugation Analysis
# ============================================================================

def pct(x):
    """Return x as a percentage of the module-level `total` (0.0 when `total` is 0)."""
    return 100.0 * x / total if total else 0.0


rule = "=" * 60
print(f"\n{rule}\nRUNNING CONJUGATION ANALYSIS\n{rule}")

print("\nAnalyzing conjugated systems...")
# One feature dict per compound, expanded into one column per feature and
# placed side by side with the input data.
conj_features = df["smiles"].apply(detect_conjugated_systems)
conj_df = pd.DataFrame(list(conj_features))
df_conj = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, conj_df)],
    axis=1,
)
df_conj["conjugation_type"] = conj_df.apply(classify_conjugation_type, axis=1)

# Dataset-level validity counts
total = len(df_conj)
valid = int(conj_df["valid_mol"].sum())
invalid = total - valid

print(f"\n{rule}\nDATASET OVERVIEW\n{rule}")
print(f"Total compounds: {total}")
print(f"Valid molecules: {valid} ({pct(valid):.1f}%)")
print(f"Invalid SMILES: {invalid} ({pct(invalid):.1f}%)")

# Number of compounds for which each boolean conjugation flag is set
total_conj, aromatic_conj, linear_conj, extended_conj = (
    int(conj_df[flag].sum())
    for flag in (
        "has_conjugated_system",
        "aromatic_conjugation",
        "linear_conjugation",
        "extended_conjugation",
    )
)

print(f"\n{rule}\nCONJUGATION FEATURES\n{rule}")
print(f"Compounds with conjugated systems: {total_conj} ({pct(total_conj):.1f}%)")
for feature_label, n_hits in (
    ("Aromatic conjugation", aromatic_conj),
    ("Linear conjugation", linear_conj),
    ("Extended conjugation (≥3 bonds)", extended_conj),
):
    print(f"  - {feature_label}: {n_hits} ({pct(n_hits):.1f}%)")

# Breakdown by the category assigned to each compound
print(f"\n{rule}\nCONJUGATION TYPE DISTRIBUTION\n{rule}")
type_counts = df_conj["conjugation_type"].value_counts()
for conjugation_type, count in type_counts.items():
    print(f"{conjugation_type}: {count} ({pct(count):.1f}%)")

# Compounds that contain at least one aromatic ring / at least one ring
aromatic_rings_pos = int(conj_df["aromatic_rings"].gt(0).sum())
total_rings_pos = int(conj_df["total_rings"].gt(0).sum())

print(f"\n{rule}\nRING ANALYSIS\n{rule}")
print(f"Compounds with aromatic rings: {aromatic_rings_pos} ({pct(aromatic_rings_pos):.1f}%)")
print(f"Compounds with any rings: {total_rings_pos} ({pct(total_rings_pos):.1f}%)")

# Per-ring-count histograms, aromatic rings first and then all rings
for heading, ring_column in (
    ("\nDistribution of aromatic rings:", "aromatic_rings"),
    ("\nDistribution of total rings:", "total_rings"),
):
    print(heading)
    for rings, count in conj_df[ring_column].value_counts().sort_index().items():
        print(f"  {rings} ring(s): {count} compounds ({pct(count):.1f}%)")

# Persist the input data together with the conjugation features
df_conj.to_csv('conjugation_analysis_results.csv', index=False)
print(f"\n{rule}\n✓ Conjugation analysis results saved to: conjugation_analysis_results.csv\n{rule}")


RUNNING CONJUGATION ANALYSIS

Analyzing conjugated systems...

DATASET OVERVIEW
Total compounds: 235
Valid molecules: 235 (100.0%)
Invalid SMILES: 0 (0.0%)

CONJUGATION FEATURES
Compounds with conjugated systems: 106 (45.1%)
  - Aromatic conjugation: 105 (44.7%)
  - Linear conjugation: 11 (4.7%)
  - Extended conjugation (≥3 bonds): 118 (50.2%)

CONJUGATION TYPE DISTRIBUTION
No conjugation: 129 (54.9%)
Aromatic only: 95 (40.4%)
Both aromatic and linear: 10 (4.3%)
Linear only: 1 (0.4%)

RING ANALYSIS
Compounds with aromatic rings: 105 (44.7%)
Compounds with any rings: 151 (64.3%)

Distribution of aromatic rings:
  0 ring(s): 130 compounds (55.3%)
  1 ring(s): 76 compounds (32.3%)
  2 ring(s): 23 compounds (9.8%)
  3 ring(s): 4 compounds (1.7%)
  4 ring(s): 1 compounds (0.4%)
  5 ring(s): 1 compounds (0.4%)

Distribution of total rings:
  0 ring(s): 84 compounds (35.7%)
  1 ring(s): 90 compounds (38.3%)
  2 ring(s): 40 compounds (17.0%)
  3 ring(s): 13 compounds (5.5%)
  4 ring(s): 3 com

In [None]:
# ============================================================================
# SECTION 6: Molecular Features Analysis
# ============================================================================

rule = "=" * 60
print(f"\n{rule}\nRUNNING MOLECULAR FEATURES ANALYSIS\n{rule}")

print("\nAnalyzing molecular features...")
# One feature dict per compound, expanded into columns next to the input data
features = df["smiles"].apply(analyze_molecular_features)
feat_df = pd.DataFrame(list(features))
df_feat = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, feat_df)],
    axis=1,
)

# Counts derived from the two boolean feature columns
has_aromatic = feat_df["has_aromatic_atoms"]
has_ring = feat_df["has_any_rings"]
aromatic_compounds = int(has_aromatic.sum())
ring_compounds = int(has_ring.sum())
aromatic_and_rings = int((has_aromatic & has_ring).sum())
aliphatic_rings_only = int((has_ring & ~has_aromatic).sum())

print(f"\n{rule}\nAROMATICITY ANALYSIS\n{rule}")
print(f"Compounds with aromatic atoms: {aromatic_compounds} ({pct(aromatic_compounds):.1f}%)")
print(f"Compounds with rings: {ring_compounds} ({pct(ring_compounds):.1f}%)")
for ring_label, n_hits in (
    ("Aromatic rings", aromatic_and_rings),
    ("Aliphatic rings only", aliphatic_rings_only),
):
    print(f"  - {ring_label}: {n_hits} ({pct(n_hits):.1f}%)")

# Persist the input data together with the molecular features
df_feat.to_csv('molecular_analysis_results.csv', index=False)
print(f"\n{rule}\n✓ Molecular features results saved to: molecular_analysis_results.csv\n{rule}")




RUNNING MOLECULAR FEATURES ANALYSIS

Analyzing molecular features...

AROMATICITY ANALYSIS
Compounds with aromatic atoms: 105 (44.7%)
Compounds with rings: 151 (64.3%)
  - Aromatic rings: 105 (44.7%)
  - Aliphatic rings only: 46 (19.6%)

✓ Molecular features results saved to: molecular_analysis_results.csv


In [None]:
# ============================================================================
# SECTION 7: Display Sample Results
# ============================================================================

preview_rows = 10  # number of leading rows shown for each result table
rule = "=" * 60
print(f"\n{rule}\nSAMPLE RESULTS PREVIEW\n{rule}")

print(f"\nConjugation Analysis Results (first {preview_rows} rows):")
display(df_conj.head(preview_rows))

print(f"\nMolecular Features Results (first {preview_rows} rows):")
display(df_feat.head(preview_rows))




SAMPLE RESULTS PREVIEW

Conjugation Analysis Results (first 10 rows):


Unnamed: 0,InChIKey,DATATYPE,VALUE,H317,smiles,Moleculer_weight,TPSA,CID,BOILING_POINT_DEGC_OPERA_PRED,OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED,...,aromatic_conjugation,linear_conjugation,extended_conjugation,conjugated_double_bonds,aromatic_rings,total_rings,conjugation_bond_count,aromatic_atoms_count,non_aromatic_double_bonds,conjugation_type
0,BDAGIHXWWSANSR-UHFFFAOYSA-N,in vivo,0,0,C(=O)O,46.025,37.3,284,2.00467,-0.54,...,False,False,False,0,0,0,1,0,1,No conjugation
1,SALNWJHEVGXTER-UHFFFAOYSA-N,in vivo,1,0,COC(=O)C(CCN1C=NC2=C1N=C(N=C2Cl)N)C(=O)OC,327.728,122.22,9949313,2.507146,0.39,...,True,False,True,0,2,2,12,9,2,Aromatic only
2,JQZAEUFPPSRDOP-UHFFFAOYSA-N,in vivo,1,1,C1=CC(=CC=C1CCl)Cl,161.031,0.0,7723,2.33969,3.18,...,True,False,True,0,1,1,6,6,0,Aromatic only
3,SATCULPHIDQDRE-UHFFFAOYSA-N,in vivo,1,1,C1OC2=C(O1)C=C(C=C2)C=O,150.133,35.53,8438,2.420045,1.05,...,True,False,True,0,1,2,7,6,1,Aromatic only
4,CMKBCTPCXZNQKX-UHFFFAOYSA-N,in vivo,1,1,C1CCC(CC1)S,116.229,0.0,15290,2.201036,2.6,...,False,False,False,0,0,1,0,0,0,No conjugation
5,RANVDUNFZBMTBK-UHFFFAOYSA-N,in vivo,0,0,CCCCCOC(=O)C1=CC=CC=C1O,208.257,46.53,16299,2.431422,3.18,...,True,False,True,0,1,1,7,6,1,Aromatic only
6,TVWGHFVGFWIHFN-UHFFFAOYSA-N,in vivo,1,1,CCCCCCCCCCCCCCC(C)C1=CC(=CC(=C1O)C)C,346.599,20.23,164362,2.538424,9.38,...,True,False,True,0,1,1,6,6,0,Aromatic only
7,GDAVABNCFOTAOD-UHFFFAOYSA-N,in vivo,0,0,CC1CCOC(C1)C2=CC=CC=C2,176.259,9.23,3024003,2.402877,2.89,...,True,False,True,0,1,2,6,6,0,Aromatic only
8,KBOPZPXVLCULAV-UHFFFAOYSA-N,in vivo,1,1,C1=CC(=C(C=C1N)C(=O)O)O,153.137,83.55,4075,2.435601,0.81,...,True,False,True,0,1,1,7,6,1,Aromatic only
9,IPBFYZQJXZJBFQ-UHFFFAOYSA-N,in vivo,0,0,CCCCC1CCC(=O)O1,142.198,26.3,7704,2.368863,1.96,...,False,False,False,0,0,1,1,0,1,No conjugation



Molecular Features Results (first 10 rows):


Unnamed: 0,InChIKey,DATATYPE,VALUE,H317,smiles,Moleculer_weight,TPSA,CID,BOILING_POINT_DEGC_OPERA_PRED,OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED,...,LUMO,Pubchem Bits,pubchem Bit Vector,valid_mol,has_aromatic_atoms,num_aromatic_rings,total_rings,has_any_rings,aromatic_atoms_count,total_atoms
0,BDAGIHXWWSANSR-UHFFFAOYSA-N,in vivo,0,0,C(=O)O,46.025,37.3,284,2.00467,-0.54,...,0.0998,0000000000000000001100000000000000000000000000...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0...,True,False,0,0,False,0,3
1,SALNWJHEVGXTER-UHFFFAOYSA-N,in vivo,1,0,COC(=O)C(CCN1C=NC2=C1N=C(N=C2Cl)N)C(=O)OC,327.728,122.22,9949313,2.507146,0.39,...,0.0401,1100000001110011101110000000000000000100000000...,[1 1 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 1 1 1 0 0...,True,True,2,2,True,9,22
2,JQZAEUFPPSRDOP-UHFFFAOYSA-N,in vivo,1,1,C1=CC(=CC=C1CCl)Cl,161.031,0.0,7723,2.33969,3.18,...,0.0534,1000000001100000000000000000000000000110000000...,[1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0...,True,True,1,1,True,6,9
3,SATCULPHIDQDRE-UHFFFAOYSA-N,in vivo,1,1,C1OC2=C(O1)C=C(C=C2)C=O,150.133,35.53,8438,2.420045,1.05,...,0.0396,1000000001110000001100000000000000000000000000...,[1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0...,True,True,1,2,True,6,11
4,CMKBCTPCXZNQKX-UHFFFAOYSA-N,in vivo,1,1,C1CCC(CC1)S,116.229,0.0,15290,2.201036,2.6,...,0.2037,1100000001100000000000000000000001000000000000...,[1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0...,True,False,0,1,True,0,7
5,RANVDUNFZBMTBK-UHFFFAOYSA-N,in vivo,0,0,CCCCCOC(=O)C1=CC=CC=C1O,208.257,46.53,16299,2.431422,3.18,...,0.0576,1110000001110000001100000000000000000000000000...,[1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0...,True,True,1,1,True,6,15
6,TVWGHFVGFWIHFN-UHFFFAOYSA-N,in vivo,1,1,CCCCCCCCCCCCCCC(C)C1=CC(=CC(=C1O)C)C,346.599,20.23,164362,2.538424,9.38,...,0.1152,1111000001111000001000000000000000000000000000...,[1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 0...,True,True,1,1,True,6,25
7,GDAVABNCFOTAOD-UHFFFAOYSA-N,in vivo,0,0,CC1CCOC(C1)C2=CC=CC=C2,176.259,9.23,3024003,2.402877,2.89,...,0.1003,1110000001110000001000000000000000000000000000...,[1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0...,True,True,1,2,True,6,13
8,KBOPZPXVLCULAV-UHFFFAOYSA-N,in vivo,1,1,C1=CC(=C(C=C1N)C(=O)O)O,153.137,83.55,4075,2.435601,0.81,...,0.0599,1000000001100010001100000000000000000000000000...,[1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0...,True,True,1,1,True,6,11
9,IPBFYZQJXZJBFQ-UHFFFAOYSA-N,in vivo,0,0,CCCCC1CCC(=O)O1,142.198,26.3,7704,2.368863,1.96,...,0.1061,1100000001110000001100000000000000000000000000...,[1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0...,True,False,0,1,True,0,10


In [None]:
# ============================================================================
# SECTION 8: Visualization (Optional)
# ============================================================================
# Draws a 2x2 summary figure from the statistics computed in Sections 5 and 6
# and saves it as 'conjugation_analysis_plots.png'.

import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style("whitegrid")

# Create visualizations
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: Conjugation type distribution (mutually exclusive categories)
ax1 = axes[0, 0]
type_counts.plot(kind='bar', ax=ax1, color='steelblue')
ax1.set_title('Conjugation Type Distribution', fontsize=12, fontweight='bold')
ax1.set_xlabel('Conjugation Type')
ax1.set_ylabel('Number of Compounds')
ax1.tick_params(axis='x', rotation=45)

# Plot 2: Aromatic rings distribution
ax2 = axes[0, 1]
aromatic_ring_counts = conj_df["aromatic_rings"].value_counts().sort_index()
aromatic_ring_counts.plot(kind='bar', ax=ax2, color='coral')
ax2.set_title('Distribution of Aromatic Rings', fontsize=12, fontweight='bold')
ax2.set_xlabel('Number of Aromatic Rings')
ax2.set_ylabel('Number of Compounds')

# Plot 3: Conjugation features.
# These categories overlap (a compound can be aromatic, linear and extended at
# the same time), so they are not parts of a whole and must not be drawn as a
# pie chart. Each bar is labelled with its share of the full dataset instead.
ax3 = axes[1, 0]
conj_stats = pd.Series({
    'Aromatic': aromatic_conj,
    'Linear': linear_conj,
    'Extended': extended_conj,
    'No conjugation': total - total_conj
})
conj_stats.plot(kind='barh', ax=ax3, color='mediumpurple')
share_labels = [
    f"{(100.0 * n_compounds / total if total else 0.0):.1f}%"
    for n_compounds in conj_stats
]
ax3.bar_label(ax3.containers[0], labels=share_labels, padding=3)
ax3.margins(x=0.15)   # leave room for the percentage labels
ax3.invert_yaxis()    # keep the first category at the top
ax3.set_title('Conjugation Features Distribution', fontsize=12, fontweight='bold')
ax3.set_xlabel('Number of Compounds (label: % of dataset)')
ax3.set_ylabel('')

# Plot 4: Ring types comparison
ax4 = axes[1, 1]
ring_comparison = pd.DataFrame({
    'Count': [aromatic_and_rings, aliphatic_rings_only, total - ring_compounds]
}, index=['Aromatic Rings', 'Aliphatic Rings Only', 'No Rings'])
ring_comparison.plot(kind='bar', ax=ax4, legend=False, color='mediumseagreen')
ax4.set_title('Ring Types in Dataset', fontsize=12, fontweight='bold')
ax4.set_xlabel('Ring Type')
ax4.set_ylabel('Number of Compounds')
ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('conjugation_analysis_plots.png', dpi=300, bbox_inches='tight')
print("\n✓ Visualization saved to: conjugation_analysis_plots.png")
plt.show()



In [None]:
# ============================================================================
# SECTION 9: Download Results
# ============================================================================

# Files written by the earlier sections that can be downloaded from Colab
result_files = (
    "conjugation_analysis_results.csv",
    "molecular_analysis_results.csv",
    "conjugation_analysis_plots.png",
)

rule = "=" * 60
print(f"\n{rule}\nDOWNLOAD YOUR RESULTS\n{rule}")

print("\nYou can download the following files:")
for position, result_file in enumerate(result_files, start=1):
    print(f"  {position}. {result_file}")

print("\nTo download, uncomment and run the following lines:")
print("# from google.colab import files")
for result_file in result_files:
    print(f"# files.download('{result_file}')")

# Uncomment to automatically download results:
# files.download('conjugation_analysis_results.csv')
# files.download('molecular_analysis_results.csv')
# files.download('conjugation_analysis_plots.png')



In [None]:
# ============================================================================
# SECTION 10: Summary Statistics Table
# ============================================================================

rule = "=" * 60
print(f"\n{rule}\nSUMMARY STATISTICS TABLE\n{rule}")

# (metric label, count) pairs; the percentage column is derived from the count
metric_counts = [
    ('Total Compounds', total),
    ('Valid Molecules', valid),
    ('Invalid SMILES', invalid),
    ('Compounds with Conjugation', total_conj),
    ('Aromatic Conjugation', aromatic_conj),
    ('Linear Conjugation', linear_conj),
    ('Extended Conjugation', extended_conj),
    ('Compounds with Aromatic Rings', aromatic_rings_pos),
    ('Compounds with Any Rings', total_rings_pos),
    ('Aromatic Atoms Present', aromatic_compounds),
    ('Aliphatic Rings Only', aliphatic_rings_only),
]

summary_stats = pd.DataFrame(
    [
        (metric_name, metric_count, f"{pct(metric_count):.1f}%")
        for metric_name, metric_count in metric_counts
    ],
    columns=['Metric', 'Count', 'Percentage'],
)

display(summary_stats)

# Save summary
summary_stats.to_csv('summary_statistics.csv', index=False)
print("\n✓ Summary statistics saved to: summary_statistics.csv")

print(f"\n{rule}\nALL ANALYSES COMPLETE! 🎉\n{rule}")