In [1]:
# Cell 1: Imports and Setup
import numpy as np
from typing import Dict, List, Tuple

# Cell 2: Utility Functions
def calculate_ns_from_ef(ef: float, n: int, N: int, percentile: float) -> float:
    """Back-calculate ns from EF value"""
    Ns = int(np.ceil(N * percentile / 100))
    ns = (ef * n * Ns) / N
    return round(ns, 0)

# Cell 3: Metric Calculation Functions
def calculate_metrics_from_ns(ns: float, n: int, N: int, percentile: float) -> Dict[str, float]:
    """Calculate all metrics given ns, n, N values"""
    Ns = int(np.ceil(N * percentile / 100))
    
    # Enrichment Factor
    ef = (N * ns) / (n * Ns) if (n * Ns) != 0 else 0
    
    # Power Metric
    numerator = ns * N - n * ns
    denominator = ns * N - 2 * n * ns + n * Ns
    pm = numerator / denominator if denominator != 0 else 0
    
    # ROCE
    if n * (Ns - ns) == 0:
        roce = float('inf')
    else:
        roce = (ns * (N - n)) / (n * (Ns - ns))
    
    # CCR
    if n == 0 or N - n == 0:
        ccr = 0
    else:
        ccr = 0.5 * ((ns / n) + ((N - Ns - n + ns) / (N - n)))
    
    # REF
    max_possible = min(Ns, n)
    ref = 100 * (ns / max_possible) if max_possible != 0 else 0
    
    return {
        'ns': ns,
        'ef': ef,
        'pm': pm,
        'roce': roce,
        'ccr': ccr,
        'ref': ref
    }

# Cell 4: Generate Test Cases
def generate_test_cases():
    """Generate different test scenarios with known values"""
    test_cases = [
        {
            'name': 'Perfect early recovery',
            'n': 100,
            'N': 10000,
            'percentile': 1.0,
            'ef': 100
        },
        {
            'name': 'Random distribution',
            'n': 100,
            'N': 10000,
            'percentile': 1.0,
            'ef': 1.0
        },
        {
            'name': 'Good enrichment',
            'n': 100,
            'N': 10000,
            'percentile': 0.5,
            'ef': 20
        },
        {
            'name': 'Small dataset',
            'n': 10,
            'N': 1000,
            'percentile': 0.5,
            'ef': 20
        }
    ]
    return test_cases

# Cell 5: Validation Function
def compare_metrics(original_ns: float, back_calculated_ns: float, 
                   n: int, N: int, percentile: float, tolerance: float = 1e-6):
    """Compare metrics using two ns values"""
    metrics1 = calculate_metrics_from_ns(original_ns, n, N, percentile)
    metrics2 = calculate_metrics_from_ns(back_calculated_ns, n, N, percentile)
    
    print("\nMetric Comparison:")
    metrics_match = True
    for key in metrics1:
        if key == 'ns':
            continue
        val1, val2 = metrics1[key], metrics2[key]
        if np.isinf(val1) and np.isinf(val2):
            diff = 0
        else:
            diff = abs(val1 - val2)
            if diff > tolerance:
                metrics_match = False
        print(f"{key}:")
        print(f"  Original calculation: {val1:.6f}")
        print(f"  From back-calculated ns: {val2:.6f}")
        print(f"  Difference: {diff:.6f}")
    return metrics_match

# Cell 6: Run Validation
def validate_metrics():
    """Validate metric calculations"""
    test_cases = generate_test_cases()
    
    for case in test_cases:
        print(f"\nTest Case: {case['name']}")
        print("=" * 50)
        
        # Calculate ns from EF
        ns_from_ef = calculate_ns_from_ef(
            case['ef'], case['n'], case['N'], case['percentile']
        )
        
        # Calculate original ns that would give this EF
        Ns = int(np.ceil(case['N'] * case['percentile'] / 100))
        original_ns = (case['ef'] * case['n'] * Ns) / case['N']
        
        print(f"\nInput Parameters:")
        print(f"n: {case['n']}, N: {case['N']}, percentile: {case['percentile']}%")
        print(f"Original EF: {case['ef']}")
        print(f"\nns values:")
        print(f"Original ns: {original_ns}")
        print(f"Back-calculated ns: {ns_from_ef}")
        
        metrics_match = compare_metrics(
            original_ns, ns_from_ef, 
            case['n'], case['N'], case['percentile']
        )
        print(f"\nAll metrics match within tolerance: {metrics_match}")

# Cell 7: Execute Validation
validate_metrics()


Test Case: Perfect early recovery

Input Parameters:
n: 100, N: 10000, percentile: 1.0%
Original EF: 100

ns values:
Original ns: 100.0
Back-calculated ns: 100.0

Metric Comparison:
ef:
  Original calculation: 100.000000
  From back-calculated ns: 100.000000
  Difference: 0.000000
pm:
  Original calculation: 1.000000
  From back-calculated ns: 1.000000
  Difference: 0.000000
roce:
  Original calculation: inf
  From back-calculated ns: inf
  Difference: 0.000000
ccr:
  Original calculation: 1.000000
  From back-calculated ns: 1.000000
  Difference: 0.000000
ref:
  Original calculation: 100.000000
  From back-calculated ns: 100.000000
  Difference: 0.000000

All metrics match within tolerance: True

Test Case: Random distribution

Input Parameters:
n: 100, N: 10000, percentile: 1.0%
Original EF: 1.0

ns values:
Original ns: 1.0
Back-calculated ns: 1.0

Metric Comparison:
ef:
  Original calculation: 1.000000
  From back-calculated ns: 1.000000
  Difference: 0.000000
pm:
  Original calcul

In [10]:
# Import required libraries and load data
import numpy as np
import pandas as pd

# Load data
lit_df = pd.read_csv('DEKOIS_literature.csv', index_col=None)
lit_df = lit_df.dropna(axis=1, how='all')
lit_df['Target'] = lit_df['Target'].astype(str)
lit_df['Model'] = lit_df['Model'].astype(str)
count_df = pd.read_csv('DEKOIS_count_data.csv', index_col=None)

# Initialize results list
results = []
percentiles = [0.5, 1.0, 5.0]  # Corresponding to EF0.5, EF1, EF5

# Process each target
for _, lit_row in lit_df.iterrows():
    target = lit_row['Target']
    model = lit_row['Model']
    count_row = count_df[count_df['target'] == target.lower()]
    
    if len(count_row) == 0:
        continue
        
    n = count_row['actives'].iloc[0]  # Number of actives
    N = count_row['total'].iloc[0]    # Total compounds
    
    # Calculate metrics for each percentile
    for p, ef_col in zip(percentiles, ['EF0.5', 'EF1', 'EF5']):
        if pd.isna(lit_row[ef_col]):
            continue
            
        ef = lit_row[ef_col]
        
        # Calculate ns from EF
        Ns = int(np.ceil(N * p / 100))  # Number of compounds in selection
        ns = (ef * n * Ns) / N          # Number of actives in selection
        
        # Calculate Power Metric (PM)
        numerator = ns * N - n * ns
        denominator = ns * N - 2 * n * ns + n * Ns
        pm = numerator / denominator if denominator != 0 else 0
        
        # Calculate ROCE
        if n * (Ns - ns) == 0:
            roce = float('inf')
        else:
            roce = (ns * (N - n)) / (n * (Ns - ns))
        
        # Calculate CCR
        if n == 0 or N - n == 0:
            ccr = 0
        else:
            ccr = 0.5 * ((ns / n) + ((N - Ns - n + ns) / (N - n)))
        
        # Calculate REF
        max_possible = min(Ns, n)
        ref = 100 * (ns / max_possible) if max_possible != 0 else 0
        
        # Calculate MCC
        # TP = ns, FP = Ns - ns, TN = N - Ns - n + ns, FN = n - ns
        numerator = N * ns - Ns * n
        denominator = np.sqrt(Ns * n * (N - n) * (N - Ns))
        mcc = numerator / denominator if denominator != 0 else 0
        
        # Calculate CKC (Cohen's Kappa Coefficient)
        numerator = N * n + N * Ns - 2 * ns * N
        denominator = N * n + N * Ns - 2 * n * Ns
        ckc = 1 - (numerator / denominator) if denominator != 0 else 0
        
        # Calculate RIE (Relative Inhibition Efficiency)
        # For RIE at given percentile, we can use the ratio of ns/Ns to n/N
        rie = (ns/Ns)/(n/N) if (Ns != 0 and n != 0) else 0
        
        # Store results
        results.append({
            'Target': target,
            'Model': model,
            'Percentile': p,
            'EF': ef,
            'n': n,
            'N': N,
            'Ns': Ns,
            'ns': ns,
            'PM': pm,
            'ROCE': roce,
            'CCR': ccr,
            'REF': ref,
            'MCC': mcc,
            'CKC': ckc,
            'RIE': rie,
            'AUC_ROC': lit_row['AUC_ROC'],
            'BEDROC': lit_row['BEDROC']
        })

print(results)

# Create results dataframe
results_df = pd.DataFrame(results)

# Round numeric columns to 3 decimal places
numeric_cols = ['EF', 'ns', 'PM', 'ROCE', 'CCR', 'REF', 'MCC', 'CKC', 'RIE', 'AUC_ROC', 'BEDROC']
results_df[numeric_cols] = results_df[numeric_cols].round(3)

# Save full results
results_df.to_csv('dekois_complete_metrics2.csv', index=False)

# Display first few rows
print("\nFirst few rows of results:")
print(results_df.head())

# Display summary statistics by percentile
print("\nSummary Statistics by Percentile:")
for p in percentiles:
    print(f"\nPercentile {p}%:")
    subset = results_df[results_df['Percentile'] == p]
    print(subset[numeric_cols].describe())

[{'Target': 'parp-1', 'Model': 'KarmaDock Raw', 'Percentile': 0.5, 'EF': 30.075, 'n': 40, 'N': 1240, 'Ns': 7, 'ns': 6.791129032258064, 'PM': 0.9989758352478755, 'ROCE': 975.4054054054048, 'CCR': 0.5848020833333334, 'REF': 97.01612903225806, 'MCC': 0.39996922614116476, 'CKC': 0.28208593208593213, 'RIE': 30.075, 'AUC_ROC': 0.918293207, 'BEDROC': 0.803994207}, {'Target': 'parp-1', 'Model': 'KarmaDock Raw', 'Percentile': 1.0, 'EF': 27.56875, 'n': 40, 'N': 1240, 'Ns': 13, 'ns': 11.561088709677419, 'PM': 0.9958684216467614, 'ROCE': 241.03825136612014, 'CCR': 0.6439140625, 'REF': 88.93145161290322, 'MCC': 0.49929807277800387, 'CKC': 0.4272031539888683, 'RIE': 27.56875, 'AUC_ROC': 0.918293207, 'BEDROC': 0.803994207}, {'Target': 'parp-1', 'Model': 'KarmaDock Raw', 'Percentile': 5.0, 'EF': 13.53375, 'n': 40, 'N': 1240, 'Ns': 62, 'ns': 27.067500000000003, 'PM': 0.9587553094458695, 'ROCE': 23.245544979603526, 'CCR': 0.8237885416666668, 'REF': 67.66875000000002, 'MCC': 0.5249810173051039, 'CKC': 0.

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


In [4]:
# Import required libraries and load data
import numpy as np
import pandas as pd

# Load data
lit_df = pd.read_csv('DUD-E_literature.csv', index_col=None)
lit_df = lit_df.dropna(axis=1, how='all')
lit_df['Target'] = lit_df['Target'].astype(str)
lit_df['Model'] = lit_df['Model'].astype(str)
count_df = pd.read_csv('DEKOIS_count_data.csv', index_col=None)


# Initialize results list
results = []
percentiles = [1.0]  # Corresponding to EF0.5, EF1, EF5

# Process each target
for _, lit_row in lit_df.iterrows():
    target = lit_row['Target']
    model = lit_row['Model']
    count_row = count_df[count_df['target'] == target.lower()]
    
    if len(count_row) == 0:
        continue
        
    n = count_row['actives'].iloc[0]  # Number of actives
    N = count_row['total'].iloc[0]    # Total compounds
    
    # Calculate metrics for each percentile
    for p, ef_col in zip(percentiles, ['EF1']):
        if pd.isna(lit_row[ef_col]):
            continue
            
        ef = lit_row[ef_col]
        
        # Calculate ns from EF
        Ns = int(np.ceil(N * p / 100))  # Number of compounds in selection
        ns = (ef * n * Ns) / N          # Number of actives in selection
        
        # Calculate Power Metric (PM)
        numerator = ns * N - n * ns
        denominator = ns * N - 2 * n * ns + n * Ns
        pm = numerator / denominator if denominator != 0 else 0
        
        # Calculate ROCE
        if n * (Ns - ns) == 0:
            roce = float('inf')
        else:
            roce = (ns * (N - n)) / (n * (Ns - ns))
        
        # Calculate CCR
        if n == 0 or N - n == 0:
            ccr = 0
        else:
            ccr = 0.5 * ((ns / n) + ((N - Ns - n + ns) / (N - n)))
        
        # Calculate REF
        max_possible = min(Ns, n)
        ref = 100 * (ns / max_possible) if max_possible != 0 else 0
        
        # Calculate MCC
        # TP = ns, FP = Ns - ns, TN = N - Ns - n + ns, FN = n - ns
        numerator = N * ns - Ns * n
        denominator = np.sqrt(Ns * n * (N - n) * (N - Ns))
        mcc = numerator / denominator if denominator != 0 else 0
        
        # Calculate CKC (Cohen's Kappa Coefficient)
        numerator = N * n + N * Ns - 2 * ns * N
        denominator = N * n + N * Ns - 2 * n * Ns
        ckc = 1 - (numerator / denominator) if denominator != 0 else 0
        
        # Calculate RIE (Relative Inhibition Efficiency)
        # For RIE at given percentile, we can use the ratio of ns/Ns to n/N
        rie = (ns/Ns)/(n/N) if (Ns != 0 and n != 0) else 0
        
        # Store results
        results.append({
            'Target': target,
            'Model': model,
            'Percentile': p,
            'EF': ef,
            'n': n,
            'N': N,
            'Ns': Ns,
            'ns': ns,
            'PM': pm,
            'ROCE': roce,
            'CCR': ccr,
            'REF': ref,
            'MCC': mcc,
            'CKC': ckc,
            'RIE': rie,
            'AUC_ROC': lit_row['AUC_ROC'],
            'BEDROC': lit_row['BEDROC']
        })

# Create results dataframe
results_df = pd.DataFrame(results)

# Round numeric columns to 3 decimal places
numeric_cols = ['EF', 'ns', 'PM', 'ROCE', 'CCR', 'REF', 'MCC', 'CKC', 'RIE', 'AUC_ROC', 'BEDROC']
results_df[numeric_cols] = results_df[numeric_cols].round(3)

# Save full results
results_df.to_csv('dude_complete_metrics.csv', index=False)

# Display first few rows
print("\nFirst few rows of results:")
print(results_df.head())

# Display summary statistics by percentile
print("\nSummary Statistics by Percentile:")
for p in percentiles:
    print(f"\nPercentile {p}%:")
    subset = results_df[results_df['Percentile'] == p]
    print(subset[numeric_cols].describe())


First few rows of results:
  Target                          Model  Percentile    EF    n      N   Ns  \
0  fabp4  Scardino et al.(proprietary)*         1.0  41.0   47   2518   26   
1  hivrt  Scardino et al.(proprietary)*         1.0  17.0  337  18737  188   
2    dyr  Scardino et al.(proprietary)*         1.0  25.8  231  17074  171   
3   aldr  Scardino et al.(proprietary)*         1.0  36.0  158   9054   91   
4  lkha4  Scardino et al.(proprietary)*         1.0  14.7  170   7992   80   

       ns     PM     ROCE    CCR     REF    MCC    CKC   RIE  AUC_ROC  BEDROC  
0  19.898  0.994  171.423  0.710  76.529  0.563  0.539  41.0      NaN     NaN  
1  57.483  0.960   24.047  0.582  30.576  0.218  0.209  17.0      NaN     NaN  
2  59.689  0.975   39.099  0.626  34.906  0.292  0.289  25.8      NaN     NaN  
3  57.169  0.990   95.144  0.679  62.823  0.470  0.452  36.0      NaN     NaN  
4  25.015  0.954   20.933  0.570  31.269  0.203  0.189  14.7      NaN     NaN  

Summary Statistics by 

  sqr = _ensure_numeric((avg - values) ** 2)


In [5]:
# Import required libraries and load data
import numpy as np
import pandas as pd

# Load data
lit_df = pd.read_csv('Lit-PCBA_literature.csv', index_col=None)
lit_df = lit_df.dropna(axis=1, how='all')
lit_df['Target'] = lit_df['Target'].astype(str)
lit_df['Model'] = lit_df['Model'].astype(str)
count_df = pd.read_csv('Lit-PCBA_count_data.csv', index_col=None)

# Initialize results list
results = []
percentiles = [0.1, 0.5, 1.0, 5.0]  # Corresponding to EF0.1, EF0.5, EF1, EF5

# Process each target
for _, lit_row in lit_df.iterrows():
    target = lit_row['Target']
    model = lit_row['Model']
    count_row = count_df[count_df['target'] == target.lower()]
    
    if len(count_row) == 0:
        continue
        
    n = count_row['actives'].iloc[0]  # Number of actives
    N = count_row['total'].iloc[0]    # Total compounds
    
    # Calculate metrics for each percentile
    for p, ef_col in zip(percentiles, ['EF0.1', 'EF0.5', 'EF1', 'EF5']):
        if pd.isna(lit_row[ef_col]):
            continue
            
        ef = lit_row[ef_col]
        
        # Calculate ns from EF
        Ns = int(np.ceil(N * p / 100))  # Number of compounds in selection
        ns = (ef * n * Ns) / N          # Number of actives in selection
        
        # Calculate Power Metric (PM)
        numerator = ns * N - n * ns
        denominator = ns * N - 2 * n * ns + n * Ns
        pm = numerator / denominator if denominator != 0 else 0
        
        # Calculate ROCE
        if n * (Ns - ns) == 0:
            roce = float('inf')
        else:
            roce = (ns * (N - n)) / (n * (Ns - ns))
        
        # Calculate CCR
        if n == 0 or N - n == 0:
            ccr = 0
        else:
            ccr = 0.5 * ((ns / n) + ((N - Ns - n + ns) / (N - n)))
        
        # Calculate REF
        max_possible = min(Ns, n)
        ref = 100 * (ns / max_possible) if max_possible != 0 else 0
        
        # Calculate MCC
        # TP = ns, FP = Ns - ns, TN = N - Ns - n + ns, FN = n - ns
        numerator = N * ns - Ns * n
        denominator = np.sqrt(Ns * n * (N - n) * (N - Ns))
        mcc = numerator / denominator if denominator != 0 else 0
        
        # Calculate CKC (Cohen's Kappa Coefficient)
        numerator = N * n + N * Ns - 2 * ns * N
        denominator = N * n + N * Ns - 2 * n * Ns
        ckc = 1 - (numerator / denominator) if denominator != 0 else 0
        
        # Calculate RIE (Relative Inhibition Efficiency)
        # For RIE at given percentile, we can use the ratio of ns/Ns to n/N
        rie = (ns/Ns)/(n/N) if (Ns != 0 and n != 0) else 0
        
        # Store results
        results.append({
            'Target': target,
            'Model': model,
            'Percentile': p,
            'EF': ef,
            'n': n,
            'N': N,
            'Ns': Ns,
            'ns': ns,
            'PM': pm,
            'ROCE': roce,
            'CCR': ccr,
            'REF': ref,
            'MCC': mcc,
            'CKC': ckc,
            'RIE': rie,
            'AUC_ROC': lit_row['AUC_ROC'],
            'BEDROC': lit_row['BEDROC']
        })

# Create results dataframe
results_df = pd.DataFrame(results)

# Round numeric columns to 3 decimal places
numeric_cols = ['EF', 'ns', 'PM', 'ROCE', 'CCR', 'REF', 'MCC', 'CKC', 'RIE', 'AUC_ROC', 'BEDROC']
results_df[numeric_cols] = results_df[numeric_cols].round(3)

# Save full results
results_df.to_csv('litpcba_complete_metrics.csv', index=False)

# Display first few rows
print("\nFirst few rows of results:")
print(results_df.head())

# Display summary statistics by percentile
print("\nSummary Statistics by Percentile:")
for p in percentiles:
    print(f"\nPercentile {p}%:")
    subset = results_df[results_df['Percentile'] == p]
    print(subset[numeric_cols].describe())


First few rows of results:
  Target       Model  Percentile      EF    n       N     Ns      ns     PM  \
0   fen1     GlideSP         0.1   0.000  360  351078    352   0.000  0.000   
1   fen1     GlideSP         0.5   2.170  360  351078   1756   3.907  0.685   
2   fen1     GlideSP         1.0   2.170  360  351078   3511   7.812  0.685   
3   fen1     GlideSP         5.0   4.130  360  351078  17554  74.340  0.806   
4   fen1  IGN(DUD-E)         0.1  21.683  360  351078    352   7.826  0.957   

     ROCE    CCR     REF    MCC    CKC     RIE  AUC_ROC  BEDROC  
0   0.000  0.499   0.000 -0.001 -0.001   0.000    0.584   0.038  
1   2.173  0.503   1.085  0.003  0.002   2.170    0.584   0.038  
2   2.173  0.506   2.170  0.004  0.002   2.170    0.584   0.038  
3   4.143  0.578  20.650  0.023  0.006   4.130    0.584   0.038  
4  22.153  0.510   2.223  0.021  0.021  21.683    0.668   0.088  

Summary Statistics by Percentile:

Percentile 0.1%:
               EF         ns         PM        R

In [7]:
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd
from rdkit import Chem
import gzip
import tempfile
import shutil
import os

# Define base directory
base_dir = Path("/home/tony/Datasets/DEKOIS")
actives_dir = base_dir / "actives"

# Get list of targets by looking at the decoy files in the root directory
target_files = list(base_dir.glob("*_Celling-v1.12_decoyset.sdf.gz"))
targets = [f.name.split("_Celling")[0] for f in target_files]

# Initialize list to store results
results = []

def decompress_and_count_molecules(gz_file):
    with tempfile.NamedTemporaryFile(suffix='.sdf', delete=False) as temp_file:
        with gzip.open(gz_file, 'rb') as gz_input:
            shutil.copyfileobj(gz_input, temp_file)
    
    try:
        suppl = Chem.SDMolSupplier(temp_file.name)
        count = sum(1 for mol in suppl if mol is not None)
    finally:
        os.unlink(temp_file.name)
    
    return count

# Process each target
for target in tqdm(targets, desc="Analyzing activity files"):
    try:
        # Define paths for active and decoy files
        actives_file = actives_dir / f"{target}.sdf.gz"
        decoys_file = base_dir / f"{target}_Celling-v1.12_decoyset.sdf.gz"
        
        # Check if both files exist
        if not (actives_file.exists() and decoys_file.exists()):
            results.append({
                'target': target,
                'error': 'SDF files not found',
                'actives': None,
                'total': None,
                'active_ratio': None
            })
            continue
        
        # Count molecules in actives and decoys files
        n_actives = decompress_and_count_molecules(actives_file)
        n_decoys = decompress_and_count_molecules(decoys_file)
            
        # Calculate totals
        n_total = n_actives + n_decoys
        active_ratio = (n_actives / n_total) * 100
        
        results.append({
            'target': target.lower(),
            'error': None,
            'actives': n_actives,
            'total': n_total,
            'active_ratio': active_ratio
        })
        
    except Exception as e:
        results.append({
            'target': target.lower(),
            'error': str(e),
            'actives': None,
            'total': None,
            'active_ratio': None
        })

# Convert results to DataFrame for easy analysis
results_df = pd.DataFrame(results)

# Print summary statistics
print("\nAnalysis Summary:")
print(f"Total targets processed: {len(results_df)}")

# Print targets with errors
error_targets = results_df[results_df['error'].notna()]
if not error_targets.empty:
    print("\nTargets with errors:")
    for _, row in error_targets.iterrows():
        print(f"{row['target']}: {row['error']}")

# Print statistics for successful analyses
successful = results_df[results_df['error'].isna()]
if not successful.empty:
    print("\nActivity Statistics:")
    print(f"Average number of actives: {successful['actives'].mean():.1f}")
    print(f"Average total compounds: {successful['total'].mean():.1f}")
    print(f"Average active ratio: {successful['active_ratio'].mean():.1f}%")
    
    # Print targets with highest and lowest active ratios
    print("\nTop 3 targets by active ratio:")
    top3 = successful.nlargest(3, 'active_ratio')
    for _, row in top3.iterrows():
        print(f"{row['target']}: {row['active_ratio']:.1f}% ({row['actives']}/{row['total']} compounds)")

# Save the results DataFrame to CSV
results_df[["target", "actives", "total"]].to_csv("DEKOIS_count_data.csv", index=False)

results_df

Analyzing activity files:   0%|          | 0/81 [00:00<?, ?it/s]

[00:19:41] Explicit valence for atom # 8 N, 5, is greater than permitted
[00:19:41] ERROR: Could not sanitize molecule ending on line 74891
[00:19:41] ERROR: Explicit valence for atom # 8 N, 5, is greater than permitted
[00:19:49] Conflicting single bond directions around double bond at index 2.
[00:19:49]   BondStereo set to STEREONONE and single bond directions set to NONE.



Analysis Summary:
Total targets processed: 81

Activity Statistics:
Average number of actives: 40.0
Average total compounds: 1239.6
Average active ratio: 3.2%

Top 3 targets by active ratio:
ts: 3.2% (40/1239 compounds)
bcl2: 3.2% (40/1240 compounds)
jnk2: 3.2% (40/1240 compounds)


Unnamed: 0,target,error,actives,total,active_ratio
0,bcl2,,40,1240,3.225806
1,jnk2,,40,1240,3.225806
2,tpa,,40,1240,3.225806
3,adrb2,,40,1240,3.225806
4,pparg,,40,1240,3.225806
...,...,...,...,...,...
76,jnk1,,40,1240,3.225806
77,rock-1,,40,1240,3.225806
78,fgfr1,,40,1240,3.225806
79,kif11,,40,1240,3.225806
