In [3]:
# Import libraries
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path
sys.path.append('../src')

from data.data_loader import DrugDataLoader, TextDataProcessor
from data.preprocessing import DataPreprocessor
from models.train_models import DrugDiscoveryModels
from models.deep_learning import DeepLearningModels

# Set style: seaborn grid theme plus a default figure size for all later plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# The banner uses a real check mark; the previous literal was a mis-decoded
# UTF-8 sequence that rendered as garbage characters.
print("✓ All libraries imported successfully!")

ModuleNotFoundError: No module named 'cv2'

## Step 1: Generate Sample Data

First, generate the synthetic drug discovery dataset (10,000 samples and 500 images) if it does not already exist on disk.

In [None]:
# Build the synthetic datasets only when the raw CSV is not on disk yet;
# otherwise reuse what is already there.
import os
from data.generate_data import generate_all_datasets

if os.path.exists('../data/raw/drug_data.csv'):
    print("Datasets already exist. Skipping generation.")
else:
    print("Generating datasets... This may take a few minutes.")
    generate_all_datasets(n_samples=10000, n_images=500)

## Step 2: Load Multiple Data Sources

Load and merge CSV, JSON, and image feature data.

In [None]:
# Loader pointed at the raw-data directory
loader = DrugDataLoader(data_dir='../data/raw')

# Name the sources up front, then hand them to the project's merge helper
merge_sources = {
    'csv_file': 'drug_data.csv',
    'json_file': 'drug_interactions.json',
    'use_images': True,
}
df = loader.merge_all_data(**merge_sources)

print(f"\nDataset shape: {df.shape}")
print(f"\nFirst few rows:")
# Last expression of the cell, so the preview is rendered as rich output
df.head()

In [None]:
# Check data types and missing values
print("Data Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()

print("\nMissing Values:")
# Per-column null counts, keeping only columns that actually have gaps,
# largest count first.
missing = df.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)
print(missing)

In [None]:
# Visualize target distribution
plt.figure(figsize=(8, 6))
# value_counts() orders by frequency, not by label. Sorting by class label
# guarantees the counts line up with the fixed (0, 1) bar labels below;
# without it the labels are swapped whenever class 1 is the majority.
target_counts = df['target'].value_counts().sort_index()
plt.bar(['Not Effective (0)', 'Effective (1)'], target_counts.values, color=['#FF6B6B', '#4ECDC4'])
plt.title('Drug Effectiveness Distribution', fontsize=14, fontweight='bold')
plt.ylabel('Count')
plt.xlabel('Class')
# Annotate each bar with its count, slightly above the bar top
for i, v in enumerate(target_counts.values):
    plt.text(i, v + 50, str(v), ha='center', fontweight='bold')
plt.tight_layout()
plt.show()

# Share of samples labelled 1 among all samples labelled 0 or 1
print(f"\nClass balance: {target_counts[1]/(target_counts[0]+target_counts[1])*100:.1f}% effective drugs")

## Step 3: Exploratory Data Analysis

In [None]:
# Descriptive statistics from DataFrame.describe(), shown as the cell's output
print("Statistical Summary of Numerical Features:")
summary_stats = df.describe()
summary_stats

In [None]:
# Histogram grid: one panel per key feature, laid out as 2 rows x 3 columns
key_features = ['molecular_weight', 'logP', 'bioavailability',
                'efficacy_score', 'safety_score', 'solubility']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# axes.flat walks the grid row by row, so panel order follows key_features
for ax, feature in zip(axes.flat, key_features):
    observed = df[feature].dropna()  # drop missing values before binning
    ax.hist(observed, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'{feature} Distribution', fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

# Heatmap appearance collected in one place
heatmap_options = dict(
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## Step 4: Data Preprocessing & Feature Engineering

Apply comprehensive preprocessing:
- Handle missing values
- Create interaction features
- Create polynomial features
- Encode categorical variables
- Remove outliers

In [None]:
# Run the project's feature-engineering pipeline on the merged frame
preprocessor = DataPreprocessor()
df_processed = preprocessor.feature_engineering_pipeline(df, target_col='target')

# Report the processed shape and the net change in column count
n_cols_before = df.shape[1]
n_cols_after = df_processed.shape[1]
print(f"\nProcessed dataset shape: {df_processed.shape}")
print(f"Number of features created: {n_cols_after - n_cols_before}")

In [None]:
# Total number of cells still missing after preprocessing
print("Missing values after preprocessing:")
remaining_missing = df_processed.isnull().sum().sum()
print(remaining_missing)

if remaining_missing > 0:
    print("\nFilling any remaining missing values...")
    # Numeric gaps take the column median; anything still empty falls back to 0
    column_medians = df_processed.median(numeric_only=True)
    df_processed = df_processed.fillna(column_medians).fillna(0)

## Step 5: Train Multiple ML Models

Train and compare:
- Logistic Regression
- Random Forest
- Gradient Boosting
- XGBoost
- SVM

In [None]:
# Trainer constructed with random_state=42
ml_models = DrugDiscoveryModels(random_state=42)

# Ask the project helper for the train/test partitions (test_size=0.2)
data_splits = ml_models.prepare_data(df_processed, target_col='target', test_size=0.2)
X_train, X_test, y_train, y_test = data_splits

# Column order of the training matrix, kept for later plots and for saving
feature_names = X_train.columns.tolist()

In [None]:
# Train all ML models
# Delegates to the project helper and keeps its return value in `results`.
# Later cells iterate `results.values()` / `results.items()` and read per-model
# entries with the keys 'model_name', 'accuracy', 'precision', 'recall',
# 'f1_score' and 'roc_auc'.
results = ml_models.train_all_models(X_train, y_train, X_test, y_test)

In [None]:
# Visualize model comparison
def _score_axis_floor(scores, default=0.7):
    """Return a lower y-axis limit that keeps every score visible.

    Args:
        scores: iterable of metric values; missing or non-numeric entries
            are ignored.
        default: floor used when no usable score falls below it (0.7 was the
            previously hard-coded limit).

    Returns:
        `default` when every usable score is >= `default` (or there are no
        usable scores); otherwise a value below the lowest score, rounded
        down to a multiple of 0.05 and never below 0.
    """
    usable = pd.to_numeric(pd.Series(list(scores)), errors='coerce').dropna()
    if usable.empty or usable.min() >= default:
        return default
    return max(0.0, float(np.floor((usable.min() - 0.05) * 20) / 20))


# One row per trained model, one column per reported metric
results_df = pd.DataFrame([
    {
        'Model': metrics['model_name'],
        'Accuracy': metrics['accuracy'],
        'Precision': metrics['precision'],
        'Recall': metrics['recall'],
        'F1 Score': metrics['f1_score'],
        'ROC AUC': metrics['roc_auc']
    }
    for metrics in results.values()
])

# Plot comparison
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Accuracy comparison
results_df.plot(x='Model', y='Accuracy', kind='bar', ax=axes[0], color='steelblue', legend=False)
axes[0].set_title('Model Accuracy Comparison', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Accuracy')
axes[0].set_xlabel('')
# A fixed 0.7 floor would hide any model scoring below 0.7; adapt when needed
axes[0].set_ylim([_score_axis_floor(results_df['Accuracy']), 1.0])
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True, alpha=0.3)

# All metrics comparison
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']
x = np.arange(len(results_df))
width = 0.15

# Grouped bars: each metric is shifted by one bar width within a model's group
for idx, metric in enumerate(metrics_to_plot):
    axes[1].bar(x + idx * width, results_df[metric], width, label=metric)

axes[1].set_xlabel('Model')
axes[1].set_ylabel('Score')
axes[1].set_title('All Metrics Comparison', fontsize=14, fontweight='bold')
# Centre the tick under the middle (third of five) bar of each group
axes[1].set_xticks(x + width * 2)
axes[1].set_xticklabels(results_df['Model'], rotation=45, ha='right')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[1].set_ylim([_score_axis_floor(results_df[metrics_to_plot].to_numpy().ravel()), 1.0])

plt.tight_layout()
plt.show()

## Step 6: Train Deep Learning Model

In [None]:
from sklearn.model_selection import train_test_split

# DL wrapper constructed with random_state=42
dl_model = DeepLearningModels(random_state=42)

# Carve a validation set out of the training data, stratified on the labels
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y_train}
dl_splits = train_test_split(X_train, y_train, **split_options)
X_train_dl, X_val_dl, y_train_dl, y_val_dl = dl_splits

print(f"DL Training set: {X_train_dl.shape}")
print(f"DL Validation set: {X_val_dl.shape}")
print(f"DL Test set: {X_test.shape}")

In [None]:
# Fit the MLP on the DL training split, passing the validation split alongside
mlp_settings = {'epochs': 100, 'batch_size': 128}
dl_metrics = dl_model.train_mlp(
    X_train_dl, y_train_dl,
    X_val_dl, y_val_dl,
    **mlp_settings
)

In [None]:
# Plot training history
# `plot_training_history()` is a project helper; its return value is kept in
# `fig` (presumably a matplotlib Figure — confirm against the helper), and
# the pending figure is then rendered.
fig = dl_model.plot_training_history()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Test-set outputs of the DL model (presumably hard labels from predict() and
# scores from predict_proba() — confirm against the wrapper)
y_pred_dl = dl_model.predict(X_test)
y_pred_proba_dl = dl_model.predict_proba(X_test)

report_class_names = ['Not Effective', 'Effective']

print("Deep Learning Model - Test Set Performance:")
dl_accuracy = accuracy_score(y_test, y_pred_dl)
print(f"Accuracy: {dl_accuracy:.4f}")
print("\nClassification Report:")
dl_report = classification_report(y_test, y_pred_dl, target_names=report_class_names)
print(dl_report)

## Step 7: Confusion Matrices

In [None]:
from sklearn.metrics import confusion_matrix

# Test-set predictions from the trainer's best ML model
best_ml_model = ml_models.best_model
y_pred_ml = best_ml_model.predict(X_test)

# Side-by-side panels: best ML model on the left, DL model on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

cm_ml = confusion_matrix(y_test, y_pred_ml)
cm_dl = confusion_matrix(y_test, y_pred_dl)

tick_names = ['Not Effective', 'Effective']
panels = [
    (cm_ml, 'Blues', f'{ml_models.best_model_name} Confusion Matrix'),
    (cm_dl, 'Greens', 'Deep Neural Network Confusion Matrix'),
]

# Both panels share the same layout; only matrix, palette and title differ
for ax, (matrix, palette, heading) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette, ax=ax, cbar=False)
    ax.set_title(heading, fontweight='bold')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    ax.set_xticklabels(tick_names)
    ax.set_yticklabels(tick_names)

plt.tight_layout()
plt.show()

## Step 8: Feature Importance Analysis

In [None]:
# Horizontal bar chart of the 20 largest feature importances, when the best
# model exposes `feature_importances_`; otherwise just say so.
best_estimator = ml_models.best_model

if not hasattr(best_estimator, 'feature_importances_'):
    print(f"{ml_models.best_model_name} does not support feature importance")
else:
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': best_estimator.feature_importances_
    })
    importance_df = importance_df.sort_values('importance', ascending=False).head(20)

    bar_positions = range(len(importance_df))
    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, importance_df['importance'], color='coral')
    plt.yticks(bar_positions, importance_df['feature'])
    plt.xlabel('Importance')
    plt.title(f'Top 20 Feature Importance - {ml_models.best_model_name}',
              fontsize=14, fontweight='bold')
    # Flip the axis so the most important feature sits at the top
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

## Step 9: ROC Curves

In [None]:
from sklearn.metrics import roc_curve, auc

plt.figure(figsize=(10, 8))

# One curve per ML model that offers predict_proba; the others are skipped
for model_name, model in ml_models.models.items():
    if not hasattr(model, 'predict_proba'):
        continue

    # Column 1 of predict_proba is used as the score for roc_curve
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    curve_label = f'{model_name} (AUC = {roc_auc:.3f})'
    plt.plot(fpr, tpr, label=curve_label, linewidth=2)

# Dashed curve for the DL model, built from the scores computed earlier
fpr_dl, tpr_dl, _ = roc_curve(y_test, y_pred_proba_dl)
roc_auc_dl = auc(fpr_dl, tpr_dl)
dl_curve_label = f'Deep Neural Network (AUC = {roc_auc_dl:.3f})'
plt.plot(fpr_dl, tpr_dl, label=dl_curve_label, linewidth=2, linestyle='--')

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier', linewidth=1)

plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves - All Models', fontsize=14, fontweight='bold')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Step 10: Save Models

In [None]:
# Persist every trained artifact under one directory
import os

import joblib

MODEL_DIR = '../models'
# joblib.dump does not create missing parent directories, so make sure the
# target exists before anything is written to it.
os.makedirs(MODEL_DIR, exist_ok=True)

# Save all ML models
ml_models.save_models(output_dir=MODEL_DIR)

# Save DL model
dl_model.save_model(f'{MODEL_DIR}/deep_neural_network.keras')

# Save preprocessor
joblib.dump(preprocessor, f'{MODEL_DIR}/preprocessor.pkl')

# Save feature names
joblib.dump(feature_names, f'{MODEL_DIR}/feature_names.pkl')

# The check mark replaces a mis-decoded UTF-8 sequence in the old message
print("\n✓ All models and preprocessor saved successfully!")

## Step 11: Final Summary

In [None]:
# Metrics for the DL row are imported here so the cell cannot fail with
# NameError: f1_score and roc_auc_score are not imported by any earlier cell
# shown in this notebook, and accuracy_score would otherwise depend on the
# evaluation cell having been run.
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

print("="*70)
print("FINAL MODEL PERFORMANCE SUMMARY")
print("="*70)

# Combine all results: one row per ML model, then one row for the DL model
all_results = []

for model_name, metrics in results.items():
    all_results.append({
        'Model': metrics['model_name'],
        'Type': 'ML',
        'Accuracy': f"{metrics['accuracy']:.4f}",
        'F1 Score': f"{metrics['f1_score']:.4f}",
        'ROC AUC': f"{metrics['roc_auc']:.4f}"
    })

# NOTE(review): roc_auc_score assumes y_pred_proba_dl holds one positive-class
# score per sample — confirm the shape returned by dl_model.predict_proba.
all_results.append({
    'Model': 'Deep Neural Network',
    'Type': 'DL',
    'Accuracy': f"{accuracy_score(y_test, y_pred_dl):.4f}",
    'F1 Score': f"{f1_score(y_test, y_pred_dl):.4f}",
    'ROC AUC': f"{roc_auc_score(y_test, y_pred_proba_dl):.4f}"
})

summary_df = pd.DataFrame(all_results)
print(summary_df.to_string(index=False))

# The emoji and check marks below replace mis-decoded UTF-8 sequences
print("\n" + "="*70)
print(f"🏆 BEST MODEL: {ml_models.best_model_name}")
print(f"   Accuracy: {ml_models.best_score:.4f}")
print("="*70)

print("\n📊 Dataset Statistics:")
print(f"   Total samples: {len(df):,}")
print(f"   Training samples: {len(X_train):,}")
print(f"   Test samples: {len(X_test):,}")
print(f"   Original features: {len(df.columns)}")
print(f"   Engineered features: {len(feature_names)}")

print("\n✅ Training pipeline completed successfully!")
print("\nNext steps:")
print("   1. Run manual_drug_test.py to test with your own drug data")
print("   2. Models are saved in ../models/ directory")
print("   3. Use the best model for predictions")