In [None]:
# Step 1: Data Preprocessing
#
# Runs the project's preprocessing pipeline on the raw binding-affinity CSV,
# keeps the resulting train/test split in the kernel for the later steps, and
# writes the processed table to disk.
import pandas as pd
from data_preprocessing import preprocess_pipeline

# The pipeline returns five objects: the split features/targets plus the
# processed table. test_size / random_state are passed straight through
# (presumably a 20% hold-out with a fixed seed -- confirm in data_preprocessing).
pipeline_outputs = preprocess_pipeline(
    filepath='dataset/binding_affinity_data.csv',
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test, df_processed = pipeline_outputs

# Save processed data (index column is not written)
processed_frame = pd.DataFrame(df_processed)
processed_frame.to_csv('dataset/processed_data.csv', index=False)

print('Processed data saved to: dataset/processed_data.csv')
print('Processed shape:', df_processed.shape)


In [None]:
# Step 2: Feature Engineering (fingerprints + protein features)
#
# Builds feature tables for the train and test splits, restricts both to the
# columns they have in common, imputes missing values with the training-set
# column means, attaches the target column, and writes both tables to CSV.
# Requires X_train, X_test, y_train, y_test from Step 1.
from feature_engineering import combine_features

print('Creating features for training set...')
X_train_features = combine_features(X_train, use_fingerprints=True, n_bits=512)
print('Creating features for test set...')
X_test_features = combine_features(X_test, use_fingerprints=True, n_bits=512)

# Align columns between train and test: keep only columns present in both
common_cols = X_train_features.columns.intersection(X_test_features.columns)
X_train_features = X_train_features[common_cols]
X_test_features = X_test_features[common_cols]

# Handle missing values. The column means are computed once, from the training
# split only, and the very same statistics are applied to both splits so the
# test set never contributes to (or diverges from) the imputation values.
# NOTE(review): a column that is entirely NaN in train has a NaN mean and is
# therefore left unfilled in both splits -- confirm whether that can happen.
train_means = X_train_features.mean()
X_train_features = X_train_features.fillna(train_means)
X_test_features = X_test_features.fillna(train_means)

# Attach targets and save. Raw values are used when available so a pandas
# target is assigned by position rather than aligned on its index; targets
# without a `.values` attribute are assigned as-is (same guard Step 3 uses).
# Assumes combine_features keeps the row order/count of its input -- TODO confirm.
X_train_features['binding_affinity'] = y_train.values if hasattr(y_train, 'values') else y_train
X_test_features['binding_affinity'] = y_test.values if hasattr(y_test, 'values') else y_test

X_train_features.to_csv('dataset/features_train.csv', index=False)
X_test_features.to_csv('dataset/features_test.csv', index=False)

print('Saved features to dataset/features_train.csv and dataset/features_test.csv')
print('Train features shape:', X_train_features.shape)
print('Test features shape:', X_test_features.shape)


In [None]:
# Step 3: Train models and save outputs
#
# Trains every model on the engineered features, writes a comparison table,
# produces an evaluation report per model, and persists the model with the
# lowest RMSE. Requires X_train_features, X_test_features, y_train, y_test
# from the previous steps.
import os

import numpy as np  # not used in this cell; kept because later cells may rely on `np`

from models import train_all_models, compare_models, save_model
from evaluation import create_evaluation_report

# Create the output folders up front: DataFrame.to_csv does not create missing
# directories, and failing on that only after all models have been trained
# would throw the training work away.
for out_dir in ('results', 'models'):
    os.makedirs(out_dir, exist_ok=True)

# Separate features/targets (Step 2 attached the target column for saving)
Xtr = X_train_features.drop('binding_affinity', axis=1)
Xte = X_test_features.drop('binding_affinity', axis=1)
ytr = y_train
yte = y_test

# Train all models
models_dict, results = train_all_models(Xtr, Xte, ytr, yte)

# Compare and save model comparison
comparison_df = compare_models(results)
comparison_df.to_csv('results/model_comparison.csv', index=False)
print('Saved model comparison to results/model_comparison.csv')

# `results` maps model name -> metrics; each metrics entry is indexed by
# 'rmse' and 'predictions' below. The best model is the one with lowest RMSE.
best_model_name = min(results, key=lambda name: results[name]['rmse'])

# Evaluation reports: the ground-truth array is the same for every model, so
# unwrap it once instead of on every iteration.
y_true = yte.values if hasattr(yte, 'values') else yte
for model_name, metrics in results.items():
    create_evaluation_report(y_true, metrics['predictions'], model_name=model_name, save_dir='results')

# Save the best model. An entry in models_dict is either a bare model or a
# (model, scaler) tuple; in the tuple case the scaler is handed to save_model too.
best_model_path = f'models/{best_model_name.lower().replace(" ", "_")}.pkl'
mobj = models_dict[best_model_name]
if isinstance(mobj, tuple):
    model, scaler = mobj
    save_model(model, best_model_path, scaler)
else:
    save_model(mobj, best_model_path)

print('All done. Files saved in dataset/, models/, and results/.')
