# BioInsight Lite: End-to-End Workflow
This notebook walks through data loading, exploratory data analysis (EDA), preprocessing, modeling (logistic regression and XGBoost), and evaluation using ROC AUC.

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import xgboost as xgb

# Load Data
# The path is relative to the notebook's working directory.
DATA_PATH = '../data/sample_bioactivity.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Fail fast: every later cell needs `df`, so continuing after a missing
    # file would only surface as a confusing NameError further down.
    raise FileNotFoundError(
        f"Data not found at '{DATA_PATH}', check path"
    ) from exc
print(f"Loaded {df.shape}")

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Tabulate the number of rows per `is_active` value, then draw the same
# counts as a bar chart.
class_counts = df['is_active'].value_counts()
print(class_counts)

ax = sns.countplot(data=df, x='is_active')
ax.set_title('Class Distribution')
plt.show()

In [None]:
# Histogram of `mw_freebase` with a KDE overlay, coloured by `is_active`,
# drawn on an explicitly created 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='mw_freebase', hue='is_active', kde=True, ax=ax)
ax.set_title('Molecular Weight Distribution by Activity')
plt.show()

## 3. Modeling

In [None]:
# Preprocessing
# Columns removed before building the feature matrix; the target `is_active`
# is among them so it cannot end up in X.
# NOTE(review): `pchembl_value` is presumably what `is_active` was derived
# from, in which case keeping it would leak the label — confirm against the
# data dictionary.
drop_cols = ['activity_id', 'assay_id', 'is_active', 'pchembl_value',
             'confidence_score', 'standard_type', 'target_type', 'organism']
# errors='ignore' already skips names that are absent from df, so no manual
# membership filter is needed.
X = df.drop(columns=drop_cols, errors='ignore')
# Keep only numeric columns; the estimators fitted below receive X as-is.
X = X.select_dtypes(include=[np.number])
y = df['is_active']

# Stratify on the label so the train and test sets keep the same class
# proportions as the full data; otherwise the held-out AUC can be skewed by
# an unlucky split when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train shape: {X_train.shape}")

In [None]:
# Logistic Regression
# Fit on the training split with max_iter raised to 1000.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second class in
# lr.classes_ (presumably is_active == 1 — confirm the label encoding).
lr_preds = lr.predict_proba(X_test)[:, 1]

lr_auc = roc_auc_score(y_test, lr_preds)
print(f"Logistic Regression AUC: {lr_auc:.4f}")

In [None]:
# XGBoost
# Gradient-boosted classifier with log-loss as its evaluation metric; all
# other hyperparameters are left at the library defaults.
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Score the held-out split using the probability in column 1 of predict_proba
# (presumably is_active == 1 — confirm the label encoding).
xgb_preds = xgb_model.predict_proba(X_test)[:, 1]

xgb_auc = roc_auc_score(y_test, xgb_preds)
print(f"XGBoost AUC: {xgb_auc:.4f}")