# Step 3 — Diagnostics

We check standard regression assumptions using:
- Residuals vs fitted values
- Normal Q-Q plot
- Residual histogram

**Input:** `data/processed/ca2_curated.csv`  
**Outputs:** `residuals_vs_fitted.png`, `qqplot.png`, and `residuals_hist.png` in `figures/`

In [None]:
from pathlib import Path
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy.stats as stats

# Where the curated input table lives and where the figures are written.
DATA_IN = Path('data/processed/ca2_curated.csv')
FIG_DIR = Path('figures')

# Fit pIC50 on AlogP (plus an intercept column) by ordinary least squares.
df = pd.read_csv(DATA_IN)
X = sm.add_constant(df['AlogP'])
y = df['pIC50']
model = sm.OLS(endog=y, exog=X).fit()

# The plotting cells below all work from these two series.
residuals, fitted = model.resid, model.fittedvalues

# Make sure the output directory (and any missing parents) exists
# before the first savefig call.
FIG_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Residuals vs fitted
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(fitted, residuals, alpha=0.6)
# Dashed red reference line at a residual of zero.
ax.axhline(0, linestyle='--', color='red', linewidth=1)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax.set_title('Residuals vs Fitted')
fig.tight_layout()
# Save before show so the file is written from the fully drawn figure.
fig.savefig(FIG_DIR / 'residuals_vs_fitted.png', dpi=200)
plt.show()

In [None]:
# Q-Q plot
fig, ax = plt.subplots(figsize=(7, 5))
# probplot draws the ordered residuals against normal quantiles, together
# with its fitted reference line, directly onto the axes it is given.
stats.probplot(residuals, dist='norm', plot=ax)
# Replace the title that probplot sets by default.
ax.set_title('Normal Q-Q Plot (Residuals)')
fig.tight_layout()
fig.savefig(FIG_DIR / 'qqplot.png', dpi=200)
plt.show()

In [None]:
# Histogram
fig, ax = plt.subplots(figsize=(7, 5))
# 30 equal-width bins over the residual range, outlined for legibility.
ax.hist(residuals, bins=30, edgecolor='black', alpha=0.85)
ax.set_xlabel('Residual')
ax.set_ylabel('Frequency')
ax.set_title('Residuals Histogram')
fig.tight_layout()
fig.savefig(FIG_DIR / 'residuals_hist.png', dpi=200)
plt.show()