# Step 2 — Baseline Model (OLS)

We fit a simple linear regression model (**OLS**) to test whether **AlogP** explains variability in **pIC50**.

**Input:** `data/processed/ca2_curated.csv`  
**Outputs:** `results/ols_summary.txt`, `results/ols_metrics.csv`, `results/ols_params.csv`

In [None]:
from pathlib import Path
import pandas as pd
import statsmodels.api as sm

# Locations used by this notebook; both are relative to the current
# working directory.
OUT_DIR = Path('results')
DATA_IN = Path('data/processed/ca2_curated.csv')

# Load the input table and preview its first rows as the cell output.
df = pd.read_csv(DATA_IN)
df.head()

In [None]:
# Response variable and design matrix: pIC50 is regressed on AlogP, with
# a constant column added by `add_constant` so the fit has an intercept.
y = df['pIC50']
X = sm.add_constant(df['AlogP'])

# Fit ordinary least squares and print the full regression report.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

In [None]:
# Make sure the output directory exists before anything is written to it.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) Plain-text copy of the regression summary.
summary_text = model.summary().as_text()
(OUT_DIR / 'ols_summary.txt').write_text(summary_text, encoding='utf-8')

# 2) One row per fitted term: estimate, standard error, t statistic, p-value.
per_term_series = {
    'estimate': model.params,
    'std_error': model.bse,
    't_value': model.tvalues,
    'p_value': model.pvalues,
}
params = pd.DataFrame({
    'term': model.params.index,
    **{column: series.values for column, series in per_term_series.items()},
})
params.to_csv(OUT_DIR / 'ols_params.csv', index=False)

# 3) Single-row table of fit-level statistics, cast to plain Python numbers.
fit_stats = {
    'n_obs': int(model.nobs),
    'r2': float(model.rsquared),
    'r2_adj': float(model.rsquared_adj),
    'sigma2': float(model.scale),
    # Square root of the residual mean squared error.
    'rmse': float(model.mse_resid ** 0.5),
    'aic': float(model.aic),
    'bic': float(model.bic),
}
metrics = pd.DataFrame([fit_stats])
metrics.to_csv(OUT_DIR / 'ols_metrics.csv', index=False)

# Display the metrics table as the cell output.
metrics