# Marimo Interactive Data Analysis Notebook

_This notebook demonstrates variable dependencies, an interactive slider, and dynamic markdown output._

### Author contact (comment):
`# 22f3002631@ds.study.iitm.ac.in`


In [None]:
# 22f3002631@ds.study.iitm.ac.in  <-- email as a comment
# Cell 1: Data generation and helper functions
# This cell produces the dataset and exposes variables used by later cells.
import numpy as np
import pandas as pd

# Seed the legacy global NumPy RNG so both noise draws below are repeatable.
np.random.seed(42)
n = 200

# Synthetic data:
#   x     - evenly spaced grid on [0, 10]
#   z     - second explanatory variable, correlated with x (0.5 * x plus unit-scale noise)
#   noise - observation noise on y (scale 2.0), drawn after z's noise
#   y     - linear in both x and z, plus noise
x = np.linspace(0, 10, n)
z = 0.5 * x + np.random.normal(0.0, 1.0, n)
noise = np.random.normal(0.0, 2.0, n)
y = 3.0 * x + 1.5 * z + noise

# Column order (x, z, y) follows keyword order.
df = pd.DataFrame(dict(x=x, z=z, y=y))

# Helper: compute a simple linear model fit of y on x (optionally include z)
def fit_coefficients(include_z=False, data=None):
    """Fit a linear regression for ``y`` and return its parameters.

    Parameters
    ----------
    include_z : bool, default False
        If True, fit ``y ~ x + z``; otherwise fit ``y ~ x``.
    data : pandas.DataFrame, optional
        Frame providing the ``x`` and ``y`` columns (and ``z`` when
        ``include_z`` is True). When omitted, the notebook-level ``df`` is
        used; it is looked up at call time, so a re-run of the data cell is
        picked up by the next call.

    Returns
    -------
    tuple
        ``(intercept, coefs)``: the fitted intercept and a list of slope
        coefficients in predictor order, ``[x]`` or ``[x, z]``.
    """
    # Function-scope import: scikit-learn is only needed when a fit is requested.
    from sklearn.linear_model import LinearRegression

    frame = df if data is None else data
    predictors = ['x', 'z'] if include_z else ['x']
    model = LinearRegression().fit(frame[predictors], frame['y'])
    return model.intercept_, model.coef_.tolist()

# Data flow: df -> fit_coefficients -> baseline coefficients shown as this cell's output.
intercept_x, coef_x = fit_coefficients()        # y ~ x (include_z defaults to False)
intercept_xz, coef_xz = fit_coefficients(True)  # y ~ x + z

# Last expression of the cell: display all four baseline values.
(intercept_x, coef_x, intercept_xz, coef_xz)


## Cell dependency demonstration

The previous cell created `df` and `fit_coefficients`, and computed the baseline coefficients (`intercept_x`, `coef_x`, `intercept_xz`, `coef_xz`).
The next code cell reads `df` (and `np`) from that cell. This demonstrates variable dependency: if you re-run Cell 1 with different data, the dependent cell picks up the new `df` the next time it runs or one of its widgets changes.

In [None]:
# Cell 2: Interactive controls and dynamic markdown output
# This cell depends on variables and functions defined in Cell 1 (df, fit_coefficients).
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output
import matplotlib.pyplot as plt

# Controls for the interactive fit:
#   slider   - fraction of df's rows to sample for the model (0.1 .. 1.0 in steps of 0.1)
#   checkbox - whether z is included as a second predictor
slider = widgets.FloatSlider(
    value=1.0,
    min=0.1,
    max=1.0,
    step=0.1,
    description='Data fraction:',
    continuous_update=False,
)
checkbox = widgets.Checkbox(description='Include z in model', value=False)

# Output widget intended to hold the rendered results of the callback.
out = widgets.Output()

def on_change(change):
    """Refit the model for the current widget state and redraw the results in `out`.

    Registered below as the observer for both `slider` and `checkbox`, and also
    called once directly with ``None`` to draw the initial view.

    Parameters
    ----------
    change : dict or None
        Change notification passed by the widget; it is not inspected, the
        current values of `slider` and `checkbox` are read instead.
    """
    # Reads `df` from Cell 1 at call time -> shows the variable dependency.
    frac = slider.value
    include_z = checkbox.value
    # At least 2 rows so a line can be fitted, but never more rows than df holds
    # (df.sample raises if asked for more rows than exist).
    m = min(len(df), max(2, int(len(df) * frac)))
    # Fixed random_state: a given fraction always selects the same rows.
    sub = df.sample(n=m, random_state=1)

    # The fit is done inline on `sub`; fit_coefficients from Cell 1 is not called here.
    from sklearn.linear_model import LinearRegression
    predictors = ['x', 'z'] if include_z else ['x']
    lr = LinearRegression().fit(sub[predictors], sub['y'])
    intercept = lr.intercept_
    coefs = lr.coef_.tolist()

    # Render everything inside `out`: clear_output then only clears the results
    # area (not the slider/checkbox), and output produced from widget callbacks
    # is captured where the user can see it.
    with out:
        # Dynamic markdown based on widget state
        clear_output(wait=True)
        display(Markdown(f"**Model on {m} samples**  "))
        if include_z:
            display(Markdown(f"Intercept: `{intercept:.3f}`; Coefficients: `x={coefs[0]:.3f}`, `z={coefs[1]:.3f}`"))
        else:
            display(Markdown(f"Intercept: `{intercept:.3f}`; Coefficient x: `{coefs[0]:.3f}`"))

        # Simple diagnostic plot: sampled points plus the fitted line over x.
        fig, ax = plt.subplots(figsize=(6,3))
        ax.scatter(sub['x'], sub['y'], alpha=0.6)
        xs = np.linspace(sub['x'].min(), sub['x'].max(), 100)
        if include_z:
            # z is not on the plot axes, so substitute its noise-free value from
            # Cell 1's generating relationship (z = 0.5 * x + noise).
            zs = 0.5 * xs
            preds = intercept + coefs[0]*xs + coefs[1]*zs
        else:
            preds = intercept + coefs[0]*xs
        ax.plot(xs, preds)
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        plt.show()
        # Release the figure so repeated widget changes do not accumulate open figures.
        plt.close(fig)

slider.observe(on_change, names='value')
checkbox.observe(on_change, names='value')

# Initial display: controls first, then the results area that on_change fills.
display(widgets.HBox([slider, checkbox]), out)
on_change(None)

# Comments about data flow:
# - This cell reads `df` (and `np`) defined in Cell 1; the model is refit inline in `on_change`.
# - Changing the slider or checkbox causes re-sampling of `df` and refitting the model.
# - Therefore: Cell 2 depends on the state produced by Cell 1.


### Notes on interactivity and reproducibility

- Re-run **Cell 1** to change the dataset or helper functions. Dependent cells (e.g., Cell 2) will reflect the updates once they are re-run or one of their controls is changed.
- The slider controls how much of the dataset is used; the checkbox toggles inclusion of `z` in the model.
- Comments in code document the data flow between cells.
