In [42]:
import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

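Why these imports:
1) `enable_iterative_imputer` must be imported before `IterativeImputer`: the estimator is still experimental in scikit-learn and is only exposed after this explicit opt-in
2) `LinearRegression` is the estimator used to predict the missing values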

In [43]:
# Keep only the numerical feature columns of the startups data set.
numeric_cols = ['R&D Spend', 'Administration', 'Marketing Spend']
df = pd.read_csv("50_Startups.csv")[numeric_cols]

In [45]:
# Punch a few holes into an independent copy so there is something for MICE
# to impute. Skip this cell if the data already contains NaNs.
df = df.copy()
for row_pos, col_pos in [(1, 0), (3, 1), (-1, -1)]:
    # Positional indexing: (row, column); negative values count from the end.
    df.iloc[row_pos, col_pos] = np.nan

Below is the direct implementation, which takes care of the following for us:

1) Mean initialization
2) Column-by-column modeling
3) Iterations
4) Convergence checks  



In [47]:
# Settings for the iterative (MICE-style) imputer, collected in one place:
# a linear model fills each feature, with at most 10 rounds and an early-stop
# tolerance of 1e-3.
mice_settings = {
    "estimator": LinearRegression(),
    "max_iter": 10,
    "tol": 1e-3,
    "random_state": 42,
}
imputer = IterativeImputer(**mice_settings)

In [49]:
# Fit the imputer on df and fill its missing entries in a single call.
# The result is wrapped back into a DataFrame in a later cell.
df_imputed = imputer.fit_transform(df)

What happens internally during the `fit_transform` call above (important):

1) Mean imputation (0th iteration)
2) Predict col1 using others
3) Predict col2 using others
4) Predict col3 using others
5) Repeat steps 2–4
6) Stop at convergence or max_iter

In [50]:
# Re-attach the original row index and column labels to the imputed values
# so the result is a DataFrame again.
df_imputed = pd.DataFrame(data=df_imputed, index=df.index, columns=df.columns)

In [51]:
# Per-column count of values that are still missing after imputation.
df_imputed.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
dtype: int64

In [52]:
# Preview the imputed frame; positions (1, 'R&D Spend') and
# (3, 'Administration') were set to NaN earlier, so they now hold imputed values.
df_imputed.head(12)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,149061.550582,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,129128.580058,383199.62
4,142107.34,91391.77,366168.42
5,131876.9,99814.71,362861.36
6,134615.46,147198.87,127716.82
7,130298.13,145530.06,323876.68
8,120542.52,148718.95,311613.29
9,123334.88,108679.17,304981.62


In [53]:
# Reload the untouched feature columns so the imputed values can be compared
# against the true ones.
reference_cols = ['R&D Spend', 'Administration', 'Marketing Spend']
df1 = pd.read_csv("50_Startups.csv", usecols=reference_cols)

In [54]:
# Preview the original (never modified) values for comparison with df_imputed.
df1.head(12)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42
5,131876.9,99814.71,362861.36
6,134615.46,147198.87,127716.82
7,130298.13,145530.06,323876.68
8,120542.52,148718.95,311613.29
9,123334.88,108679.17,304981.62


In [55]:
# Fitted attribute inspected to check convergence (max_iter was set to 10 above).
imputer.n_iter_

3

### How Iterative Imputer Checks Convergence (`tol`)

At the end of each iteration, Iterative Imputer compares the newly imputed values with those from the previous iteration.

Only the values that were originally missing are considered.

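The maximum absolute difference is computed and normalized by the largest absolute observed (non-missing) value:

$$\frac{\max\left|X^{(t)} - X^{(t-1)}\right|}{\max\left|X_{\text{known}}\right|} < \text{tol}$$

So `tol` is a relative tolerance: with features on the order of $10^5$ (as in this dataset), `tol=1e-3` allows absolute changes on the order of $10^2$ between iterations.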

If this condition is satisfied, the imputation process is considered converged and stops early.
Otherwise, the algorithm continues until `max_iter` is reached.

---

### How to check convergence in practice

After fitting the imputer:

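```python
imputer.n_iter_
```

If `n_iter_` is smaller than `max_iter`, the imputer stopped early because the tolerance was met.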


## Iterative Imputer in Pipelines

In [72]:
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [66]:
# Reload the full data set and split it into the feature matrix and the target.
feature_cols = ['R&D Spend', 'Administration', 'Marketing Spend']
df = pd.read_csv("50_Startups.csv")
X, y = df[feature_cols], df['Profit']

In [68]:
# Train–test split
# `train_test_split` is not imported by any other cell shown in this notebook,
# so import it here; without this the cell raises NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [59]:
# # Build the Pipeline
# pipe = Pipeline([
#     ('scaler', StandardScaler()),
#     ('imputer', IterativeImputer(
#         estimator=LinearRegression(),
#         max_iter=10,
#         tol=1e-3,
#         random_state=42
#     ))
# ])

In [60]:
# X_train_imputed = pipe.fit_transform(X_train)
# X_test_imputed = pipe.transform(X_test)

In [73]:
# End-to-end pipeline: scale the features, impute missing values, then fit
# the final regression model.
mice_step = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=10,
    tol=1e-3,
    random_state=42,
)
full_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', mice_step),
    ('model', LinearRegression()),
])

In [74]:
# Fit every pipeline step on the training split only; the returned pipeline
# is displayed as the cell output.
full_pipe.fit(X_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('imputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,estimator,LinearRegression()
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [75]:
# Run the held-out features through the fitted pipeline to get predictions.
y_pred = full_pipe.predict(X_test)

In [77]:
from sklearn.metrics import r2_score

# R² of the pipeline's predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.900065308303732