#**Import the required libraries:**


> #  Think of these as tools we'll use to process data, build our model, and analyze results.




In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import kagglehub

# **Define helper functions:**



> # These functions help us create missing data and calculate how well our imputation works.





In [29]:
def missing_method(raw_data, mechanism='mcar', method='random', missing_threshold=0.2, random_state=42):
    np.random.seed(random_state)
    data = raw_data.copy()
    rows, cols = data.shape
    t = missing_threshold

    if mechanism == 'mcar':
        v = np.random.uniform(size=(rows, cols))
        if method == 'uniform':
            mask = v <= t
        elif method == 'random':
            c = np.zeros(cols, dtype=bool)
            c[np.random.choice(cols, cols // 2, replace=False)] = True
            mask = (v <= t) & c[np.newaxis, :]
        else:
            raise ValueError(f"Unknown method: {method}")
    elif mechanism == 'mnar':
        sample_cols = np.random.choice(cols, 2, replace=False)
        m1, m2 = np.median(data[:, sample_cols], axis=0)
        v = np.random.uniform(size=(rows, cols))
        m = (data[:, sample_cols[0]] <= m1) & (data[:, sample_cols[1]] >= m2)
        mask = v <= t
        if method == 'uniform':
            mask &= m[:, np.newaxis]
        elif method == 'random':
            c = np.zeros(cols, dtype=bool)
            c[np.random.choice(cols, cols // 2, replace=False)] = True
            mask &= m[:, np.newaxis] & c[np.newaxis, :]
        else:
            raise ValueError(f"Unknown method: {method}")
    else:
        raise ValueError(f"Unknown mechanism: {mechanism}")

    data[mask] = np.nan
    return data, mask

In [30]:
def imputation_rmse(clean_data, imputed_data, missing_mask):
    if clean_data.shape != imputed_data.shape:
        raise ValueError("Clean and imputed datasets must have the same shape")

    missing_mask = missing_mask.astype(bool)

    if not np.any(missing_mask):
        raise ValueError("The missing_mask does not contain any True values.")

    errors = clean_data[missing_mask] - imputed_data[missing_mask]
    rmse = np.sqrt(np.mean(errors**2))

    metrics = {
        'rmse': rmse,
        'mae': np.mean(np.abs(errors)),
        'total_missing': np.sum(missing_mask),
        'missing_percentage': np.sum(missing_mask) / missing_mask.size * 100,
        'min_error': np.min(errors) if errors.size > 0 else None,
        'max_error': np.max(errors) if errors.size > 0 else None,
        'std_error': np.std(errors) if errors.size > 0 else None
    }

    return rmse, metrics

# **Load dataset**



> # We are reading a file containing health data into a table format.




In [31]:
# Download latest version
path = kagglehub.dataset_download("elikplim/concrete-compressive-strength-data-set")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/elikplim/concrete-compressive-strength-data-set/versions/1


In [32]:
data_path = "/root/.cache/kagglehub/datasets/elikplim/concrete-compressive-strength-data-set/versions/1/concrete_data.csv"

# **Split into train and test**



> # We divide our data into two parts: one for learning and one for testing if the learning worked.





In [33]:
# Load dataset
diabetes = pd.read_csv(data_path)

In [34]:
# Split into train and test
train_set, test_set = train_test_split(diabetes, test_size=0.2, random_state=42)

# **Preprocessing pipeline**



> # A pipeline is like a recipe. Here, we're setting up steps to clean and prepare our data.




In [35]:
preprocessing_pipeline = Pipeline([
    ("scaler", StandardScaler()),
])

# **Fit preprocessing on train and transform train and test**


> # We "teach" the pipeline using the training data and then apply it to both train and test data.




In [36]:
# Fit preprocessing on train and transform train and test
preprocessed_train_set = preprocessing_pipeline.fit_transform(train_set)
preprocessed_test_set = preprocessing_pipeline.transform(test_set)

# **Verify preprocessed data**

> #  We check if everything looks okay after preprocessing, especially for missing values.





In [37]:
# Verify preprocessed data
if np.isnan(preprocessed_train_set).any():
    print("Warning: Training data contains NaNs after preprocessing!")
if np.isnan(preprocessed_test_set).any():
    print("Warning: Test data contains NaNs after preprocessing!")

# **Generate missing data in test set**

> # Here, we artificially create missing data in our test set to simulate real-world scenarios.





In [38]:
# Generate missing data in test set
missing_test_set, missing_mask = missing_method(preprocessed_test_set, mechanism='mcar', method='random', missing_threshold=0.4)

In [39]:
# Check for NaNs in missing data
if np.isnan(missing_test_set).any():
    print("Warning: Missing test set contains NaNs!")
if np.isinf(missing_test_set).any():
    print("Warning: Missing test set contains infinite values!")



# **Autoencoder model**


> # This is our  model that will learn how to fill in missing values.




In [40]:
# Autoencoder model
input_dim = preprocessed_train_set.shape[1]
input_layer = Input(shape=(input_dim,))
encoder = Dense(64, activation="relu")(input_layer)
encoder = Dense(32, activation="relu")(encoder)

decoder = Dense(64, activation="relu")(encoder)
decoder = Dense(input_dim, activation="linear")(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# **Train autoencoder**
# We show our model the training data multiple times until it learns to process it well.


In [41]:
# Train autoencoder
history = autoencoder.fit(preprocessed_train_set, preprocessed_train_set, epochs=50, batch_size=32, shuffle=True, validation_split=0.2, verbose=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# **Check training history**
# After training, we check how well the model did during learning.

In [42]:
# Check training history
print("Training Loss:", history.history['loss'][-1])
print("Validation Loss:", history.history['val_loss'][-1])

Training Loss: 0.28529223054647446
Validation Loss: 0.39678732864558697


# **Impute missing values**
# We use our trained model to guess the missing values in the test set.

In [43]:
# Impute missing values
imputed_test_set = autoencoder.predict(missing_test_set)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


# **Handle NaN values in the imputed data**
# If the guesses from the model have NaNs, we replace them with the average values of the dataset.


In [44]:
# Handle NaN values in the imputed data
if np.isnan(imputed_test_set).any():
    print("Warning: Imputed data contains NaNs. Applying fallback imputation.")
    imputed_test_set = np.where(np.isnan(imputed_test_set), np.nanmean(imputed_test_set, axis=0), imputed_test_set)



# **Evaluate performance**
# We calculate how close the guesses are to the original values using RMSE and other metrics.


In [45]:
# Evaluate performance
clean_test_set = preprocessed_test_set
rmse, metrics = imputation_rmse(clean_test_set, imputed_test_set, missing_mask)

In [46]:
print("RMSE:", rmse)
print("Metrics:", metrics)

RMSE: 1.0574767758678276
Metrics: {'rmse': 1.0574767758678276, 'mae': 0.8049457653153316, 'total_missing': 331, 'missing_percentage': 17.853290183387273, 'min_error': -1.9296161723675702, 'max_error': 5.3609976045019385, 'std_error': 1.0539464881148253}
