In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt



In [None]:
# Read the health dataset from disk into the working frame.
df = pd.read_csv('health_data.csv')
# Count the missing entries in each column before any imputation is applied.
missing_per_column = df.isnull().sum()
print(missing_per_column)


In [None]:
# Mean imputation: every missing entry is replaced using the 'mean' strategy.
mean_imputer = SimpleImputer(strategy='mean')
mean_filled_values = mean_imputer.fit_transform(df)
# Wrap the imputed values in a DataFrame that carries the original column labels.
df_mean_imputed = pd.DataFrame(mean_filled_values, columns=df.columns)


In [None]:
# KNN imputation: missing entries are filled by a KNNImputer using 5 neighbours.
knn_imputer = KNNImputer(n_neighbors=5)
knn_filled_values = knn_imputer.fit_transform(df)
# Wrap the imputed values in a DataFrame that carries the original column labels.
df_knn_imputed = pd.DataFrame(knn_filled_values, columns=df.columns)


In [None]:
# Regression imputation for 'cholesterol': fit a linear model on the rows where
# cholesterol is known and use it to fill in the rows where it is missing.
# NOTE: this overwrites df['cholesterol'] in place, so every later use of `df`
# (including `df.dropna()`) sees the imputed values rather than the raw data.

# Separate the dataset into one with missing values and one without
cholesterol_missing = df['cholesterol'].isnull()
df_with_cholesterol = df[~cholesterol_missing]
df_missing_cholesterol = df[cholesterol_missing]

# Train a linear regression model on every other column
X = df_with_cholesterol.drop('cholesterol', axis=1)
y = df_with_cholesterol['cholesterol']
# LinearRegression rejects NaN inputs, so fit only on rows whose predictors are
# complete. When the predictors have no gaps this keeps every row unchanged.
complete_rows = X.notnull().all(axis=1)
X, y = X[complete_rows], y[complete_rows]
regressor = LinearRegression()
regressor.fit(X, y)

# Predict missing cholesterol values.
# Skip the step when nothing is missing (e.g. the cell is re-run after `df` was
# already filled): predict() raises on a frame with zero rows.
if not df_missing_cholesterol.empty:
    X_missing = df_missing_cholesterol.drop('cholesterol', axis=1)
    if X_missing.isnull().any().any():
        # Gaps in the predictors of the rows being imputed are filled with the
        # training-set column means so predict() does not fail on NaN.
        X_missing = X_missing.fillna(X.mean(numeric_only=True))
    predicted_cholesterol = regressor.predict(X_missing)
    df.loc[cholesterol_missing, 'cholesterol'] = predicted_cholesterol


In [None]:
# Function to calculate MSE after imputation
def calculate_mse(data: pd.DataFrame, target: str, test_size=0.2, random_state=42) -> float:
    """Return the held-out MSE of a linear regression that predicts ``target``.

    The frame is split into train and test parts with ``train_test_split``, a
    ``LinearRegression`` is fitted on the training part using every column
    other than ``target`` as a predictor, and the mean squared error of its
    predictions on the test part is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Complete (NaN-free) table containing the predictors and ``target``.
    target : str
        Name of the column to predict.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    ValueError
        If any column of ``data`` still contains missing values.
    """
    features = data.drop(target, axis=1)
    labels = data[target]

    # Fail early with the offending column names instead of letting the
    # estimator reject the NaNs with a less specific message.
    incomplete = data.columns[data.isnull().any().to_numpy()].tolist()
    if incomplete:
        raise ValueError(
            f"calculate_mse requires complete data; columns with missing values: {incomplete}"
        )

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return mean_squared_error(y_test, predictions)

# Calculate and compare MSE
# Baseline: drop incomplete rows from a fresh read of the CSV. `df` cannot be
# used for this because the regression-imputation step has already overwritten
# its missing cholesterol values in place; `df.dropna()` would therefore keep
# imputed rows and the "Original" bar would no longer be a no-imputation baseline.
df_complete_cases = pd.read_csv('health_data.csv').dropna()
mse_original = calculate_mse(df_complete_cases, 'weight')
mse_mean = calculate_mse(df_mean_imputed, 'weight')
mse_knn = calculate_mse(df_knn_imputed, 'weight')
mse_regression = calculate_mse(df, 'weight')  # df holds the regression-imputed cholesterol

# Keep each label next to its score so the bar order cannot drift apart.
mse_by_method = {
    'Original': mse_original,
    'Mean': mse_mean,
    'KNN': mse_knn,
    'Regression': mse_regression,
}

# Plotting the MSEs for comparison
fig, ax = plt.subplots()
ax.bar(list(mse_by_method.keys()), list(mse_by_method.values()))
ax.set_ylabel('MSE')
ax.set_title('MSE Comparison Among Imputation Methods')
plt.show()
