# Matrix Factorization:

- It efficiently imputes large amounts of missing data by exploiting the latent structure of the data
- It is particularly effective at imputing missing values for continuous variables

In [1]:
pip install fancyimpute

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29880 sha256=4dc6450022136364d1036801fd450041b1f3291a210d9b6246bb5fa008f33da1
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  C

In [2]:
import numpy as np
import pandas as pd
from fancyimpute import SoftImpute

# Hyperparameters
input_dim = 5  # Number of features, i.e. columns of the synthetic data matrix generated below

# Utility function to introduce missing values
def introduce_missing_values(data, missing_rate=0.2):
    """Return a copy of ``data`` with randomly chosen entries replaced by NaN.

    Every entry is removed independently with probability ``missing_rate``.
    The draw uses NumPy's global random state, so call ``np.random.seed``
    beforehand for a reproducible mask. ``data`` itself is never modified.

    Parameters
    ----------
    data : numpy.ndarray (or any object exposing ``.copy()`` and ``.shape``
        that supports boolean-mask assignment)
        The complete data.
    missing_rate : float, default 0.2
        Probability that an individual entry is removed. It is passed to
        ``np.random.binomial`` as ``1 - missing_rate``.

    Returns
    -------
    data_with_missing
        Copy of ``data`` holding NaN wherever ``mask == 0``. Integer and
        boolean ndarrays are promoted to ``float`` because those dtypes
        cannot represent NaN.
    mask : numpy.ndarray of int
        Same shape as ``data``; 1 marks a kept entry, 0 a removed one.
    """
    data_with_missing = data.copy()
    # NaN exists only for floating-point dtypes; without this promotion the
    # assignment below fails for integer input.
    if isinstance(data_with_missing, np.ndarray) and data_with_missing.dtype.kind in "iub":
        data_with_missing = data_with_missing.astype(float)
    mask = np.random.binomial(1, 1 - missing_rate, data.shape)
    data_with_missing[mask == 0] = np.nan
    return data_with_missing, mask

# Build a reproducible synthetic matrix, then blank out entries at a 20% rate
np.random.seed(0)
data = np.random.rand(1000, input_dim)
data_with_missing, mask = introduce_missing_values(data, missing_rate=0.2)

# DataFrame versions of the complete data and of the data with gaps
original_df, df = pd.DataFrame(data), pd.DataFrame(data_with_missing)

# Fill the gaps with SoftImpute and keep the original column labels
imputer = SoftImpute()
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Show the first five rows of each stage for a side-by-side comparison
for heading, rows in (
    ("Original Data:", data[:5]),
    ("Data with Missing Values:", data_with_missing[:5]),
    ("Imputed Data:", df_imputed[:5]),
):
    print(heading)
    print(rows)

[SoftImpute] Max Singular Value of X_init = 30.296027
[SoftImpute] Iter 1: observed MAE=0.016558 rank=5
[SoftImpute] Iter 2: observed MAE=0.016610 rank=5
[SoftImpute] Iter 3: observed MAE=0.016662 rank=5
[SoftImpute] Iter 4: observed MAE=0.016712 rank=5
[SoftImpute] Iter 5: observed MAE=0.016761 rank=5
[SoftImpute] Iter 6: observed MAE=0.016808 rank=5
[SoftImpute] Iter 7: observed MAE=0.016853 rank=5
[SoftImpute] Iter 8: observed MAE=0.016896 rank=5
[SoftImpute] Iter 9: observed MAE=0.016937 rank=5
[SoftImpute] Iter 10: observed MAE=0.016976 rank=5
[SoftImpute] Iter 11: observed MAE=0.017013 rank=5
[SoftImpute] Iter 12: observed MAE=0.017049 rank=5
[SoftImpute] Iter 13: observed MAE=0.017082 rank=5
[SoftImpute] Iter 14: observed MAE=0.017114 rank=5
[SoftImpute] Iter 15: observed MAE=0.017144 rank=5
[SoftImpute] Iter 16: observed MAE=0.017173 rank=5
[SoftImpute] Iter 17: observed MAE=0.017200 rank=5
[SoftImpute] Iter 18: observed MAE=0.017225 rank=5
[SoftImpute] Iter 19: observed MAE=0.

In [3]:
from sklearn.metrics import mean_squared_error

# MSE calculation function
def calculate_mse(true_values, imputed_values, missing_mask=None):
    """Mean squared error between ground-truth and imputed values.

    Parameters
    ----------
    true_values : array-like
        The complete data before any values were removed.
    imputed_values : array-like
        The data after imputation, same shape as ``true_values``.
    missing_mask : boolean array-like, optional
        True where a value had been removed. When given, only those entries
        are scored. The default ``None`` scores every entry.

    Returns
    -------
    The value returned by ``sklearn.metrics.mean_squared_error``.
    """
    if missing_mask is None:
        return mean_squared_error(true_values, imputed_values)
    selected = np.asarray(missing_mask, dtype=bool)
    return mean_squared_error(
        np.asarray(true_values)[selected],
        np.asarray(imputed_values)[selected],
    )

# MSE over every entry. This average also covers entries that were never
# removed, so on its own it understates the imputation error.
softimpute_mse = calculate_mse(original_df, df_imputed)

# MSE over only the entries that were actually removed (mask == 0).
softimpute_missing_mse = calculate_mse(original_df, df_imputed, missing_mask=(mask == 0))

print(f"SoftImpute MSE: {softimpute_mse}")
print(f"SoftImpute MSE (imputed entries only): {softimpute_missing_mse}")

SoftImpute MSE: 0.025185071086018567
