# IMPUTATION Notebook from imputation.py

### Install project's libraries

In [2]:
# %pwd reports the current working directory, which is where the relative
# requirements.txt path below is resolved.
%pwd
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Import system and project libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

In [4]:
# For Expectation-Maximization
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# For K-Nearest Neighbors
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
# For Linear Regression
from sklearn.linear_model import LinearRegression
# For Random Forest
from sklearn.ensemble import RandomForestRegressor
# For MICE
from sklearn.linear_model import BayesianRidge
# For Neural Network (PyTorch)
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
# For SVD
from fancyimpute import SoftImpute
# For Matrix Factorization
from fancyimpute import MatrixFactorization

## 0. Connect to the database and extract the data

### Database connection

In [None]:
# Establish the connection.
# SQLAlchemy uses a standard URL for connections:
# 'mysql+pymysql://<user>:<password>@<host>/<dbname>'
# The URL is read from the DATABASE_CON environment variable so that no
# credentials are written in the notebook itself.
#
# These names are used below but were never imported anywhere in the notebook,
# so the cell failed with a NameError on a fresh kernel.
import os

from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError

DATABASE_CON = os.getenv('DATABASE_CON')

try:
    # Create a SQLAlchemy engine
    engine = create_engine(DATABASE_CON)
    # Open a connection once to verify that the database is reachable;
    # the `with` block closes it again immediately.
    with engine.connect() as connection:
        print("Database connection established successfully.")
except SQLAlchemyError as e:
    # Report the failure instead of stopping the notebook.
    # NOTE(review): if create_engine itself fails, `engine` stays undefined and
    # the query cells below will raise NameError.
    print(f"An error occurred when connecting to the database: {e}")

### Query to extract the soil_ICP dataframe

In [None]:
# Build the soil_icp dataframe: one row per soil_results record
# (GROUP BY sr.id) with one column per nutrient symbol.
# Each MAX(CASE ...) keeps the value of the matching nutrient; rows for other
# nutrients contribute 0 to the MAX.
# Only results with analysis_method_id = 2 are selected.
# Requires `engine` from the database connection cell.
soil_icp_df = pd.read_sql_query("""
                        -- This query extracts the information necessary to shape the soil_icp dataframe
                        
                        SELECT sr.id, s.name, sr.sample, sr.rep,
                            MAX(CASE WHEN n.symbol = 'B' THEN rn.value ELSE 0 END) AS B,
                            MAX(CASE WHEN n.symbol = 'Mg' THEN rn.value ELSE 0 END) AS Mg,
                            MAX(CASE WHEN n.symbol = 'P' THEN rn.value ELSE 0 END) AS P,
                            MAX(CASE WHEN n.symbol = 'S' THEN rn.value ELSE 0 END) AS S,
                            MAX(CASE WHEN n.symbol = 'K' THEN rn.value ELSE 0 END) AS K,
                            MAX(CASE WHEN n.symbol = 'Ca' THEN rn.value ELSE 0 END) AS Ca,
                            MAX(CASE WHEN n.symbol = 'Mn' THEN rn.value ELSE 0 END) AS Mn,
                            MAX(CASE WHEN n.symbol = 'Fe' THEN rn.value ELSE 0 END) AS Fe,
                            MAX(CASE WHEN n.symbol = 'Cu' THEN rn.value ELSE 0 END) AS Cu,
                            MAX(CASE WHEN n.symbol = 'Zn' THEN rn.value ELSE 0 END) AS Zn
                        FROM soil_results AS sr
                        JOIN soils AS s ON sr.soil_id = s.id
                        JOIN result_nutrients AS rn ON sr.id = rn.soil_result_id
                        JOIN nutrients AS n ON rn.nutrient_id = n.id
                        WHERE sr.analysis_method_id  = 2
                        GROUP BY sr.id
                        ORDER BY sr.id;""", engine)

# The soil name column comes back as 'name'; expose it as 'soil'.
soil_icp_df = soil_icp_df.rename(columns={'name': 'soil'})

### Query to extract the soil_HHXRF dataframe

In [None]:
# Build the soil_hhxrf dataframe: one row per soil_results record
# (GROUP BY sr.id) with one column per nutrient symbol.
# Each MAX(CASE ...) keeps the value of the matching nutrient; rows for other
# nutrients contribute 0 to the MAX.
# Only results with analysis_method_id = 3 are selected.
# Requires `engine` from the database connection cell.
soil_hhxrf_df = pd.read_sql_query("""
                        -- This query extracts the information necessary to shape the soil_hhxrf dataframe
                        
                        SELECT sr.id, s.name, sr.sample, sr.rep,
                            MAX(CASE WHEN n.symbol = 'B' THEN rn.value ELSE 0 END) AS B,
                            MAX(CASE WHEN n.symbol = 'Mg' THEN rn.value ELSE 0 END) AS Mg,
                            MAX(CASE WHEN n.symbol = 'P' THEN rn.value ELSE 0 END) AS P,
                            MAX(CASE WHEN n.symbol = 'S' THEN rn.value ELSE 0 END) AS S,
                            MAX(CASE WHEN n.symbol = 'K' THEN rn.value ELSE 0 END) AS K,
                            MAX(CASE WHEN n.symbol = 'Ca' THEN rn.value ELSE 0 END) AS Ca,
                            MAX(CASE WHEN n.symbol = 'Mn' THEN rn.value ELSE 0 END) AS Mn,
                            MAX(CASE WHEN n.symbol = 'Fe' THEN rn.value ELSE 0 END) AS Fe,
                            MAX(CASE WHEN n.symbol = 'Cu' THEN rn.value ELSE 0 END) AS Cu,
                            MAX(CASE WHEN n.symbol = 'Zn' THEN rn.value ELSE 0 END) AS Zn
                        FROM soil_results AS sr
                        JOIN soils AS s ON sr.soil_id = s.id
                        JOIN result_nutrients AS rn ON sr.id = rn.soil_result_id
                        JOIN nutrients AS n ON rn.nutrient_id = n.id
                        WHERE sr.analysis_method_id  = 3
                        GROUP BY sr.id
                        ORDER BY sr.id;""", engine)

# Expose the soil name column as 'soil', exactly as is done for soil_icp_df,
# so both frames share the same schema (a 'soil' column is dropped later on).
# NOTE(review): confirm nothing outside this notebook reads soil_hhxrf_df['name'].
soil_hhxrf_df = soil_hhxrf_df.rename(columns={'name': 'soil'})




## 1. Import and transform dataframe

In [6]:
# Load the soil mix table from the CSV file in the current working directory.
soil_mix_df_final = pd.read_csv(filepath_or_buffer='soil_mix_df_final.csv')





      B      Mg        P      S       K       Ca      Mn       Fe    Cu     Zn
0   NaN     NaN   807.96    NaN  574.62   859.11  148.69  5992.90  28.0  15.43
1   NaN  491.76   889.15    NaN  707.79   844.07  266.10  7576.03  23.0  19.94
2   NaN  322.18  1039.74    NaN  760.43   751.79  275.44  7326.52  30.0  22.47
3   NaN  514.06   958.12    NaN  708.97   770.85  264.32  7580.27  27.0  20.07
4   NaN  677.30   818.44    NaN  606.48  1019.33  190.94  6252.12  26.0  19.44
..   ..     ...      ...    ...     ...      ...     ...      ...   ...    ...
784 NaN  488.77  1529.99    NaN  486.19   612.38  286.13  5350.72  27.0  17.46
785 NaN  447.40  1571.24    NaN  434.23   611.24  288.83  5353.28  25.0  17.19
786 NaN  507.54  1643.29    NaN  535.02   642.96  314.32  5352.57  21.0  18.78
787 NaN  374.68  1698.38  223.0  453.04   644.39  315.06  5375.89  27.0  18.74
788 NaN  304.73  1721.79  200.0  441.16   635.62  310.12  5305.12  23.0  18.26

[789 rows x 10 columns]


### Delete the first 3 columns that are not necessary

In [7]:
# 'soil', 'sample' and 'rep' are removed so that only the remaining columns
# are passed on to the imputers; the source frame itself is left untouched.
soil_mix_cleaned = soil_mix_df_final.drop(columns=['soil', 'sample', 'rep'])

### Replace zeros with NaN for imputation

In [8]:
# Turn every 0 into NaN so the imputers below treat those cells as missing.
# Assigning the result (instead of `inplace=True`) avoids mutating the frame
# behind the reader's back and keeps the call chainable.
soil_mix_cleaned = soil_mix_cleaned.replace(to_replace=0, value=np.nan)
print(soil_mix_cleaned)

      B      Mg        P      S       K       Ca      Mn       Fe    Cu     Zn
0   NaN     NaN   807.96    NaN  574.62   859.11  148.69  5992.90  28.0  15.43
1   NaN  491.76   889.15    NaN  707.79   844.07  266.10  7576.03  23.0  19.94
2   NaN  322.18  1039.74    NaN  760.43   751.79  275.44  7326.52  30.0  22.47
3   NaN  514.06   958.12    NaN  708.97   770.85  264.32  7580.27  27.0  20.07
4   NaN  677.30   818.44    NaN  606.48  1019.33  190.94  6252.12  26.0  19.44
..   ..     ...      ...    ...     ...      ...     ...      ...   ...    ...
784 NaN  488.77  1529.99    NaN  486.19   612.38  286.13  5350.72  27.0  17.46
785 NaN  447.40  1571.24    NaN  434.23   611.24  288.83  5353.28  25.0  17.19
786 NaN  507.54  1643.29    NaN  535.02   642.96  314.32  5352.57  21.0  18.78
787 NaN  374.68  1698.38  223.0  453.04   644.39  315.06  5375.89  27.0  18.74
788 NaN  304.73  1721.79  200.0  441.16   635.62  310.12  5305.12  23.0  18.26

[789 rows x 10 columns]


## 2. Imputation using Expectation-Maximization (EM) 

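### EM-style imputation using sklearn's IterativeImputer (missing values are now encoded as NaN)

Note: `IterativeImputer` is a round-robin regression imputer rather than a textbook EM algorithm. With its default estimator (`BayesianRidge`) it fits the same model as the MICE cell in section 6, which is why both sections print identical values.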

In [9]:
# Iterative imputation with the imputer's default estimator, capped at 10 rounds.
imputer = IterativeImputer(random_state=1, max_iter=10)

# fit_transform returns a plain array, so the column labels are restored here.
soil_imputed_EM = pd.DataFrame(
    imputer.fit_transform(soil_mix_cleaned),
    columns=soil_mix_cleaned.columns,
)
print(soil_imputed_EM)

             B          Mg        P           S       K       Ca      Mn  \
0    35.430337  412.709634   807.96  286.425832  574.62   859.11  148.69   
1    41.096779  491.760000   889.15  255.279353  707.79   844.07  266.10   
2    45.009479  322.180000  1039.74  322.468911  760.43   751.79  275.44   
3    41.957018  514.060000   958.12  273.057413  708.97   770.85  264.32   
4    33.456667  677.300000   818.44  243.097983  606.48  1019.33  190.94   
..         ...         ...      ...         ...     ...      ...     ...   
784  27.148686  488.770000  1529.99  242.272562  486.19   612.38  286.13   
785  25.897144  447.400000  1571.24  234.818174  434.23   611.24  288.83   
786  25.591342  507.540000  1643.29  204.237060  535.02   642.96  314.32   
787  25.605732  374.680000  1698.38  223.000000  453.04   644.39  315.06   
788  24.119592  304.730000  1721.79  200.000000  441.16   635.62  310.12   

          Fe    Cu     Zn  
0    5992.90  28.0  15.43  
1    7576.03  23.0  19.94  
2  

## 3. Imputation using K-Nearest Neighbors (KNN) 

#### It is advisable to standardize data for KNN

In [15]:
# Learn the scaling parameters, then apply them to the same data.
# `scaler` is kept because it is needed later to undo the scaling.
scaler = StandardScaler()
scaler.fit(soil_mix_cleaned)
soil_mix_scaled = scaler.transform(soil_mix_cleaned)

#### Imputation

In [37]:
# Fill each missing value from its 5 nearest rows, all neighbours weighted equally.
# The input is the standardized array, so the result is still in scaled units.
imputer_knn = KNNImputer(
    weights="uniform",
    n_neighbors=5,
)
soil_imputed_KNN = imputer_knn.fit_transform(X=soil_mix_scaled)

#### Convert back to DataFrame and undo standardization

In [38]:
# Map the imputed values back to the original scale with the fitted scaler,
# then re-attach the column labels of the cleaned frame.
soil_imputed_KNN = pd.DataFrame(
    data=scaler.inverse_transform(soil_imputed_KNN),
    columns=soil_mix_cleaned.columns,
)
print(soil_imputed_KNN)

          B       Mg        P        S       K       Ca      Mn       Fe  \
0    38.024  339.488   807.96  246.200  574.62   859.11  148.69  5992.90   
1    78.278  491.760   889.15  142.168  707.79   844.07  266.10  7576.03   
2    78.278  322.180  1039.74  273.600  760.43   751.79  275.44  7326.52   
3    78.278  514.060   958.12  133.168  708.97   770.85  264.32  7580.27   
4    34.630  677.300   818.44  224.600  606.48  1019.33  190.94  6252.12   
..      ...      ...      ...      ...     ...      ...     ...      ...   
784  35.588  488.770  1529.99  339.600  486.19   612.38  286.13  5350.72   
785  56.496  447.400  1571.24  282.200  434.23   611.24  288.83  5353.28   
786  33.648  507.540  1643.29  268.600  535.02   642.96  314.32  5352.57   
787  62.200  374.680  1698.38  223.000  453.04   644.39  315.06  5375.89   
788  62.200  304.730  1721.79  200.000  441.16   635.62  310.12  5305.12   

       Cu     Zn  
0    28.0  15.43  
1    23.0  19.94  
2    30.0  22.47  
3    27.0  

In [39]:
# Intentionally left without code: this cell was an exact copy of the
# linear-regression imputation in "## 4. Imputation using Linear Regression"
# directly below, so running it only refit the same imputer a second time.



## 4. Imputation using Linear Regression

In [40]:
# Iterative imputation where every column is predicted from the others with
# ordinary least squares, capped at 10 rounds.
imputer_linear = IterativeImputer(
    estimator=LinearRegression(),
    random_state=2,
    max_iter=10,
)
soil_imputed_linear = imputer_linear.fit_transform(X=soil_mix_cleaned)



### Converting back to DataFrame

In [41]:
# Wrap the imputed values in a DataFrame carrying the original column labels.
soil_imputed_linear = pd.DataFrame(
    data=soil_imputed_linear,
    columns=soil_mix_cleaned.columns,
)
print(soil_imputed_linear)

             B          Mg        P           S       K       Ca      Mn  \
0    45.881079  390.991448   807.96  299.038201  574.62   859.11  148.69   
1    46.539168  491.760000   889.15  250.986637  707.79   844.07  266.10   
2    53.171977  322.180000  1039.74  325.486342  760.43   751.79  275.44   
3    49.591419  514.060000   958.12  275.252284  708.97   770.85  264.32   
4    42.720432  677.300000   818.44  253.221611  606.48  1019.33  190.94   
..         ...         ...      ...         ...     ...      ...     ...   
784  41.508603  488.770000  1529.99  256.401751  486.19   612.38  286.13   
785  39.400226  447.400000  1571.24  245.622078  434.23   611.24  288.83   
786  36.882611  507.540000  1643.29  209.508728  535.02   642.96  314.32   
787  38.527951  374.680000  1698.38  223.000000  453.04   644.39  315.06   
788  35.249978  304.730000  1721.79  200.000000  441.16   635.62  310.12   

          Fe    Cu     Zn  
0    5992.90  28.0  15.43  
1    7576.03  23.0  19.94  
2  

## 5. Imputation using Random Forest Regression

In [42]:
# Iterative imputation with a 100-tree random forest as the per-column model.
# The forest and the imputer are each given their own fixed seed.
imputer_rf = IterativeImputer(
    estimator=RandomForestRegressor(random_state=3, n_estimators=100),
    random_state=0,
    max_iter=10,
)
soil_imputed_rf = imputer_rf.fit_transform(X=soil_mix_cleaned)



### Converting back to DataFrame

In [45]:
# Wrap the imputed values in a DataFrame carrying the original column labels.
soil_imputed_rf = pd.DataFrame(
    data=soil_imputed_rf,
    columns=soil_mix_cleaned.columns,
)
print(soil_imputed_rf)

           B       Mg        P         S       K       Ca      Mn       Fe  \
0    78.1821  404.083   807.96  272.5183  574.62   859.11  148.69  5992.90   
1    91.5556  491.760   889.15  312.2238  707.79   844.07  266.10  7576.03   
2    94.9546  322.180  1039.74  389.1938  760.43   751.79  275.44  7326.52   
3    92.2491  514.060   958.12  314.8038  708.97   770.85  264.32  7580.27   
4    71.7421  677.300   818.44  250.1857  606.48  1019.33  190.94  6252.12   
..       ...      ...      ...       ...     ...      ...     ...      ...   
784  75.9537  488.770  1529.99  315.3888  486.19   612.38  286.13  5350.72   
785  80.1453  447.400  1571.24  301.0200  434.23   611.24  288.83  5353.28   
786  24.7485  507.540  1643.29  297.4388  535.02   642.96  314.32  5352.57   
787  76.8873  374.680  1698.38  223.0000  453.04   644.39  315.06  5375.89   
788  77.5379  304.730  1721.79  200.0000  441.16   635.62  310.12  5305.12   

       Cu     Zn  
0    28.0  15.43  
1    23.0  19.94  
2    3

## 6. Imputation using Multiple Imputation by Chained Equations (MICE)

### MICE imputation with a BayesianRidge model as an estimator

In [49]:
# Chained-equations imputation with BayesianRidge as the per-column model,
# capped at 10 rounds.
imputer_mice = IterativeImputer(
    estimator=BayesianRidge(),
    random_state=4,
    max_iter=10,
)

# fit_transform returns a plain array, so the column labels are restored here.
soil_imputed_mice = pd.DataFrame(
    imputer_mice.fit_transform(soil_mix_cleaned),
    columns=soil_mix_cleaned.columns,
)
print(soil_imputed_mice)

             B          Mg        P           S       K       Ca      Mn  \
0    35.430337  412.709634   807.96  286.425832  574.62   859.11  148.69   
1    41.096779  491.760000   889.15  255.279353  707.79   844.07  266.10   
2    45.009479  322.180000  1039.74  322.468911  760.43   751.79  275.44   
3    41.957018  514.060000   958.12  273.057413  708.97   770.85  264.32   
4    33.456667  677.300000   818.44  243.097983  606.48  1019.33  190.94   
..         ...         ...      ...         ...     ...      ...     ...   
784  27.148686  488.770000  1529.99  242.272562  486.19   612.38  286.13   
785  25.897144  447.400000  1571.24  234.818174  434.23   611.24  288.83   
786  25.591342  507.540000  1643.29  204.237060  535.02   642.96  314.32   
787  25.605732  374.680000  1698.38  223.000000  453.04   644.39  315.06   
788  24.119592  304.730000  1721.79  200.000000  441.16   635.62  310.12   

          Fe    Cu     Zn  
0    5992.90  28.0  15.43  
1    7576.03  23.0  19.94  
2  