### Scaler_Imputer

This code fragment was used to test various Scaler-Imputer combinations, along with dropping all rows containing a NaN value. Data is exported to Excel (Scaler_Imputer_Test.XLSX) and the best combination is printed.

**Note:** As a group we chose to use the Simple (Mean) Imputer and Standard scaler as we found these were efficient and worked on all regression models during initial testing. As is also shown, there are only marginal improvements when using more complex methods, and the possability of getting much worse values and thus errors.

In [9]:
import pandas as pd
import numpy as np
from math import sqrt
from scipy.stats import norm

# SKLearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, RobustScaler, PowerTransformer, MaxAbsScaler, Binarizer, Normalizer
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, learning_curve
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import StackingRegressor
from sklearn.impute import IterativeImputer

In [10]:

#import data from the files
dataset = pd.read_csv('Concrete_Data_Yeh_final.csv')

dataset = pd.DataFrame(dataset)
dataset = dataset.drop_duplicates() #drop duplicates
  
y = dataset["csMPa"]
X = dataset.drop("csMPa", axis=1)

23 duplicated rows dropped


In [16]:
results = []

# Define the scaler dictionary
scaler = {
    'standard_scaler': StandardScaler(),
    'min_max_scaler': MinMaxScaler(),
    'max_abs_scaler': MaxAbsScaler(),
    'robust_scaler': RobustScaler(),
    'quantile_transformer_normal': QuantileTransformer(n_quantiles=700, output_distribution='normal'),
    'quantile_transformer_uniform': QuantileTransformer(n_quantiles=700, output_distribution='uniform'),
    'power_transformer': PowerTransformer(),
    'normalizer': Normalizer(),
    'binarizer': Binarizer()}

# Define the imputer dictionary
imputer = {'Simple_mean': SimpleImputer(strategy='mean'),
              'Simple_median': SimpleImputer(strategy='median'),
                'Simple_most_frequent': SimpleImputer(strategy='most_frequent'),
                'Simple_constant': SimpleImputer(strategy='constant', fill_value=0),
                'Iterative': IterativeImputer(),
                'KNN': KNNImputer(),
                'MICE': IterativeImputer(initial_strategy='median', imputation_order='random', random_state=0),
                'Drop': 'drop'}  # Use 'drop' as a placeholder for droping rows with missing values

# Test all combinations of scalers and imputers
for imp in imputer:
    for scale in scaler:
        if imp == 'Drop':
            X_transformed = X.dropna()  # Drop rows with missing values
            y = y[X_transformed.index]  # Drop corresponding y values
        else:
            pipeline = Pipeline([
                ('imputer', imputer[imp]),
                ('scaler', scaler[scale])
            ])

            X_transformed = pipeline.fit_transform(X)  # Use X produced above

        X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)
        regressor = RandomForestRegressor() # Using the Random Forest Regressor as established as the best performing model previously
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)

        #Calculate error metrics and appending to results list
        rmse = sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        results.append([imp, scale, rmse, r2, mae])

#Converting to a dataframe for analysis/ranking and exporting to excel
df_results = pd.DataFrame(results, columns=['Imputer', 'Scaler', 'RMSE', 'R2', 'MAE'])
df_results['RMSE_rank'] = df_results['RMSE'].rank()
df_results['R2_rank'] = df_results['R2'].rank(ascending=False)
df_results['MAE_rank'] = df_results['MAE'].rank()
df_results['rank_sum'] = df_results['RMSE_rank'] + df_results['R2_rank'] + df_results['MAE_rank']

df_results.sort_values(by=['rank_sum'], inplace=True)
df_results.to_excel('Scaler_Imputer_Test.xlsx')

#Best performing pair of scaler and imputer based on the sum of the ranks
best_scaler = df_results['Scaler'].iloc[0]
best_imputer = df_results['Imputer'].iloc[0]

print('Best scaler: ', best_scaler)
print('Best imputer: ', best_imputer)



Best scaler:  power_transformer
Best imputer:  KNN
