In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer, PowerTransformer, Normalizer, Binarizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

    
results = []

scaler = {
    'standard_scaler': StandardScaler(),
    'min_max_scaler': MinMaxScaler(),
    'max_abs_scaler': MaxAbsScaler(),
    'robust_scaler': RobustScaler(),
    'quantile_transformer_normal': QuantileTransformer(n_quantiles=700, output_distribution='normal'),
    'quantile_transformer_uniform': QuantileTransformer(n_quantiles=700, output_distribution='uniform'),
    'power_transformer': PowerTransformer(),
    'normalizer': Normalizer(),
    'binarizer': Binarizer()
}

imputer = {'Simple_mean': SimpleImputer(strategy='mean'),
              'Simple_median': SimpleImputer(strategy='median'),
                'Simple_most_frequent': SimpleImputer(strategy='most_frequent'),
                'Simple_constant': SimpleImputer(strategy='constant', fill_value=0),
                'Iterative': IterativeImputer(),
                'KNN': KNNImputer(),
                'MICE': IterativeImputer(initial_strategy='median', imputation_order='random', random_state=0)}  
# Define the imputer dictionary

for imp in imputer:
    for scale in scaler:
        pipeline = Pipeline([
            ('imputer', imputer[imp]),
            ('scaler', scaler[scale])
        ])

        pipeline.fit(finalX_data, finalY_data)
        finalX_data = pipeline.transform(finalX_data)
        X_train, X_test, y_train, y_test = train_test_split(finalX_data, finalY_data, test_size=0.2, random_state=42)
        regressor = RandomForestRegressor()
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        rmse = sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        results.append([imp, scale, rmse, r2, mae])

df_results = pd.DataFrame(results, columns=['Imputer', 'Scaler', 'RMSE', 'R2', 'MAE'])
df_results['RMSE_rank'] = df_results['RMSE'].rank()
df_results['R2_rank'] = df_results['R2'].rank(ascending=False)
df_results['MAE_rank'] = df_results['MAE'].rank()
df_results['rank_sum'] = df_results['RMSE_rank'] + df_results['R2_rank'] + df_results['MAE_rank']

df_results.sort_values(by=['rank_sum'], inplace=True)
df_results.to_excel('Scaler_Imputer_Test.xlsx')
best_scaler = df_results['Scaler'].iloc[0]
best_imputer = df_results['Imputer'].iloc[0]

X_train, X_test, y_train, y_test = train_test_split(finalX_data, finalY_data, test_size=0.2, random_state=42)

print(best_scaler)
print(best_imputer)

final_pipeline = Pipeline([
    ('imputer', imputer[best_imputer]),
    ('scaler', scaler[best_scaler])
])

X_traindf = pd.DataFrame(X_train)
X_testdf = pd.DataFrame(X_test)

X_train_final = pd.DataFrame(final_pipeline.fit_transform(X_traindf), columns=X_traindf.columns)
X_test_final = pd.DataFrame(final_pipeline.transform(X_testdf), columns=X_testdf.columns)
y_train_final = y_train
y_test_final = y_test
    

NameError: name 'SimpleImputer' is not defined