Collect the Data

In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
auto_mpg = fetch_ucirepo(id=9) 
  
# data (as pandas dataframes) 
X = auto_mpg.data.features 
y = auto_mpg.data.targets 

Impute the missing values

In [2]:
from sklearn.impute import SimpleImputer

# Create the imputer using the mean strategy
imputer = SimpleImputer(strategy='mean')

# Fit and transform only the 'horsepower' column
X['horsepower'] = imputer.fit_transform(X[['horsepower']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['horsepower'] = imputer.fit_transform(X[['horsepower']])


Identifying the optimal feature count through ML models

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from feature_engine.selection import MRMR as sel
from sklearn.metrics import r2_score
from custom_mrmr import MRMR as sel_special

# Define models
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'KNN': KNeighborsRegressor()
}

# Parameters
n_splits = 5
random_state = 42
max_k = X.shape[1] 

# Result storage
results = []

# K-Fold split
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    print(f"\n🔄 Fold {fold}")
    
    # Split and impute
    X_train = X.iloc[train_idx].copy()
    X_test = X.iloc[test_idx].copy()
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]
    
    # Try different numbers of selected features
    for k in range(1, max_k + 1):
        if k==1:
            # MRMR selector
            selector = sel_special(method="MID", regression=True, random_state=random_state, max_features=X.shape[1])
            selector.fit(X_train, y_train)
            # Get selected features in order
            selected_features = selector.selected_features_
            # Build k → list of top-k selected features
            k_dict = {k: selected_features[:k] for k in range(1, len(selected_features) + 1)}

            X_train_selected = X_train[k_dict[1]]
            print(X_train_selected)
            X_test_selected = X_test[k_dict[1]]

            # Fit and evaluate each model
            for model_name, model in models.items():
                model.fit(X_train_selected, y_train)
                preds = model.predict(X_test_selected)
                mse = mean_squared_error(y_test, preds)
                rmse = np.sqrt(mse)
                # Calculate R-squared
                r2 = r2_score(y_test, preds)

                results.append({
                    'Fold': fold,
                    'Model': model_name,
                    'Num_Features': k,
                    'RMSE': rmse,
                    'R2': r2

                })
        else:
                
            # MRMR selector
            selector = sel(
                method="MID",
                regression=True,
                max_features=k,
                random_state=random_state
            )
            
            selector.fit(X_train, y_train)
            X_train_selected = selector.transform(X_train)
            X_test_selected = selector.transform(X_test)

            # Fit and evaluate each model
            for model_name, model in models.items():
                model.fit(X_train_selected, y_train)
                preds = model.predict(X_test_selected)
                mse = mean_squared_error(y_test, preds)
                rmse = np.sqrt(mse)
                # Calculate R-squared
                r2 = r2_score(y_test, preds)

                results.append({
                    'Fold': fold,
                    'Model': model_name,
                    'Num_Features': k,
                    'RMSE': rmse,
                    'R2': r2

                })

# Convert results to DataFrame
results_df = pd.DataFrame(results)


🔄 Fold 1
     displacement
1           350.0
2           318.0
3           304.0
4           302.0
6           454.0
..            ...
390         144.0
391         135.0
392         151.0
395         135.0
397         119.0

[318 rows x 1 columns]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



🔄 Fold 2
     weight
0      3504
1      3693
2      3436
4      3449
5      4341
..      ...
393    2790
394    2130
395    2295
396    2625
397    2720

[318 rows x 1 columns]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



🔄 Fold 3
     displacement
0           307.0
1           350.0
3           304.0
4           302.0
5           429.0
..            ...
393         140.0
394          97.0
395         135.0
396         120.0
397         119.0

[318 rows x 1 columns]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



🔄 Fold 4
     weight
0      3504
1      3693
2      3436
3      3433
5      4341
..      ...
393    2790
394    2130
395    2295
396    2625
397    2720

[319 rows x 1 columns]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



🔄 Fold 5
     weight
0      3504
2      3436
3      3433
4      3449
5      4341
..      ...
390    2665
392    2950
393    2790
394    2130
396    2625

[319 rows x 1 columns]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Plotting the curve

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set global font sizes
plt.rcParams.update({
    'font.size': 14,          # Base font size
    'axes.titlesize': 16,     # Title font
    'axes.labelsize': 14,     # Axis label font
    'xtick.labelsize': 12,    # X-axis tick labels
    'ytick.labelsize': 12,    # Y-axis tick labels
    'legend.fontsize': 12,    # Legend font
    'legend.title_fontsize': 13
})

# Set Seaborn style
sns.set(style="whitegrid")

# Get unique model names and folds
model_names = results_df['Model'].unique()
folds = sorted(results_df['Fold'].unique())

# Create one plot per model
for model_name in model_names:
    plt.figure(figsize=(10, 6))

    for fold in folds:
        fold_data = results_df[
            (results_df['Model'] == model_name) &
            (results_df['Fold'] == fold)
        ].sort_values(by='Num_Features')

        plt.plot(
            fold_data['Num_Features'],
            fold_data['RMSE'],
            label=f'Fold {fold}',
            marker='o'
        )

    plt.xlabel('Number of Features')
    plt.ylabel('Test RMSE')
    plt.legend(title='Fold', loc='best')
    plt.tight_layout()
    plt.grid(True)

    # Save plot as PDF
    safe_name = model_name.replace(" ", "_").replace("/", "_")
    plt.savefig(f'{safe_name}_rmse_plot.pdf', format='pdf')

    plt.close()
