In [189]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

In [190]:
# Load and preprocess data
df = pd.read_csv("sample_dataset.csv")

"""As there are NAN existing, 
SelectKBest and OrdinalEncoder cannot handle NaN values

SimpleImputer was included as a precautionary data cleaning step"""


imputer = SimpleImputer(strategy='most_frequent')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)



In [191]:
# Prepare features and target
X = df_imputed.iloc[:, 0:10]
y = df_imputed['radius error']



In [192]:
# Encode categorical features
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_encoded = encoder.fit_transform(X)



In [193]:
# Feature selection
selector = SelectKBest(
    lambda X, y: mutual_info_regression(X, y, discrete_features='auto'),
    k=5
)
X_selected = selector.fit_transform(X_encoded, y)
selected_features = X.columns[selector.get_support()]



In [194]:
X_selected

array([[137., 411., 321., 401., 357.],
       [370., 440., 366., 238., 267.],
       [355., 433., 344.,   0.,   0.],
       ...,
       [296., 370.,  37.,   0., 221.],
       [373., 453., 357., 410., 360.],
       [  3.,   1.,   3.,   0.,   0.]], shape=(569, 5))

In [195]:
selected_features

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points'],
      dtype='object')

In [196]:
# Output results
print("Selected features:", selected_features.tolist())

"""The output shows the 5 most important features for predicting the 'radius error' target variable, 
selected using mutual information regression."""

Selected features: ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points']


"The output shows the 5 most important features for predicting the 'radius error' target variable, \nselected using mutual information regression."