In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

# --- Inputs this cell works on ---
# X_processed       : 2-D NumPy array of numeric, non-negative feature values
# y                 : target labels, one entry per row of X_processed
# all_feature_names : column names, in the same order as X_processed's columns

# Small stand-in dataset so the cell can be run on its own.
all_feature_names = ['age', 'fare', 'sex_female', 'sex_male', 'embarked_C', 'embarked_S']
y = np.array([1, 0, 1, 0, 1])
X_processed = np.array([
    [0.5, 1.0, 1, 0, 1, 0],
    [0.8, 0.2, 0, 1, 0, 1],
    [0.1, 0.5, 1, 0, 1, 0],
    [1.0, 0.0, 0, 1, 0, 1],
    [0.4, 0.7, 0, 1, 1, 0],
])

# --- Chi-squared feature selection ---
# How many of the top-scoring features to keep.
k_best_features = 4

# chi2 scores every feature column against the target; SelectKBest keeps the
# k columns with the highest scores.
selector = SelectKBest(score_func=chi2, k=k_best_features)
selector.fit(X_processed, y)
X_new = selector.transform(X_processed)

# Translate the selector's boolean keep-mask into column positions, then into
# the matching feature names (original column order is preserved).
selected_indices = np.flatnonzero(selector.get_support())
selected_features = [all_feature_names[i] for i in selected_indices]


# --- Report ---
print(
    "--- Feature Selection Results ---",
    f"Scores for each feature (higher is better): {np.round(selector.scores_, 2)}",
    f"\nOriginal number of features: {X_processed.shape[1]}",
    f"Reduced number of features: {X_new.shape[1]}",
    f"Selected features: {selected_features}",
    sep="\n",
)

# Reduced feature matrix, labelled with the surviving column names, ready to
# hand to a model.
final_df = pd.DataFrame(data=X_new, columns=selected_features)
print("\n--- Final Data with Selected Features ---")
print(final_df)

--- Feature Selection Results ---
Scores for each feature (higher is better): [0.69 1.   1.33 0.89 2.   3.  ]

Original number of features: 6
Reduced number of features: 4
Selected features: ['fare', 'sex_female', 'embarked_C', 'embarked_S']

--- Final Data with Selected Features ---
   fare  sex_female  embarked_C  embarked_S
0   1.0         1.0         1.0         0.0
1   0.2         0.0         0.0         1.0
2   0.5         1.0         1.0         0.0
3   0.0         0.0         0.0         1.0
4   0.7         0.0         1.0         0.0



**(1) Prepare non-negative feature data and a target variable.**
**(2) Choose the number of features, k, to keep.**
**(3) Calculate the chi-squared score of each feature against the target.**
**(4) Select the k features with the highest scores.**
**(5) Create the final dataset from the selected features.**