# Feature Selection Analysis for Heart Disease Dataset

This notebook demonstrates comprehensive feature selection techniques for the heart disease dataset, including:
- Random Forest feature importance
- Recursive Feature Elimination (RFE)
- Chi-Square statistical tests
- Correlation-based selection
- Univariate statistical tests
- Mutual information
- Ensemble feature selection

## Objectives
1. Apply multiple feature selection methods
2. Compare their performance and results
3. Select the optimal feature subset
4. Visualize feature importance and rankings

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation/convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Add src to path
# Must run before the two project imports below. The entry is relative, so it is
# resolved against the kernel's working directory — presumably the notebook's folder.
sys.path.append('../src')

from feature_selector import FeatureSelector
from data_processor import DataProcessor  # not referenced by any cell visible in this notebook

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")

Libraries imported successfully!


## 1. Data Loading and Preparation

In [2]:
# Read the cleaned heart-disease table from the processed-data folder
data = pd.read_csv('../data/processed/heart_disease_cleaned.csv')

# Report the dimensions and column names, then preview the first rows as the cell output
print(
    f"Dataset shape: {data.shape}",
    f"\nColumns: {list(data.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
data.head()

Dataset shape: (303, 14)

Columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

First few rows:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1,1,145.0,233.0,1,2,150.0,0,2.3,3,0,6,0
1,67.0,1,4,160.0,286.0,0,2,108.0,1,1.5,2,3,3,1
2,67.0,1,4,120.0,229.0,0,2,129.0,1,2.6,2,2,7,1
3,37.0,1,3,130.0,250.0,0,0,187.0,0,3.5,3,0,3,0
4,41.0,0,2,130.0,204.0,0,2,172.0,0,1.4,1,0,3,0


In [3]:
# Separate the label column from the predictors
y = data['target']
X = data.drop(columns='target')

# Report the dimensions of both pieces and the class balance
print(f"Features shape: {X.shape}", f"Target shape: {y.shape}", sep="\n")
print("\nTarget distribution:")
print(y.value_counts())

# Summary statistics for every feature, rendered as the cell output
print("\nFeature statistics:")
X.describe()

Features shape: (303, 13)
Target shape: (303,)

Target distribution:
target
0    164
1    139
Name: count, dtype: int64

Feature statistics:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.346535,245.584158,0.148515,0.990099,149.65264,0.326733,1.024422,1.60066,0.663366,4.722772
std,9.038662,0.467299,0.960126,16.648749,47.558803,0.356198,0.994971,22.731735,0.469794,1.110127,0.616226,0.934375,1.938383
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,84.75,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,170.0,371.0,1.0,2.0,202.0,1.0,4.0,3.0,3.0,7.0


## 2. Initialize Feature Selector

In [None]:
# Create the selector instance that all later cells call; log_level="INFO" is passed through as-is
selector = FeatureSelector(
    log_level="INFO",
)
print("Feature selector initialized successfully!")

## 3. Individual Feature Selection Methods

### 3.1 Random Forest Feature Importance

In [None]:
# Run the selector's random-forest importance routine (n_estimators=100 is forwarded to it)
rf_result = selector.random_forest_importance(X, y, n_estimators=100)

# Show the first ten rows of the ranking table
print("Random Forest Feature Importance Results:", "Top 10 features:", sep="\n")
print(rf_result['feature_rankings'].head(10))

# Chart the ranking; top_n=13 equals the number of columns in X
selector.plot_feature_importance(
    'random_forest',
    top_n=13,
    figsize=(12, 8),
)

### 3.2 Recursive Feature Elimination (RFE)

In [None]:
# Run the selector's recursive-feature-elimination routine (cv=5 is forwarded to it)
rfe_result = selector.recursive_feature_elimination(X, y, cv=5)

# Report how many features were kept, which ones, and the full ranking table
print("RFE Results:")
print(
    f"Number of features selected: {rfe_result['n_features_selected']}",
    f"Selected features: {rfe_result['selected_features']}",
    "\nFeature rankings:",
    sep="\n",
)
print(rfe_result['feature_rankings'])

### 3.3 Chi-Square Statistical Test

In [None]:
# Run the selector's chi-square routine, requesting k=8 features
chi2_result = selector.chi_square_selection(X, y, k=8)

# Report the chosen features and the first ten rows of the score table
print("Chi-Square Selection Results:")
print(
    f"Selected features: {chi2_result['selected_features']}",
    "\nTop features by Chi-Square score:",
    sep="\n",
)
print(chi2_result['feature_rankings'].head(10))

# Chart the scores; top_n=13 equals the number of columns in X
selector.plot_feature_importance(
    'chi_square',
    top_n=13,
    figsize=(12, 8),
)

### 3.4 Univariate Statistical Selection

In [None]:
# Run the selector's univariate routine, requesting k=8 features
uni_result = selector.univariate_selection(X, y, k=8)

# Report the chosen features and the first ten rows of the score table
print("Univariate Selection Results:")
print(
    f"Selected features: {uni_result['selected_features']}",
    "\nTop features by F-score:",
    sep="\n",
)
print(uni_result['feature_rankings'].head(10))

# Chart the scores stored under the 'univariate_f_classif' key; top_n=13 equals the column count of X
selector.plot_feature_importance(
    'univariate_f_classif',
    top_n=13,
    figsize=(12, 8),
)

### 3.5 Mutual Information

In [None]:
# Run the selector's mutual-information routine, requesting k=8 features
mi_result = selector.mutual_information_selection(X, y, k=8)

# Report the chosen features and the first ten rows of the score table
print("Mutual Information Selection Results:")
print(
    f"Selected features: {mi_result['selected_features']}",
    "\nTop features by MI score:",
    sep="\n",
)
print(mi_result['feature_rankings'].head(10))

# Chart the scores; top_n=13 equals the number of columns in X
selector.plot_feature_importance(
    'mutual_information',
    top_n=13,
    figsize=(12, 8),
)

### 3.6 Correlation-Based Selection

In [None]:
# Run the selector's correlation-based routine on the features only (threshold=0.8 is forwarded)
corr_result = selector.correlation_based_selection(X, threshold=0.8)

# Report which features the selector marks for removal and which remain
print("Correlation-Based Selection Results:")
print(
    f"Features to remove (high correlation): {corr_result['features_to_remove']}",
    f"Remaining features: {corr_result['remaining_features']}",
    f"Number of features removed: {corr_result['n_removed']}",
    sep="\n",
)

# Annotated heatmap of the correlation matrix returned by the selector
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_result['correlation_matrix'],
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# List the flagged pairs, or say that none were reported
if not corr_result['high_corr_pairs']:
    print(f"\nNo feature pairs found with correlation > {corr_result['threshold']}")
else:
    print(f"\nHigh correlation pairs (threshold > {corr_result['threshold']}):")
    # Each entry is indexed as (first feature, second feature, correlation value)
    for pair in corr_result['high_corr_pairs']:
        print(f"{pair[0]} - {pair[1]}: {pair[2]:.3f}")

## 4. Comprehensive Method Comparison

In [None]:
# Run every selection method through the selector's comparison routine
comparison_result = selector.compare_selection_methods(X, y)

print("Feature Selection Methods Comparison:", "=" * 50, sep="\n")

# Cross-validation table sorted by mean score, highest first (only if the selector reported one)
if 'performance_comparison' in comparison_result:
    print("\nMethod Performance (Cross-Validation Accuracy):")
    perf_df = (
        pd.DataFrame(comparison_result['performance_comparison'])
        .T
        .sort_values('cv_mean', ascending=False)
    )
    print(perf_df[['cv_mean', 'cv_std', 'n_features']])

# Headline summary plus the first ten entries of the most-frequent-features list
if 'summary' in comparison_result:
    summary = comparison_result['summary']
    print("\nSummary:")
    print(f"Best performing method: {summary['best_method']}")
    print(f"Average features selected: {summary['avg_features_selected']:.1f}")

    print("\nMost frequently selected features:")
    leading_entries = summary['most_frequent_features'][:10]
    for feature, count in leading_entries:
        print(f"{feature}: selected by {count} methods")

In [None]:
# Draw the selector's comparison figure for the methods evaluated so far, with figsize=(16, 12)
selector.plot_method_comparison(
    figsize=(16, 12),
)

## 5. Ensemble Feature Selection

In [None]:
# Ask the selector for its final feature subset using the 'ensemble' strategy
ensemble_result = selector.select_best_features(X, y, strategy='ensemble')

print("Ensemble Feature Selection Results:")
print(
    f"Strategy: {ensemble_result['strategy']}",
    f"Number of features selected: {ensemble_result['n_features']}",
    f"Selected features: {ensemble_result['selected_features']}",
    sep="\n",
)

# Performance metrics are treated as optional, so they are reported only when present
if 'performance' in ensemble_result:
    perf = ensemble_result['performance']
    print("\nPerformance:")
    print(
        f"Cross-validation accuracy: {perf['cv_mean']:.4f} ± {perf['cv_std']:.4f}",
        f"Training accuracy: {perf['train_accuracy']:.4f}",
        f"Feature ratio: {perf['feature_ratio']:.2f}",
        sep="\n",
    )

## 6. Alternative Selection Strategies

In [None]:
# Run each selection strategy in turn and keep the results of those that succeed
strategies = ['ensemble', 'best_method', 'union']
strategy_results = {}

for strategy in strategies:
    # Best-effort: a failing strategy is reported and skipped so the remaining ones still run
    try:
        result = selector.select_best_features(X, y, strategy=strategy)
        strategy_results[strategy] = result
        print(f"\n{strategy.upper()} Strategy:")
        print(f"Features: {result['selected_features']}")
        print(f"Count: {result['n_features']}")
        if 'performance' in result:
            print(f"CV Accuracy: {result['performance']['cv_mean']:.4f} ± {result['performance']['cv_std']:.4f}")
    except Exception as e:
        print(f"Error with {strategy}: {e}")

# Tabulate one row per strategy that reported performance metrics
if strategy_results:
    strategy_comparison = []
    for strategy, result in strategy_results.items():
        if 'performance' not in result:
            continue
        metrics = result['performance']
        row = {
            'Strategy': strategy,
            'N_Features': result['n_features'],
            'CV_Accuracy': metrics['cv_mean'],
            'CV_Std': metrics['cv_std'],
            'Feature_Ratio': metrics['feature_ratio'],
        }
        strategy_comparison.append(row)

    strategy_df = pd.DataFrame(strategy_comparison)
    print("\nStrategy Comparison:")
    print(strategy_df)

## 7. Feature Selection Visualization

In [None]:
# Create a comprehensive feature selection summary plot
def _plot_ranking_panel(ax, labels, values, title, xlabel, color):
    """Draw one horizontal bar panel with the first entry at the top.

    Args:
        ax: Matplotlib axes to draw on.
        labels: Sequence of bar labels, one per bar.
        values: Sequence of bar lengths, same length as ``labels``.
        title: Panel title.
        xlabel: Label for the x-axis.
        color: Bar fill colour.
    """
    positions = range(len(labels))
    ax.barh(positions, values, color=color, alpha=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    # barh draws position 0 at the bottom; flip so the first entry is on top
    ax.invert_yaxis()


def _plot_placeholder(ax, title, message):
    """Show a titled, otherwise empty panel explaining why nothing was plotted."""
    ax.set_title(title)
    ax.text(0.5, 0.5, message, ha='center', va='center', transform=ax.transAxes)
    ax.set_axis_off()


fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Plots 1-3: first ten rows of three per-method ranking tables stored on the selector.
# Each tuple is (axes, ranking key, score column, title, x-label, colour).
ranking_panels = [
    (axes[0, 0], 'random_forest', 'importance',
     'Random Forest Feature Importance', 'Importance Score', 'skyblue'),
    (axes[0, 1], 'chi_square', 'chi2_score',
     'Chi-Square Feature Scores', 'Chi-Square Score', 'lightcoral'),
    (axes[0, 2], 'mutual_information', 'mi_score',
     'Mutual Information Scores', 'MI Score', 'lightgreen'),
]
for panel_ax, method_key, score_col, panel_title, panel_xlabel, panel_color in ranking_panels:
    top_rows = selector.feature_rankings[method_key].head(10)
    _plot_ranking_panel(panel_ax, top_rows['feature'], top_rows[score_col],
                        panel_title, panel_xlabel, panel_color)

# Plot 4: Feature Selection Frequency
# 'summary' is treated as optional by the comparison cell, so guard it here as well;
# an empty mapping would also make zip(*...) fail to unpack.
feature_freq = {}
if 'summary' in comparison_result and 'feature_frequency' in comparison_result['summary']:
    feature_freq = comparison_result['summary']['feature_frequency']
top_features = sorted(feature_freq.items(), key=lambda item: item[1], reverse=True)[:10]

if top_features:
    features, frequencies = zip(*top_features)
    _plot_ranking_panel(axes[1, 0], features, frequencies,
                        'Feature Selection Frequency', 'Selection Count', 'gold')
else:
    _plot_placeholder(axes[1, 0], 'Feature Selection Frequency',
                      'No selection frequency data available')

# Plot 5: Method Performance Comparison
if 'performance_comparison' in comparison_result:
    perf_data = comparison_result['performance_comparison']
    methods = list(perf_data.keys())
    cv_scores = [perf_data[m]['cv_mean'] for m in methods]
    cv_stds = [perf_data[m]['cv_std'] for m in methods]

    axes[1, 1].bar(methods, cv_scores, yerr=cv_stds, capsize=5, alpha=0.7, color='plum')
    axes[1, 1].set_title('Method Performance Comparison')
    axes[1, 1].set_ylabel('CV Accuracy')
    axes[1, 1].tick_params(axis='x', rotation=45)
    axes[1, 1].grid(axis='y', alpha=0.3)
else:
    # Previously this panel was silently left as blank axes
    _plot_placeholder(axes[1, 1], 'Method Performance Comparison',
                      'No performance data available')

# Plot 6: share of the original columns kept by the ensemble selection
selected_features = ensemble_result['selected_features']
all_features = list(X.columns)
selected_count = sum(1 for f in all_features if f in selected_features)
not_selected_count = len(all_features) - selected_count

axes[1, 2].pie([selected_count, not_selected_count],
               labels=['Selected', 'Not Selected'],
               autopct='%1.1f%%',
               colors=['lightblue', 'lightgray'],
               startangle=90)
axes[1, 2].set_title('Feature Selection Summary')

plt.tight_layout()
plt.show()

## 8. Save Results and Create Selected Dataset

In [None]:
# Have the selector write its results under the feature-selection results folder
saved_files = selector.save_results('../results/feature_selection')

# List each key/path pair returned by the selector, one per line under the heading
print(
    "Saved Files:",
    *(f"{key}: {path}" for key, path in saved_files.items()),
    sep="\n",
)

In [None]:
# Hand X, y and the ensemble-selected feature names to the selector, which writes the CSV
selected_csv = '../data/processed/heart_disease_selected.csv'
selected_features = ensemble_result['selected_features']
dataset_path = selector.create_selected_dataset(X, y, selected_features, selected_csv)

print(f"Selected dataset saved to: {dataset_path}")

# Read the file back from the same path and preview it
selected_data = pd.read_csv(selected_csv)
print(f"\nSelected dataset shape: {selected_data.shape}")
# Every column except the last is listed as a feature — assumes the target is written last (TODO confirm)
print(f"Selected features: {list(selected_data.columns[:-1])}")
print("\nFirst few rows of selected dataset:")
selected_data.head()

In [None]:
# Save selected features information
import json
from pathlib import Path


def _to_jsonable(value):
    """Convert NumPy/pandas objects that ``json`` cannot encode natively.

    ``json.dump`` calls this hook only for objects it cannot serialise itself,
    so values that were already encodable are written exactly as before.

    Raises:
        TypeError: if ``value`` is of a type this hook does not know how to convert.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, (np.ndarray, pd.Index, pd.Series)):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


selected_features_info = {
    'selected_features': ensemble_result['selected_features'],
    'n_features': len(ensemble_result['selected_features']),
    'selection_strategy': 'ensemble',
    # 'performance' is treated as optional where the ensemble result is first printed,
    # so fall back to null here instead of raising KeyError
    'performance': ensemble_result['performance'] if 'performance' in ensemble_result else None,
    'original_features': list(X.columns),
    'n_original_features': len(X.columns),
    'feature_reduction_ratio': len(ensemble_result['selected_features']) / len(X.columns),
    # Per-method selections; methods without a 'selected_features' entry map to an empty list
    'method_comparison': {method: result.get('selected_features', [])
                          for method, result in comparison_result['methods_results'].items()}
}

# Make sure the target folder exists even if the results-saving cell was skipped
output_path = Path('../results/feature_selection/selected_features.json')
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w') as f:
    json.dump(selected_features_info, f, indent=2, default=_to_jsonable)

print("Selected features information saved to: ../results/feature_selection/selected_features.json")

## 9. Summary and Conclusions

In [None]:
# Print a plain-text recap: feature counts, the final subset, its performance,
# a per-method ranking, and the files written by earlier cells.
print("FEATURE SELECTION ANALYSIS SUMMARY")
print("=" * 50)

final_features = ensemble_result['selected_features']
n_original = X.shape[1]

print(f"\nOriginal dataset: {n_original} features")
print(f"Selected features: {len(final_features)} features")
print(f"Reduction ratio: {(1 - len(final_features) / n_original):.1%}")

print("\nFinal selected features:")
for i, feature in enumerate(final_features, 1):
    print(f"{i:2d}. {feature}")

print("\nPerformance with selected features:")
# 'performance' is treated as optional where the ensemble result is first printed;
# apply the same guard here instead of raising KeyError
if 'performance' in ensemble_result:
    perf = ensemble_result['performance']
    print(f"Cross-validation accuracy: {perf['cv_mean']:.4f} ± {perf['cv_std']:.4f}")
    print(f"Training accuracy: {perf['train_accuracy']:.4f}")
else:
    print("Performance metrics not available")

print("\nMethod comparison (by CV accuracy):")
if 'performance_comparison' in comparison_result:
    sorted_methods = sorted(comparison_result['performance_comparison'].items(),
                            key=lambda item: item[1]['cv_mean'], reverse=True)
    # A distinct loop name keeps `perf` bound to the ensemble metrics after this cell
    for i, (method, method_perf) in enumerate(sorted_methods, 1):
        print(f"{i}. {method}: {method_perf['cv_mean']:.4f} ± {method_perf['cv_std']:.4f} "
              f"({method_perf['n_features']} features)")

# NOTE(review): the insights below are fixed text, not derived from the results above
print("\nKey insights:")
print("- Most important features appear consistently across methods")
print("- Ensemble approach balances performance and feature count")
print("- Feature reduction maintains high predictive accuracy")
print("- Selected features represent diverse aspects of heart health")

print("\nFiles created:")
print("- Selected dataset: ../data/processed/heart_disease_selected.csv")
print("- Feature rankings: ../results/feature_selection/")
print("- Selection summary: ../results/feature_selection/selected_features.json")