# Variable Selection Analysis

**Date:** 25 September 2024

**Topic:** Feature selection techniques for regression models

This notebook demonstrates:
1. Correlation Analysis
2. Mutual Information for Feature Selection
3. Interactive Visualization Techniques
4. California Housing Dataset Analysis

## 1. Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
import plotly.graph_objects as go

# Set plotting parameters
# Order matters: the matplotlib style is selected first, then seaborn's
# "husl" palette is installed as the default colour palette for later plots.
plt.style.use('default')
sns.set_palette("husl")

### 1.1 California Housing Dataset
Load the California Housing dataset, which contains median house values together with demographic and geographic features.

In [None]:
# Fetch the California Housing data as pandas objects (as_frame=True)
california = fetch_california_housing(as_frame=True)

# Report the size of the feature table, the target name and the feature list
sample_count, feature_count = california.data.shape
print("Dataset Information:")
print(f"Number of samples: {sample_count}")
print(f"Number of features: {feature_count}")
print(f"Target variable: {california.target_names[0]}")
print("\nFeature names:")
for position, feature_name in enumerate(california.feature_names, start=1):
    print(f"{position}. {feature_name}")

### 1.2 Data Preparation

In [None]:
# Build a single frame: the feature columns followed by the target column
feature_frame = california.data
target_column = california.target
df = pd.concat([feature_frame, target_column], axis=1)

# Persist a CSV copy (index omitted) in the working directory for later reuse
df.to_csv("housing.csv", index=False)

# Quick look at the combined frame; the trailing expression renders the preview
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
df.head()

### 1.3 Data Quality Check

In [None]:
# Check data types
print("Data types:")
print(df.dtypes)
print("\n" + "="*50)

# Count missing values per column
print("\nMissing values per column:")
missing_values = df.isna().sum()
print(missing_values)

# Report the overall verdict. The status markers were stored as mojibake
# (UTF-8 emoji decoded with the wrong codec); the intended characters are
# restored here so the output is readable.
total_missing = missing_values.sum()
if total_missing == 0:
    print("\n✅ No missing values found in the dataset")
else:
    print(f"\n⚠️ Total missing values: {total_missing}")

## 2. Correlation Analysis

### 2.1 Correlation Matrix Calculation

In [None]:
# Calculate the pairwise correlation matrix over every numeric column of df
# (all rows of the combined frame are used here)
numeric_features = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_features.corr()

# Correlation of each feature with the target, strongest (by magnitude) first.
# The target's correlation with itself is dropped.
target_correlations = (
    correlation_matrix['MedHouseVal']
    .drop('MedHouseVal')
    .sort_values(key=abs, ascending=False)
)

print("Correlation with Target Variable (MedHouseVal):")
print("=" * 45)
for feature, corr in target_correlations.items():
    # Direction markers were stored as mojibake; the intended emoji are restored.
    direction = "📈" if corr > 0 else "📉"
    # Strength buckets on |r|: > 0.5 strong, > 0.3 moderate, otherwise weak
    strength = "Strong" if abs(corr) > 0.5 else "Moderate" if abs(corr) > 0.3 else "Weak"
    print(f"{feature:12}: {corr:6.3f} {direction} ({strength})")

### 2.2 Interactive Correlation Heatmap

In [None]:
# Create interactive correlation heatmap using Plotly
fig = go.Figure()

fig.add_trace(
    go.Heatmap(
        z=correlation_matrix.values,
        x=correlation_matrix.columns,
        y=correlation_matrix.columns,
        # Cell annotations: the coefficients rounded to three decimals
        text=np.around(correlation_matrix.values, decimals=3),
        texttemplate="%{text}",
        textfont={"size": 10},
        colorscale="RdBu",
        zmid=0,  # centre the diverging colour scale on zero correlation
        colorbar=dict(
            # The flat `titleside` attribute is deprecated and rejected by
            # newer Plotly releases; the nested title.side form replaces it.
            title=dict(text="Correlation", side="right")
        )
    )
)

fig.update_layout(
    title={
        'text': "Feature Correlation Matrix - California Housing Dataset",
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis_title="Features",
    yaxis_title="Features",
    width=800,
    height=700,
    font=dict(size=12)
)

fig.show()

### 2.3 Static Correlation Visualization

In [None]:
# Static lower-triangle heatmap of the correlation matrix (seaborn)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))

# Cells where the mask is True are hidden: the upper triangle and the diagonal
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt='.3f',
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)
sns.heatmap(correlation_matrix, ax=heatmap_ax, **heatmap_options)

heatmap_ax.set_title('Correlation Matrix - Lower Triangle', fontsize=16, pad=20)
heatmap_fig.tight_layout()
plt.show()

## 3. Mutual Information Analysis

### 3.1 Data Splitting for Analysis

In [None]:
# Hold out 25% of the rows; the fixed random_state keeps the split repeatable
split_parts = train_test_split(
    california.data,
    california.target,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Report the resulting partition sizes
train_rows, train_columns = X_train.shape
test_rows = X_test.shape[0]
print(f"Training set size: {train_rows} samples")
print(f"Test set size: {test_rows} samples")
print(f"Feature dimensions: {train_columns} features")

### 3.2 Mutual Information Calculation
Mutual information measures the dependency between variables, capturing both linear and non-linear relationships.

In [None]:
# Calculate mutual information between each training feature and the target.
# random_state is fixed so the estimate is repeatable.
mi_scores = mutual_info_regression(X_train, y_train, random_state=42)
mi_series = pd.Series(mi_scores, index=X_train.columns).sort_values(ascending=False)

# Display mutual information results
print("Mutual Information Scores:")
print("=" * 35)
mi_df = mi_series.to_frame("mutual_information")
mi_df['rank'] = range(1, len(mi_df) + 1)  # 1 = highest score (series is sorted descending)
mi_df = mi_df[['rank', 'mutual_information']]

# Iterate column-wise rather than with iterrows(): iterrows() builds one Series
# per row, which upcasts the integer rank to float and prints it as "1.0."
for feature, rank, score in zip(mi_df.index, mi_df['rank'], mi_df['mutual_information']):
    print(f"{rank:2}. {feature:12}: {score:.4f}")

display(mi_df)

### 3.3 Mutual Information Visualization

In [None]:
# Horizontal bar chart of the mutual information scores, one bar per feature
mi_fig, mi_ax = plt.subplots(figsize=(10, 6))
bar_positions = range(len(mi_series))
bars = mi_ax.barh(bar_positions, mi_series.values, color='skyblue', alpha=0.7)

# Label each bar position with its feature name
mi_ax.set_yticks(bar_positions)
mi_ax.set_yticklabels(mi_series.index)
mi_ax.set_xlabel('Mutual Information Score')
mi_ax.set_title('Feature Importance: Mutual Information with Target Variable')
mi_ax.grid(axis='x', alpha=0.3)

# Write the numeric score just past the end of each bar
for position, score in enumerate(mi_series.tolist()):
    mi_ax.text(score + 0.001, position, f'{score:.3f}', va='center', fontsize=9)

mi_fig.tight_layout()
plt.show()

### 3.4 Alternative Visualization - Vertical Bar Plot

In [None]:
# Vertical bar chart of the same scores, drawn with seaborn
plt.figure(figsize=(10, 6))
# NOTE(review): newer seaborn releases may warn when `palette` is passed
# without `hue` - confirm against the installed version.
ax = sns.barplot(x=mi_series.index, y=mi_series.values, palette='viridis')

# Axis cosmetics: slanted feature names, labels, title and a light y-grid
plt.xticks(rotation=45, ha="right")
ax.set_ylabel('Mutual Information Score')
ax.set_xlabel('Features')
ax.set_title('Mutual Information Scores for Regression Features')
ax.grid(axis='y', alpha=0.3)

# Annotate every bar with its score, placed slightly above the bar top
for bar_index, bar_height in enumerate(mi_series.values):
    ax.text(bar_index, bar_height + 0.01, f'{bar_height:.3f}',
            ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## 4. Comparison: Correlation vs Mutual Information

### 4.1 Side-by-Side Comparison

In [None]:
# Align both score series on feature name, add |correlation| for ranking,
# and order the rows by mutual information (highest first)
comparison_df = (
    pd.DataFrame({
        'Correlation': target_correlations,
        'Mutual_Information': mi_series,
    })
    .assign(Abs_Correlation=lambda frame: frame['Correlation'].abs())
    .sort_values('Mutual_Information', ascending=False)
)

# Rank 1 = largest value under each criterion
mi_ranks = comparison_df['Mutual_Information'].rank(ascending=False)
corr_ranks = comparison_df['Abs_Correlation'].rank(ascending=False)

# Plain-text table: header, separator, then one line per feature
print("Feature Ranking Comparison:")
print("=" * 60)
print(f"{'Feature':<12} {'Correlation':<12} {'MI Score':<10} {'MI Rank':<8} {'Corr Rank':<10}")
print("-" * 60)

for feature in comparison_df.index:
    corr_value = comparison_df.at[feature, 'Correlation']
    mi_value = comparison_df.at[feature, 'Mutual_Information']
    mi_rank = int(mi_ranks[feature])
    corr_rank = int(corr_ranks[feature])
    print(f"{feature:<12} {corr_value:>9.3f}   {mi_value:>7.3f}   {mi_rank:>5}     {corr_rank:>7}")

display(comparison_df[['Correlation', 'Mutual_Information']].round(4))

### 4.2 Scatter Plot Comparison

In [None]:
# Scatter of |correlation| (x) against mutual information (y), one point per feature
cmp_fig, cmp_ax = plt.subplots(figsize=(10, 8))
abs_corr_values = comparison_df['Abs_Correlation']
mi_values = comparison_df['Mutual_Information']

scatter = cmp_ax.scatter(
    abs_corr_values,
    mi_values,
    s=100,
    alpha=0.7,
    c=range(len(comparison_df)),  # one colour index per row
    cmap='tab10',
)

# Name each point, offset slightly so the text does not cover the marker
for feature, x_pos, y_pos in zip(comparison_df.index, abs_corr_values, mi_values):
    cmp_ax.annotate(
        feature,
        (x_pos, y_pos),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=10,
        ha='left',
    )

cmp_ax.set_xlabel('Absolute Correlation with Target')
cmp_ax.set_ylabel('Mutual Information Score')
cmp_ax.set_title('Feature Selection: Correlation vs Mutual Information')
cmp_ax.grid(True, alpha=0.3)

# Dashed y = x reference line spanning the larger of the two maxima
max_val = max(abs_corr_values.max(), mi_values.max())
cmp_ax.plot([0, max_val], [0, max_val], 'r--', alpha=0.5, label='Perfect Agreement')
cmp_ax.legend()

cmp_fig.tight_layout()
plt.show()

## 5. Feature Selection Recommendations

### 5.1 Top Features by Different Criteria

In [None]:
# Define selection criteria
n_features = 5

# Top features by mutual information (mi_series is already sorted descending)
top_mi_features = mi_series.head(n_features).index.tolist()

# Top features by absolute correlation
top_corr_features = target_correlations.abs().sort_values(ascending=False).head(n_features).index.tolist()

# Combined approach (average of the two ranks). A stable sort makes the
# tie-break explicit: features with equal combined rank keep the current
# row order of comparison_df.
comparison_df['Combined_Rank'] = (mi_ranks + corr_ranks) / 2
top_combined_features = (
    comparison_df.sort_values('Combined_Rank', kind='stable').head(n_features).index.tolist()
)

print(f"Top {n_features} Feature Recommendations:")
print("=" * 50)

print("\n1. By Mutual Information:")
for i, feature in enumerate(top_mi_features, 1):
    score = mi_series[feature]
    print(f"   {i}. {feature:12} (MI: {score:.4f})")

print("\n2. By Absolute Correlation:")
for i, feature in enumerate(top_corr_features, 1):
    score = abs(target_correlations[feature])
    print(f"   {i}. {feature:12} (|Corr|: {score:.4f})")

print("\n3. By Combined Ranking:")
for i, feature in enumerate(top_combined_features, 1):
    mi_score = mi_series[feature]
    corr_score = target_correlations[feature]
    print(f"   {i}. {feature:12} (MI: {mi_score:.4f}, Corr: {corr_score:.4f})")

# Find consensus features: those present in all three top-n lists
consensus_features = set(top_mi_features) & set(top_corr_features) & set(top_combined_features)
# Print in combined-rank order. Converting the set straight to a list gives an
# order that can change between kernel sessions, making the output unrepeatable.
consensus_ordered = [name for name in top_combined_features if name in consensus_features]
# The leading marker was stored as mojibake; the intended emoji is restored.
print(f"\n🎯 Consensus Features (appear in all top-{n_features} lists): {consensus_ordered}")

## 6. Summary and Insights

### Key Findings

In [None]:
# Summary statistics
# The section markers and bullets were stored as mojibake (UTF-8 decoded with
# the wrong codec); the intended characters are restored throughout.
print("VARIABLE SELECTION ANALYSIS SUMMARY")
print("=" * 50)

print("\n📊 Dataset Overview:")
print(f"   • Total samples: {df.shape[0]:,}")
print(f"   • Total features: {df.shape[1]-1}")  # every column except the target
print(f"   • Target variable: {california.target_names[0]}")

print("\n🔍 Correlation Analysis:")
strongest_corr = target_correlations.abs().max()
strongest_feature = target_correlations.abs().idxmax()
# The signed coefficient is shown so the direction of the relationship is visible
print(f"   • Strongest correlation: {strongest_feature} ({target_correlations[strongest_feature]:.3f})")
print(f"   • Average absolute correlation: {target_correlations.abs().mean():.3f}")

print("\n🎯 Mutual Information Analysis:")
best_mi_feature = mi_series.idxmax()
best_mi_score = mi_series.max()
print(f"   • Highest MI score: {best_mi_feature} ({best_mi_score:.4f})")
print(f"   • Average MI score: {mi_series.mean():.4f}")

# Measure the agreement between the two methods instead of asserting it:
# list the features that appear in both top-n selections.
shared_top_features = [name for name in top_mi_features if name in top_corr_features]
shared_summary = ', '.join(shared_top_features) if shared_top_features else 'none'

print("\n💡 Recommendations:")
print(f"   • Primary features: {', '.join(top_combined_features[:3])}")
print(
    f"   • Method: correlation and mutual information share "
    f"{len(shared_top_features)} of their top-{n_features} features ({shared_summary})"
)
print(f"   • Consider: {strongest_feature} shows strongest linear relationship")
print("   • Note: Mutual information captures non-linear relationships better than correlation")