# Feature Importance
In this notebook we show examples of how to compute and visualise feature importance using random forest models.

### Matplotlib backend
Pick a backend that can display the figures and allows interaction (the sliders used below need an interactive backend).

In [1]:
%matplotlib qt

## Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib.widgets import Slider
import matplotlib.pyplot as plt


## Random Forest Classifier
In this example we are looking at binary data (i.e. True or False)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification # used to make example data

## Make example data

In [8]:
# Seeded generator so the hand-made noise below is identical on every
# Restart & Run All (make_classification is already pinned by random_state).
rng = np.random.default_rng(42)

# shuffle=False keeps the two informative features in columns 0 and 1, which
# are overwritten below, so columns 2-4 really are pure noise.  With the
# default shuffle=True the informative columns can end up at any position.
X, y = make_classification(n_samples=100, n_features=5,
                           n_informative=2, n_redundant=0,
                           n_classes=2, shuffle=False, random_state=42)

# Feature 0: a perfect match to the target variable
X[y == 0, 0] = -1
X[y == 1, 0] = 1

# Feature 1: not as good a match -- start from a clean +2 / -2 encoding ...
X[y == 0, 1] = 2
X[y == 1, 1] = -2

# ... then replace half of each class with uniform noise in [-5, 5):
# the first half of the class-0 rows and the second half of the class-1 rows.
ind = np.where(y == 0)[0]
ind = ind[:len(ind) // 2]
X[ind, 1] = rng.uniform(-5, 5, size=len(ind))
ind = np.where(y == 1)[0]
ind = ind[len(ind) // 2:]
X[ind, 1] = rng.uniform(-5, 5, size=len(ind))

# The remaining features (columns 2-4) are purely random

## Create Bar Chart Feature Importance
We add a slider that adds noise to all features, to show what happens to each feature's importance as it becomes more poorly correlated with the target variable.

In [33]:
# Function to add noise to the data, train the model, and plot feature importances
def plot_feature_importance(noise_level, ax, bar=False):
    """Retrain the classifier on a noisy copy of ``X`` and show its feature importances.

    Reads the module-level ``X`` and ``y``; ``X`` itself is not modified.

    Parameters
    ----------
    noise_level : number
        Passed as ``n`` to ``np.random.binomial`` (with ``p=0.5``); one draw
        is added to every entry of ``X``.
    ax : matplotlib.axes.Axes
        Axes holding the bar chart.  The suptitle of its figure is updated.
    bar : container of bar artists, optional
        Bars returned by an earlier call.  When given (truthy) their heights
        are updated in place instead of drawing new bars.

    Returns
    -------
    The container returned by ``ax.bar`` when ``bar`` is falsy, otherwise ``None``.
    """
    # Adding the noise builds a new array, so the shared X stays untouched
    X_ = X + np.random.binomial(n=noise_level, p=0.5, size=X.shape)
    # Convert to DataFrame for convenience
    X_df = pd.DataFrame(X_, columns=[f'Feature_{i}' for i in range(X.shape[1])])

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.3, random_state=42)

    # Initialize and train RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Keep the importances in column order: the calling cell labels bar
    # position i with a fixed feature name, so sorting by importance would
    # put bars under the wrong tick labels as soon as the ranking changes.
    feature_importances_clf = pd.Series(clf.feature_importances_, index=X_df.columns)

    ax.figure.suptitle(f'Substorm Onsets \n Feature Importances for RandomForestClassifier (Noise Level = {noise_level})')

    if bar:
        # Update existing bars with new heights
        for b, val in zip(bar, feature_importances_clf.values):
            b.set_height(val)
        # Redraw the figure that owns these bars (plt.draw() would redraw
        # whichever figure happens to be current)
        ax.figure.canvas.draw_idle()
    else:
        # Plot the feature importances using a bar chart
        return ax.bar(range(len(feature_importances_clf)), height=feature_importances_clf, color='salmon')


# Create the initial figure and axes
fig = plt.figure(figsize=(8, 6))
ax = fig.add_axes([0.2, 0.1, .7, 0.8])
ax.set_ylabel('Importance')
ax.set_xlabel('Features')
ax.set_xticks(list(range(5)))
ax.set_xticklabels(['AL Index', 'IMF Bz', 'Solar Zenith Angle', 'Ireland Population', 'rainfall'])
# Initial plot with noise level 0
bar = plot_feature_importance(0, ax=ax)

# Create a slider for controlling noise level
axamp = fig.add_axes([0.1, 0.1, 0.0225, 0.8])
noise_slider = Slider(ax=axamp, valmin=0, valmax=20, label='Noise Level', valstep=1, valinit=0, orientation='vertical')
# A widget only stays responsive while something references it.  Later cells
# reuse the name `noise_slider`, so also keep this slider on its own figure.
fig.noise_slider = noise_slider

# Update function for slider
def update(noise_level, ax=ax, bar=bar, fig=fig):
    """Refresh this figure's bars for the noise level chosen on the slider.

    ``ax``, ``bar`` and ``fig`` are bound as defaults when the function is
    defined, so the callback keeps driving this figure even after later cells
    rebind those global names.
    """
    plot_feature_importance(noise_level, ax=ax, bar=bar)
    fig.canvas.draw_idle()

# Call the update function when slider value changes
noise_slider.on_changed(update)

plt.show()


## Create Confusion Matrix
Here we use a confusion matrix to show how good the predictions are. We use a train/test split: the model is fitted on the training set and its accuracy is tested on the test set.

In [35]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Generate example data
# NOTE: this rebinds the module-level X and y that plot_feature_importance
# also reads.
# Seeded generator so the hand-made noise below is identical on every run.
rng = np.random.default_rng(42)

# shuffle=False keeps the two informative features in columns 0 and 1, which
# are overwritten below, so columns 2-4 are pure noise.  With the default
# shuffle=True the informative columns can end up at any position.
X, y = make_classification(n_samples=500, n_features=5,
                           n_informative=2, n_redundant=0,
                           n_classes=2, shuffle=False, random_state=42)

# Make one feature that is a perfect match to the target variable
X[y == 0, 0] = -1
X[y == 1, 0] = 1

# Make a second feature that is not as good a match
X[y == 0, 1] = 2
X[y == 1, 1] = -2

# Replace half of each class in the second feature with uniform noise in
# [-5, 5): the first half of the class-0 rows, the second half of the class-1 rows
ind = np.where(y == 0)[0]
ind = ind[:len(ind) // 2]
X[ind, 1] = rng.uniform(-5, 5, size=len(ind))
ind = np.where(y == 1)[0]
ind = ind[len(ind) // 2:]
X[ind, 1] = rng.uniform(-5, 5, size=len(ind))

# Train on a noisy copy of the data and return test-set truth and predictions
def get_vals(noise_level):
    """Fit a random forest on a noise-perturbed copy of ``X`` and predict the test split.

    Reads the module-level ``X`` and ``y``.  ``noise_level`` is passed as ``n``
    to ``np.random.binomial`` (``p=0.5``) and one draw is added to every entry.

    Returns
    -------
    (y_test, y_pred)
        The held-out labels (30% split) and the classifier's predictions for them.
    """
    # Perturb a copy so the shared X is left untouched
    noisy = X.copy() + np.random.binomial(n=noise_level, p=0.5, size=X.shape)

    # Wrap in a DataFrame with one generic name per column
    column_names = [f'Feature_{idx}' for idx in range(noisy.shape[1])]
    features = pd.DataFrame(noisy, columns=column_names)

    # Hold out 30% of the rows for testing
    train_X, test_X, train_y, test_y = train_test_split(
        features, y, test_size=0.3, random_state=42
    )

    # Fit the forest on the training rows, then predict the held-out rows
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_X, train_y)
    return test_y, forest.predict(test_X)

# Redraw the confusion matrix on the given axes for one noise level
def plot_confusion_matrix(noise, ax, overwrite_image=None, colorbar=False):
    """Draw the confusion matrix for ``noise`` on ``ax`` and return the display object.

    Parameters
    ----------
    noise : number
        Noise level forwarded to ``get_vals``.
    ax : matplotlib.axes.Axes
        Axes to draw on; cleared before plotting.
    overwrite_image : optional
        Accepted but not used by this function.
    colorbar : bool
        Forwarded to ``ConfusionMatrixDisplay.plot``.
    """
    truth, predicted = get_vals(noise)

    # Start from empty axes, then title them with the current noise level
    ax.clear()
    ax.set_title(f'Confusion Matrix (Noise Level = {noise})')

    cm_display = ConfusionMatrixDisplay(
        confusion_matrix(truth, predicted),
        display_labels=["No Onset", "Onset"],
    )
    # Fixed colour limits so shades are comparable between noise levels
    image_options = {'vmin': 0, 'vmax': 100}
    return cm_display.plot(
        ax=ax,
        cmap='Blues',
        values_format='d',
        im_kw=image_options,
        colorbar=colorbar,
    )


# Create the initial figure and axes
fig, ax = plt.subplots(figsize=(8, 6))
# Initial plot with noise level 0 (the only call that draws the colorbar)
im = plot_confusion_matrix(0, ax=ax, colorbar=True)

# Create a slider for controlling noise level
axamp = fig.add_axes([0.1, 0.1, 0.0225, 0.8])
noise_slider = Slider(ax=axamp, valmin=0, valmax=20, label='Noise Level', valstep=1, valinit=0, orientation='vertical')
# A widget only stays responsive while something references it.  Other cells
# reuse the name `noise_slider`, so also keep this slider on its own figure.
fig.noise_slider = noise_slider

# Update function for slider
def update(noise_level, ax=ax, fig=fig):
    """Redraw this figure's confusion matrix for the selected noise level.

    ``ax`` and ``fig`` are bound as defaults when the function is defined, so
    the callback keeps driving this figure even after other cells rebind
    those global names.
    """
    plot_confusion_matrix(noise_level, ax=ax)
    fig.canvas.draw_idle()

# Call the update function when slider value changes
noise_slider.on_changed(update)
plt.show()


## Random Forest Regressor
In this example we are looking at continuous data

In [45]:
from sklearn.ensemble import RandomForestRegressor

# Number of samples and the regression target (standard-normal draws).
# A seeded generator makes y_regression identical on every run.
n_samples = 100
y_regression = np.random.default_rng(42).standard_normal(n_samples)

# Function to generate the data, fit the regressor, and plot feature importances
def plot_model_accuracy(noise_level, ax, bar=False):
    """Fit a RandomForestRegressor on synthetic features and show its feature importances.

    Despite its name, this variant plots feature importances, not model
    accuracy.  It reads the module-level ``n_samples`` and ``y_regression``
    and resets NumPy's global seed on every call.

    Parameters
    ----------
    noise_level : float
        Standard deviation of the extra Gaussian noise added to the one
        feature that is built from the target.
    ax : matplotlib.axes.Axes
        Axes holding the bar chart.  The suptitle of its figure is updated.
    bar : container of bar artists, optional
        Bars returned by an earlier call.  When given (truthy) their heights
        are updated in place instead of drawing new bars.

    Returns
    -------
    The container returned by ``ax.bar`` when ``bar`` is falsy, otherwise ``None``.
    """
    # Reset the global seed so each call draws the same random features and
    # only noise_level differs between calls
    np.random.seed(42)

    # Number of features
    n_features = 5

    # Low importance feature - random noise, unrelated to the target
    low_importance_feature = np.random.randn(n_samples)

    # Remaining features (random noise)
    other_features = np.random.randn(n_samples, n_features - 2)

    # High importance feature - built from the target (5 * y) plus a little noise
    high_importance_feature = 5 * y_regression + np.random.randn(n_samples) * 0.5

    # Add noise based on the noise_level slider value
    high_importance_feature += np.random.normal(0, noise_level, size=high_importance_feature.shape)

    # Combine features into a DataFrame
    X_regression = pd.DataFrame(np.column_stack([high_importance_feature, low_importance_feature, other_features]),
                                columns=['High_Importance_Feature', 'Low_Importance_Feature'] + [f'Feature_{i}' for i in range(n_features - 2)])

    # Split data into training and testing sets
    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, y_regression, test_size=0.3, random_state=42)

    # Initialize and train RandomForestRegressor
    reg = RandomForestRegressor(n_estimators=100, random_state=42)
    reg.fit(X_train_reg, y_train_reg)

    # Keep the importances in column order: bars sit at fixed x positions, so
    # sorting by importance would move values under different tick labels
    # whenever the ranking changes.
    feature_importances_reg = pd.Series(reg.feature_importances_, index=X_regression.columns)
    ax.figure.suptitle(f'Feature Importances for RandomForestRegressor (Noise Level = {noise_level})')

    if bar:
        # Update existing bars with new heights
        for b, val in zip(bar, feature_importances_reg.values):
            b.set_height(val)
        # Redraw the figure that owns these bars (plt.draw() would redraw
        # whichever figure happens to be current)
        ax.figure.canvas.draw_idle()
    else:
        # Plot the feature importances using a bar chart
        return ax.bar(range(len(feature_importances_reg)), height=feature_importances_reg, color='salmon')


# Create the figure and axes for the regressor feature importances
fig = plt.figure()
ax = fig.add_axes([0.2, 0.1, .5, 0.8])
ax.set_ylabel('Importance')
ax.set_xlabel('Features')
ax.set_xticks(list(range(5)))
ax.set_xticklabels(['AL Index', 'IMF Bz', 'Solar Zenith Angle', 'Ireland Population', 'rainfall'])
# Initial plot with noise level 0.  Use the regressor function defined in this
# cell (plot_model_accuracy with the `bar` argument), not the classifier's
# plot_feature_importance.
bar = plot_model_accuracy(0, ax=ax)

# Create a slider for controlling noise level
axamp = fig.add_axes([0.1, 0.1, 0.0225, 0.8])
noise_slider = Slider(ax=axamp, valmin=0.0, valmax=20.0, label='Noise Level', valinit=0, orientation='vertical')
# A widget only stays responsive while something references it.  Other cells
# reuse the name `noise_slider`, so also keep this slider on its own figure.
fig.noise_slider = noise_slider

def update(noise_level, ax=ax, bar=bar, fig=fig, plot=plot_model_accuracy):
    """Refresh this figure's bars for the noise level chosen on the slider.

    Everything the callback needs is bound as a default when the function is
    defined: a later cell redefines ``plot_model_accuracy`` with a different
    signature and rebinds ``fig``/``ax``, which would otherwise break it.
    """
    plot(noise_level, ax=ax, bar=bar)
    fig.canvas.draw_idle()
noise_slider.on_changed(update)


0

## Model Accuracy
Here we test the model accuracy with a scatter plot of the model predictions against the true values.

In [46]:
from sklearn.ensemble import RandomForestRegressor

# Number of samples and the regression target (standard-normal draws).
# A seeded generator makes y_regression identical on every run.
n_samples = 100
y_regression = np.random.default_rng(42).standard_normal(n_samples)

# Function to generate the data, fit the regressor, and plot predictions against truth
def plot_model_accuracy(noise_level, ax, scatter=False):
    """Fit a RandomForestRegressor on synthetic features and scatter predictions vs. truth.

    Reads the module-level ``n_samples`` and ``y_regression`` and resets
    NumPy's global seed on every call.

    Parameters
    ----------
    noise_level : float
        Standard deviation of the extra Gaussian noise added to the one
        feature that is built from the target.
    ax : matplotlib.axes.Axes
        Axes holding the scatter plot.  The suptitle of its figure is updated.
    scatter : scatter artist, optional
        Artist returned by an earlier call.  When given (truthy) its points
        are moved in place instead of drawing a new scatter.

    Returns
    -------
    The artist returned by ``ax.scatter`` when ``scatter`` is falsy, otherwise ``None``.
    """
    # Reset the global seed so each call draws the same random features and
    # only noise_level differs between calls
    np.random.seed(42)

    # Number of features
    n_features = 5

    # Low importance feature - random noise, unrelated to the target
    low_importance_feature = np.random.randn(n_samples)

    # Remaining features (random noise)
    other_features = np.random.randn(n_samples, n_features - 2)

    # High importance feature - built from the target (5 * y) plus a little noise
    high_importance_feature = 5 * y_regression + np.random.randn(n_samples) * 0.5

    # Add noise based on the noise_level slider value
    high_importance_feature += np.random.normal(0, noise_level, size=high_importance_feature.shape)

    # Combine features into a DataFrame
    X_regression = pd.DataFrame(np.column_stack([high_importance_feature, low_importance_feature, other_features]),
                                columns=['High_Importance_Feature', 'Low_Importance_Feature'] + [f'Feature_{i}' for i in range(n_features - 2)])

    # Split data into training and testing sets
    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, y_regression, test_size=0.3, random_state=42)

    # Initialize and train RandomForestRegressor
    reg = RandomForestRegressor(n_estimators=100, random_state=42)
    reg.fit(X_train_reg, y_train_reg)

    # Predict the held-out targets
    y_pred = reg.predict(X_test_reg)

    ax.figure.suptitle(f'Model Fit for RandomForestRegressor (Noise Level = {noise_level})')

    if scatter:
        # Move the existing points: x = true value, y = prediction
        scatter.set_offsets(np.array([y_test_reg, y_pred]).T)
        # Redraw the figure that owns the scatter (plt.draw() would redraw
        # whichever figure happens to be current)
        ax.figure.canvas.draw_idle()
    else:
        return ax.scatter(y_test_reg, y_pred, color='salmon')


# Create the figure and axes for the truth-vs-model scatter plot
fig = plt.figure()
ax = fig.add_axes([0.2, 0.1, .5, 0.8])
ax.set_ylabel('Model')
ax.set_xlabel('Truth')
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
ax.set_aspect('equal')
# Initial plot with noise level 0
scatter = plot_model_accuracy(0, ax=ax)

# Create a slider for controlling noise level
axamp = fig.add_axes([0.1, 0.1, 0.0225, 0.8])
noise_slider = Slider(ax=axamp, valmin=0.0, valmax=20.0, label='Noise Level', valinit=0, orientation='vertical')
# A widget only stays responsive while something references it.  Other cells
# reuse the name `noise_slider`, so also keep this slider on its own figure.
fig.noise_slider = noise_slider

def update(noise_level, ax=ax, scatter=scatter, fig=fig, plot=plot_model_accuracy):
    """Move this figure's scatter points for the noise level chosen on the slider.

    The axes, artist, figure and plotting function are bound as defaults when
    the function is defined, so the callback keeps working if those global
    names are rebound afterwards.
    """
    plot(noise_level, ax=ax, scatter=scatter)
    fig.canvas.draw_idle()
noise_slider.on_changed(update)


0