In [None]:
# Assess feature importance using a random forest from scikit-learn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Load the data set together with the random forest flavour that matches its target.
# wine and breast_cancer carry class labels, so they need a classifier; the diabetes
# target is a quantitative disease-progression score, so it needs a regressor.
# To switch data sets, comment/uncomment whole lines so loader and estimator stay paired.
#data, forest_cls = datasets.load_wine(), RandomForestClassifier
#data, forest_cls = datasets.load_breast_cancer(), RandomForestClassifier
data, forest_cls = datasets.load_diabetes(), RandomForestRegressor
X = data.data
y = data.target

# Split the data into a training and test set (70 % / 30 %, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the random forest on the training portion only.
# Fitting a classifier on the diabetes scores would silently treat every distinct
# score as its own unordered class, which makes the resulting importances meaningless.
# The fitted model stays bound to `clf` because the code further down reads it under that name.
clf = forest_cls(n_estimators=500, random_state=42)
clf.fit(X_train, y_train)

# Print the feature ranking as a list
print("Feature ranking (descending importance):")
# Map every feature name to the importance the fitted forest assigned to it
ranking = dict(zip(data.feature_names, clf.feature_importances_))
# Walk the features from most to least important; ranks are printed 1-based
ordered = sorted(ranking.items(), key=lambda item: item[1], reverse=True)
for rank, (feature, importance) in enumerate(ordered, start=1):
    print(f"{rank:2d}) {importance:.4f} {feature}")

# Horizontal bar chart of the forest's feature importances:
# feature names run along the y-axis, importance values along the x-axis.
# barh draws position 0 at the bottom, so sorting in ascending order
# leaves the most important feature on the top bar.
feature_importance = clf.feature_importances_
indices = np.argsort(feature_importance)

# Error bars: spread (standard deviation) of each feature's importance across the individual trees
per_tree_importance = np.array([tree.feature_importances_ for tree in clf.estimators_])
std = per_tree_importance.std(axis=0)

bar_positions = range(X.shape[1])
sorted_names = np.array(data.feature_names)[indices]

fig, ax = plt.subplots()
ax.set_title("Feature importance")
ax.set_yticks(bar_positions)
ax.set_yticklabels(sorted_names)
ax.barh(bar_positions, feature_importance[indices], xerr=std[indices], align="center")

plt.show()


In [None]:
# Plot the correlation matrix of all features plus the target
# (pandas and seaborn are imported in the notebook's import cell at the top)
df = pd.DataFrame(X, columns=data.feature_names)
df["target"] = y
correlation_matrix = df.corr()

# Grow the figure with the number of variables so the cell annotations stay legible
side = max(6.0, 0.7 * len(correlation_matrix))
fig, ax = plt.subplots(figsize=(side, side))
# Pin the colour scale to the full [-1, 1] correlation range: otherwise the diverging
# colormap is stretched over the observed min/max and its neutral colour no longer
# corresponds to zero correlation.
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title("Correlation matrix (features and target)")
plt.show()