In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Attribute names for the data file, in column order. The file is read without
# a header row, so these become the DataFrame's column labels.
column_names = [
    'class',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

# header=None: treat the first line of the file as data, not as column names.
data = pd.read_csv('agaricus-lepiota.data', names=column_names, header=None)

# Replace the class labels with integer codes, overwriting the column in place.
label_encoder = LabelEncoder()
data['class'] = label_encoder.fit_transform(data['class'])

# One-hot encode the remaining columns (the target is dropped first).
data_encoded = pd.get_dummies(data.drop(columns='class'))

# Feature matrix and target vector.
X, y = data_encoded, data['class']

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Fit a 100-tree random forest on the training split (seeded for repeatability).
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)

# Importance score of each one-hot column, ranked from highest to lowest.
importances = clf.feature_importances_
feature_importances = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print(feature_importances.head(10))

# Keep the columns whose importance is at or above the mean importance.
# prefit=True reuses the forest fitted above instead of fitting a new one.
selector = SelectFromModel(clf, prefit=True, threshold='mean')
X_selected = selector.transform(X)

print(f"Number of selected features: {X_selected.shape[1]}")

# Standardize the one-hot columns before PCA so each enters on the same scale.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_encoded)

# Un-truncated PCA: used only to obtain the full explained-variance spectrum.
pca = PCA()
pca.fit(data_scaled)

explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# argmax on a boolean array returns the first True position, i.e. the first
# component index at which the running total reaches 95%; +1 makes it a count.
n_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of components to explain 95% variance: {n_components}")

# Pin the exact solver. With the default svd_solver='auto' and no random_state,
# scikit-learn may choose the randomized solver for a truncated fit on data of
# this size, which makes the loadings (and the ranking printed below)
# approximate and potentially different from run to run.
pca_reduced = PCA(n_components=n_components, svd_solver='full')
pca_reduced.fit(data_scaled)

# Rows are principal components, columns are the one-hot features.
loadings = pca_reduced.components_
loading_matrix = pd.DataFrame(loadings, columns=data_encoded.columns)

# Score each feature by the sum of its absolute loadings over the kept components.
# NOTE(review): every component counts equally here; the sum is not weighted by
# explained variance, so low-variance components influence the ranking as much
# as the leading ones. Confirm this is the intended measure.
feature_importance = loading_matrix.abs().sum(axis=0)
top_features = feature_importance.sort_values(ascending=False)

print("Top features contributing to variance:")
print(top_features.head(10))

# Written interpretation of the two rankings; sep="\n" puts each sentence
# fragment on its own output line.
print(
    "There is not much overlap between features which are most discriminative and those which cause much variance,",
    "which could mean that the features that cause much of the variance also do a poor job at being classifiers, ",
    "so we could remove these and likely not lose much of the accuracy of our model.",
    sep="\n",
)

                       Feature  Importance
27                      odor_n    0.145422
36                 gill-size_n    0.072117
24                      odor_f    0.068575
35                 gill-size_b    0.055724
96         spore-print-color_h    0.050855
61  stalk-surface-below-ring_k    0.049323
37                gill-color_b    0.038629
57  stalk-surface-above-ring_k    0.033862
21                   bruises_t    0.029547
94                 ring-type_p    0.027991
Number of selected features: 26




Number of components to explain 95% variance: 59
Top features contributing to variance:
gill-color_u     5.196515
cap-shape_s      5.126763
gill-color_k     4.941647
cap-color_b      4.940531
cap-surface_f    4.706292
cap-color_c      4.638111
habitat_m        4.503917
cap-surface_y    4.495514
cap-color_p      4.468420
cap-color_w      4.427859
dtype: float64
There is not much overlap between features which are most discriminative and those which cause much variance,
which could mean that the features that cause much of the variance also do a poor job at being classifiers, 
so we could remove these and likely not lose much of the accuracy of our model.
