In [1]:
import os
import json
import pandas as pd
import numpy as np

import ast

import pickle 
import matplotlib.pyplot as plt 

from sklearn.feature_selection import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


from sklearn.base import TransformerMixin
from sklearn.metrics import *

In [2]:
X = np.loadtxt("data/processed/filled.np.gz")

In [3]:
with open("data/processed/processed_data.json") as jfile:
    df = json.load(jfile)["df"]
    df = pd.DataFrame(df)

print(df)

      Age Gender Status                                           features
0      28      F      H  [32.385882764364446, -43.052354272490966, 12.5...
1      24      M      S  [19.56469203834567, 5.210403728117817, -0.4542...
2      29      M      N  [-11.747426683495547, -16.025053894630748, 2.9...
3      28      M      H  [24.528365943690584, -17.436003818998074, 15.6...
4      25      M      N  [26.801209609932265, -43.94187271754049, -3.43...
...   ...    ...    ...                                                ...
2600   21      F      N  [-33.801597619138526, -31.4414974953694, -14.5...
2601   24      F      H  [3.6372571370198563, -14.181891019636248, 2.48...
2602   12      F      H  [14.692804659916057, -0.08902988676968887, -19...
2603   25      M      H  [-24.018674240475235, 4.557305777329094, -9.10...
2604    6      M      H  [8.843705696375519, 56.15470172873436, -54.477...

[2545 rows x 4 columns]


In [4]:
with open("data/processed/indices.json") as jfile:
    indices = json.load(jfile)


# We created a set of indices for each subset, and it's important to keep 
# Test indices seperate from our hyperparameter tunings,
# Only validation is tested for feature selection and etc

train_indices = np.array(indices["train_indices"])
val_indices = np.array(indices["val_indices"])
test_indices = np.array(indices["test_indices"])

X_train = X[train_indices]
X_val = X[val_indices]
X_test = X[test_indices]


feature_dims = np.array(indices["feature_dims"])
features = "f1 f2 f3 f4 f5".split()


# Feature Ranking

In [5]:
gender_encoder = LabelEncoder().fit(df["Gender"])
status_encoder = LabelEncoder().fit(df["Status"])

genders = gender_encoder.transform(df["Gender"])
status = status_encoder.transform(df["Status"])

gender_onehot = OneHotEncoder().fit_transform(genders.reshape(-1, 1))
status_onehot = OneHotEncoder().fit_transform(status.reshape(-1, 1))


In [6]:
# Boundaries in the X matrix, each feature set starts from a start_column and ends in a end_column
feature_boundaries = {}
start_idx = 0
for f, feature_len in zip(features, feature_dims):
    feature_boundaries[f] = (start_idx, start_idx + feature_len)
    start_idx += feature_len
feature_boundaries

{'f1': (0, 512),
 'f2': (512, 2048),
 'f3': (2048, 4352),
 'f4': (4352, 6912),
 'f5': (6912, 7116)}

## Feature Set Importance Using Random Forests


We use a random forest clf to classify the most important features for each label, starting with age.

First we calculate a "leave feature out" error rate, to compare, then we calculate direct feature importances on a PCA projected set of each original set, and show some metrics. 

First we define some custom helper classes.

In [7]:
# Defining a custom pca for cross validation

class CustomPCA(TransformerMixin):
    def __init__(self, indices, explained_var=0.8):
        super().__init__()
        self.explained_var = explained_var
        self.indices = indices
        self.pcas = {}

    def fit(self, X, y=None, **kwargs):
        for k in self.indices:
            start_col, end_col = self.indices[k]
            self.pcas[k] = Pipeline(
                [
                    ("scaler", StandardScaler()),
                    ("pca", PCA(self.explained_var))
                ]
            )
            self.pcas[k].fit_transform(X[:, start_col:end_col])
        return self
        
    def transform(self, X, **kwargs):
        X_transform = []
        for k in self.indices:
            start_col, end_col = self.indices[k]
            X_transform.append(self.pcas[k].transform(X[:, start_col:end_col]))
        return np.concatenate(X_transform, axis=-1)

In [8]:
class ProbaScorer:
    def __init__(self):
        pass

    def __call__(self, clf, X, y):
        '''
            calls predict_proba function of clf and computes
            a bayesian mse for the clf
        '''
        probs = clf.predict_proba(X)
        n_class = len(np.unique(y))
        one_hot = np.zeros((len(X), n_class))
        one_hot[:, y] = 1
        mse = ((probs - one_hot) ** 2).mean()
        return mse

In [9]:
class FeatureImportanceScorer:
    def __init__(self, indices):
        self.indices = indices
    
    def __call__(self, clf, X=None, y=None):
        '''
            reporting mean, max and std of importances of random forest
        '''
        result = []
        for f in self.indices:
            start_col, end_col = self.indices[f]
            importances = clf.feature_importances_[start_col:end_col]
            result.append({
                "feature set": f,
                "max_importance": importances.max(),
                "mean_importance": importances.mean(),
                "std_importance": importances.std(),
            })
        return result


In [10]:
# Projecting from high dimensional data to each feature set's PC space
pca = CustomPCA(feature_boundaries).fit(X[train_indices])
X_train_transform = pca.transform(X_train)
X_val_transform = pca.transform(X_val)

In [11]:
# The previous indices does not work, we need to find the new indices for each pc set
transformed_feature_boundaries = {}
start = 0
for k in pca.pcas:
    end = start + pca.pcas[k]["pca"].n_components_
    transformed_feature_boundaries[k] = (start, end)
    start = end
transformed_feature_boundaries

{'f1': (0, 27), 'f2': (27, 63), 'f3': (63, 87), 'f4': (87, 94), 'f5': (94, 97)}

#### Gender Importances

In [12]:
gender_except_info = []

clf = RandomForestClassifier(n_estimators=100, )

for f in features:
    # all sets except one
    indices = [i for k, (start, end) in transformed_feature_boundaries.items() for i in range(start, end) if k != f]
    curr_X_train = X_train_transform[..., indices]
    curr_X_val = X_val_transform[..., indices]
    clf.fit(curr_X_train, genders[train_indices])
    preds = clf.predict(curr_X_val)
    target = genders[val_indices]
    gender_except_info.append(
        {
            "except_feature": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )

gender_only_info = []
for f in features:
    # all sets except one
    start, end = transformed_feature_boundaries[f]
    curr_X_train = X_train_transform[:, start:end]
    curr_X_val = X_val_transform[:, start:end]
    clf.fit(curr_X_train, genders[train_indices])
    preds = clf.predict(curr_X_val)
    target = genders[val_indices]
    gender_only_info.append(
        {
            "feature_set": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )



clf.fit(X_train_transform, genders[train_indices])
gender_set_importances = FeatureImportanceScorer(transformed_feature_boundaries)(clf)


print("####\n#### Gender ####\n####")

print(f"Error rate for leave one feature out:\n\n{pd.DataFrame(gender_except_info)}\n\n")
print(f"Error rate for training with only one feature set:\n\n{pd.DataFrame(gender_only_info)}\n\n")

print(f"Collective Feature Importances:\n\n{pd.DataFrame(gender_set_importances)}\n\n")

####
#### Gender ####
####
Error rate for leave one feature out:

  except_feature  accuracy  bayes_mse
0             f1  0.937255   0.435095
1             f2  0.960784   0.447691
2             f3  0.960784   0.448259
3             f4  0.960784   0.449880
4             f5  0.964706   0.447316


Error rate for training with only one feature set:

  feature_set  accuracy  bayes_mse
0          f1  0.937255   0.449726
1          f2  0.933333   0.434458
2          f3  0.937255   0.432331
3          f4  0.733333   0.379653
4          f5  0.713725   0.386318


Collective Feature Importances:

  feature set  max_importance  mean_importance  std_importance
0          f1        0.218537         0.013791        0.040239
1          f2        0.088101         0.009106        0.017832
2          f3        0.083861         0.010351        0.016315
3          f4        0.003989         0.003258        0.000557
4          f5        0.015157         0.009543        0.004013




#### Status Importances

In [13]:
status_except_info = []

clf = RandomForestClassifier(n_estimators=100, )

for f in features:
    # all sets except one
    indices = [i for k, (start, end) in transformed_feature_boundaries.items() for i in range(start, end) if k != f]
    curr_X_train = X_train_transform[..., indices]
    curr_X_val = X_val_transform[..., indices]
    clf.fit(curr_X_train, status[train_indices])
    preds = clf.predict(curr_X_val)
    target = status[val_indices]
    status_except_info.append(
        {
            "except_feature": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )

status_only_info = []
for f in features:
    # all sets except one
    start, end = transformed_feature_boundaries[f]
    curr_X_train = X_train_transform[..., start:end]
    curr_X_val = X_val_transform[..., start:end]
    clf.fit(curr_X_train, status[train_indices])
    preds = clf.predict(curr_X_val)
    target = status[val_indices]
    status_only_info.append(
        {
            "feature_set": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )



clf.fit(X_train_transform, status[train_indices])
status_set_importances = FeatureImportanceScorer(transformed_feature_boundaries)(clf)


print("####\n#### Status ####\n####")

print(f"Error rate for leave one feature out:\n\n{pd.DataFrame(status_except_info)}\n\n")
print(f"Error rate for training with only one feature set:\n\n{pd.DataFrame(status_only_info)}\n\n")

print(f"Collective Feature Importances:\n\n{pd.DataFrame(status_set_importances)}\n\n")

####
#### Status ####
####
Error rate for leave one feature out:

  except_feature  accuracy  bayes_mse
0             f1  0.349020   0.541014
1             f2  0.360784   0.540963
2             f3  0.364706   0.541952
3             f4  0.388235   0.542903
4             f5  0.411765   0.542734


Error rate for training with only one feature set:

  feature_set  accuracy  bayes_mse
0          f1  0.376471   0.538241
1          f2  0.380392   0.541711
2          f3  0.352941   0.541800
3          f4  0.349020   0.537754
4          f5  0.360784   0.535122


Collective Feature Importances:

  feature set  max_importance  mean_importance  std_importance
0          f1        0.012665         0.010508        0.000883
1          f2        0.013328         0.010158        0.000941
2          f3        0.012663         0.010408        0.001185
3          f4        0.011813         0.010341        0.000753
4          f5        0.009822         0.009477        0.000248




In [14]:
regressor = RandomForestRegressor(n_estimators=100)

age = df["Age"].to_numpy()

age_except_info = []


for f in features:
    # all sets except one
    indices = [i for k, (start, end) in transformed_feature_boundaries.items() for i in range(start, end) if k != f]
    curr_X_train = X_train_transform[..., indices]
    curr_X_val = X_val_transform[..., indices]
    regressor.fit(curr_X_train, age[train_indices])
    preds = regressor.predict(curr_X_val)
    target = age[val_indices]
    age_except_info.append(
        {
            "feature": f,
            "mae": mean_squared_error(target, preds),
            "mse": mean_absolute_error(target, preds),
        }
    )

age_only_info = []
for f in features:
    # all sets except one
    start, end = transformed_feature_boundaries[f]
    curr_X_train = X_train_transform[..., start:end]
    curr_X_val = X_val_transform[..., start:end]
    regressor.fit(curr_X_train, age[train_indices])
    preds = regressor.predict(curr_X_val)
    target = age[val_indices]
    age_only_info.append(
        {
            "feature": f,
            "mae": mean_squared_error(target, preds),
            "mse": mean_absolute_error(target, preds),
        }
    )



regressor.fit(X_train_transform, age[train_indices])
age_set_importances = FeatureImportanceScorer(transformed_feature_boundaries)(clf)


print("####\n#### Age ####\n####")

print(f"Error rate for leave one feature out:\n\n{pd.DataFrame(age_except_info)}\n\n")
print(f"Error rate for training with only one feature set:\n\n{pd.DataFrame(age_only_info)}\n\n")

print(f"Collective Feature Importances:\n\n{pd.DataFrame(age_set_importances)}\n\n")

####
#### Age ####
####
Error rate for leave one feature out:

  feature        mae       mse
0      f1  73.069441  5.519351
1      f2  40.609770  4.075303
2      f3  41.004975  4.067718
3      f4  39.828087  4.219155
4      f5  43.336268  4.386042


Error rate for training with only one feature set:

  feature         mae        mse
0      f1   65.386517   4.790350
1      f2   72.218903   5.580814
2      f3   73.691390   5.750493
3      f4  176.939425  10.116381
4      f5  201.158481  10.566140


Collective Feature Importances:

  feature set  max_importance  mean_importance  std_importance
0          f1        0.012665         0.010508        0.000883
1          f2        0.013328         0.010158        0.000941
2          f3        0.012663         0.010408        0.001185
3          f4        0.011813         0.010341        0.000753
4          f5        0.009822         0.009477        0.000248


