In [1]:
import os
import json
import pandas as pd
import numpy as np

import ast

import pickle 
import matplotlib.pyplot as plt 

from sklearn.feature_selection import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


from sklearn.base import TransformerMixin
from sklearn.metrics import *

In [2]:
X = np.loadtxt("data/processed/filled.np.gz")

In [3]:
with open("data/processed/processed_data.json") as jfile:
    df = json.load(jfile)["df"]
    df = pd.DataFrame(df)

print(df)

      Age Gender Status                                           features
0      28      F      H  [29.99383078996264, -45.86057111460573, -2.002...
1      24      M      S  [19.732400698167893, 4.998593506385246, 4.1476...
2      29      M      N  [-11.592710202177722, -15.346528626992365, -1....
3      28      M      H  [23.354775505152173, -22.054154733699782, 5.18...
4      25      M      N  [23.50105405966495, -41.22614300331204, -21.03...
...   ...    ...    ...                                                ...
2600   21      F      N  [-35.85415092051014, -22.576961021566664, -18....
2601   24      F      H  [1.1883946041473443, -12.860573937604215, -8.1...
2602   12      F      H  [14.337451745943092, 6.2856630123797, -16.1075...
2603   25      M      H  [-23.829385914151345, 11.042148119541661, -10....
2604    6      M      H  [10.750766504165492, 70.0228469515277, -27.622...

[2545 rows x 4 columns]


In [4]:
with open("data/processed/indices.json") as jfile:
    indices = json.load(jfile)


# We created a set of indices for each subset, and it's important to keep 
# Test indices seperate from our hyperparameter tunings,
# Only validation is tested for feature selection and etc

train_indices = np.array(indices["train_indices"])
val_indices = np.array(indices["val_indices"])
test_indices = np.array(indices["test_indices"])

assert len(np.intersect1d(train_indices, test_indices)) == 0
assert len(np.intersect1d(train_indices, val_indices)) == 0
assert len(np.intersect1d(val_indices, test_indices)) == 0

X_train = X[train_indices]
X_val = X[val_indices]
X_test = X[test_indices]


feature_dims = np.array(indices["feature_dims"])
features = "f1 f2 f3 f4 f5".split()


# Feature Ranking

In [5]:
gender_encoder = LabelEncoder().fit(df["Gender"])
status_encoder = LabelEncoder().fit(df["Status"])

genders = gender_encoder.transform(df["Gender"])
status = status_encoder.transform(df["Status"])

gender_onehot = OneHotEncoder().fit_transform(genders.reshape(-1, 1))
status_onehot = OneHotEncoder().fit_transform(status.reshape(-1, 1))


In [6]:
# Boundaries in the X matrix, each feature set starts from a start_column and ends in a end_column
feature_boundaries = {}
start_idx = 0
for f, feature_len in zip(features, feature_dims):
    feature_boundaries[f] = (start_idx, start_idx + feature_len)
    start_idx += feature_len
feature_boundaries

{'f1': (0, 512),
 'f2': (512, 2048),
 'f3': (2048, 4352),
 'f4': (4352, 6912),
 'f5': (6912, 7116)}

## Feature Set Importance Using Random Forests


We use a random forest clf to classify the most important features for each label, starting with age.

First we calculate a "leave feature out" error rate, to compare, then we calculate direct feature importances on a PCA projected set of each original set, and show some metrics. 

First we define some custom helper classes.

In [7]:
# Defining a custom pca for cross validation

class CustomPCA(TransformerMixin):
    def __init__(self, indices, explained_var=0.8):
        super().__init__()
        self.explained_var = explained_var
        self.indices = indices
        self.pcas = {}

    def fit(self, X, y=None, **kwargs):
        for k in self.indices:
            start_col, end_col = self.indices[k]
            self.pcas[k] = Pipeline(
                [
                    ("scaler", StandardScaler()),
                    ("pca", PCA(self.explained_var))
                ]
            )
            self.pcas[k].fit_transform(X[:, start_col:end_col])
        return self
        
    def transform(self, X, **kwargs):
        X_transform = []
        for k in self.indices:
            start_col, end_col = self.indices[k]
            X_transform.append(self.pcas[k].transform(X[:, start_col:end_col]))
        return np.concatenate(X_transform, axis=-1)

In [8]:
class ProbaScorer:
    def __init__(self):
        pass

    def __call__(self, clf, X, y):
        '''
            calls predict_proba function of clf and computes
            a bayesian mse for the clf
        '''
        probs = clf.predict_proba(X)
        n_class = len(np.unique(y))
        one_hot = np.zeros((len(X), n_class))
        one_hot[:, y] = 1
        mse = ((probs - one_hot) ** 2).mean()
        return mse

In [9]:
class FeatureImportanceScorer:
    def __init__(self, indices):
        self.indices = indices
    
    def __call__(self, clf, X=None, y=None):
        '''
            reporting mean, max and std of importances of random forest
        '''
        result = []
        for f in self.indices:
            start_col, end_col = self.indices[f]
            importances = clf.feature_importances_[start_col:end_col]
            result.append({
                "feature set": f,
                "max_importance": importances.max(),
                "mean_importance": importances.mean(),
                "std_importance": importances.std(),
            })
        return result


In [10]:
# Projecting from high dimensional data to each feature set's PC space
pca = CustomPCA(feature_boundaries).fit(X[train_indices])
X_train_transform = pca.transform(X_train)
X_val_transform = pca.transform(X_val)

In [11]:
# The previous indices does not work, we need to find the new indices for each pc set
transformed_feature_boundaries = {}
start = 0
for k in pca.pcas:
    end = start + pca.pcas[k]["pca"].n_components_
    transformed_feature_boundaries[k] = (start, end)
    start = end
transformed_feature_boundaries

{'f1': (0, 27), 'f2': (27, 63), 'f3': (63, 88), 'f4': (88, 95), 'f5': (95, 98)}

#### Gender Importances

In [12]:
gender_except_info = []

clf = RandomForestClassifier(n_estimators=100, )

for f in features:
    # all sets except one
    indices = [i for k, (start, end) in transformed_feature_boundaries.items() for i in range(start, end) if k != f]
    curr_X_train = X_train_transform[..., indices]
    curr_X_val = X_val_transform[..., indices]
    clf.fit(curr_X_train, genders[train_indices])
    preds = clf.predict(curr_X_val)
    target = genders[val_indices]
    gender_except_info.append(
        {
            "except_feature": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )

gender_only_info = []
for f in features:
    # all sets except one
    start, end = transformed_feature_boundaries[f]
    curr_X_train = X_train_transform[:, start:end]
    curr_X_val = X_val_transform[:, start:end]
    clf.fit(curr_X_train, genders[train_indices])
    preds = clf.predict(curr_X_val)
    target = genders[val_indices]
    gender_only_info.append(
        {
            "feature_set": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )



clf.fit(X_train_transform, genders[train_indices])
gender_set_importances = FeatureImportanceScorer(transformed_feature_boundaries)(clf)


print("####\n#### Gender ####\n####")

print(f"Error rate for leave one feature out:\n\n{pd.DataFrame(gender_except_info)}\n\n")
print(f"Error rate for training with only one feature set:\n\n{pd.DataFrame(gender_only_info)}\n\n")

print(f"Collective Feature Importances:\n\n{pd.DataFrame(gender_set_importances)}\n\n")

####
#### Gender ####
####
Error rate for leave one feature out:

  except_feature  accuracy  bayes_mse
0             f1  0.897059   0.436301
1             f2  0.950980   0.444856
2             f3  0.955882   0.446529
3             f4  0.936275   0.448694
4             f5  0.941176   0.446873


Error rate for training with only one feature set:

  feature_set  accuracy  bayes_mse
0          f1  0.931373   0.448379
1          f2  0.906863   0.434452
2          f3  0.901961   0.436767
3          f4  0.647059   0.380532
4          f5  0.710784   0.385537


Collective Feature Importances:

  feature set  max_importance  mean_importance  std_importance
0          f1        0.213016         0.013251        0.039226
1          f2        0.091364         0.009163        0.018296
2          f3        0.090295         0.010236        0.017106
3          f4        0.005980         0.003970        0.001142
4          f5        0.017606         0.009554        0.005770




#### Status Importances

In [13]:
status_except_info = []

clf = RandomForestClassifier(n_estimators=100, )

for f in features:
    # all sets except one
    indices = [i for k, (start, end) in transformed_feature_boundaries.items() for i in range(start, end) if k != f]
    curr_X_train = X_train_transform[..., indices]
    curr_X_val = X_val_transform[..., indices]
    clf.fit(curr_X_train, status[train_indices])
    preds = clf.predict(curr_X_val)
    target = status[val_indices]
    status_except_info.append(
        {
            "except_feature": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )

status_only_info = []
for f in features:
    # all sets except one
    start, end = transformed_feature_boundaries[f]
    curr_X_train = X_train_transform[..., start:end]
    curr_X_val = X_val_transform[..., start:end]
    clf.fit(curr_X_train, status[train_indices])
    preds = clf.predict(curr_X_val)
    target = status[val_indices]
    status_only_info.append(
        {
            "feature_set": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )



clf.fit(X_train_transform, status[train_indices])
status_set_importances = FeatureImportanceScorer(transformed_feature_boundaries)(clf)


print("####\n#### Status ####\n####")

print(f"Error rate for leave one feature out:\n\n{pd.DataFrame(status_except_info)}\n\n")
print(f"Error rate for training with only one feature set:\n\n{pd.DataFrame(status_only_info)}\n\n")

print(f"Collective Feature Importances:\n\n{pd.DataFrame(status_set_importances)}\n\n")

####
#### Status ####
####
Error rate for leave one feature out:

  except_feature  accuracy  bayes_mse
0             f1  0.382353   0.540415
1             f2  0.387255   0.539167
2             f3  0.387255   0.539987
3             f4  0.352941   0.541879
4             f5  0.352941   0.541067


Error rate for training with only one feature set:

  feature_set  accuracy  bayes_mse
0          f1  0.343137   0.537074
1          f2  0.426471   0.541638
2          f3  0.348039   0.541013
3          f4  0.377451   0.537888
4          f5  0.372549   0.536202


Collective Feature Importances:

  feature set  max_importance  mean_importance  std_importance
0          f1        0.012203         0.010276        0.000685
1          f2        0.014114         0.010214        0.001358
2          f3        0.012417         0.010094        0.001150
3          f4        0.012022         0.010492        0.000799
4          f5        0.010428         0.009683        0.000753




In [14]:
regressor = RandomForestRegressor(n_estimators=100)

age = df["Age"].to_numpy()

age_except_info = []


for f in features:
    # all sets except one
    indices = [i for k, (start, end) in transformed_feature_boundaries.items() for i in range(start, end) if k != f]
    curr_X_train = X_train_transform[..., indices]
    curr_X_val = X_val_transform[..., indices]
    regressor.fit(curr_X_train, age[train_indices])
    preds = regressor.predict(curr_X_val)
    target = age[val_indices]
    age_except_info.append(
        {
            "feature": f,
            "mae": mean_squared_error(target, preds),
            "mse": mean_absolute_error(target, preds),
        }
    )

age_only_info = []
for f in features:
    # all sets except one
    start, end = transformed_feature_boundaries[f]
    curr_X_train = X_train_transform[..., start:end]
    curr_X_val = X_val_transform[..., start:end]
    regressor.fit(curr_X_train, age[train_indices])
    preds = regressor.predict(curr_X_val)
    target = age[val_indices]
    age_only_info.append(
        {
            "feature": f,
            "mae": mean_squared_error(target, preds),
            "mse": mean_absolute_error(target, preds),
        }
    )



regressor.fit(X_train_transform, age[train_indices])
age_set_importances = FeatureImportanceScorer(transformed_feature_boundaries)(clf)


print("####\n#### Age ####\n####")

print(f"Error rate for leave one feature out:\n\n{pd.DataFrame(age_except_info)}\n\n")
print(f"Error rate for training with only one feature set:\n\n{pd.DataFrame(age_only_info)}\n\n")

print(f"Collective Feature Importances:\n\n{pd.DataFrame(age_set_importances)}\n\n")

####
#### Age ####
####
Error rate for leave one feature out:

  feature        mae       mse
0      f1  86.434132  6.079125
1      f2  59.117000  4.532079
2      f3  61.962560  4.780256
3      f4  61.588169  4.746810
4      f5  59.102492  4.635944


Error rate for training with only one feature set:

  feature         mae        mse
0      f1   77.084181   5.003633
1      f2   91.450984   6.100178
2      f3   92.147626   6.269327
3      f4  177.571151  10.586940
4      f5  183.283707  10.598791


Collective Feature Importances:

  feature set  max_importance  mean_importance  std_importance
0          f1        0.012203         0.010276        0.000685
1          f2        0.014114         0.010214        0.001358
2          f3        0.012417         0.010094        0.001150
3          f4        0.012022         0.010492        0.000799
4          f5        0.010428         0.009683        0.000753


