In [1]:
import os
import json
import pandas as pd
import numpy as np

import ast

import pickle 
import matplotlib.pyplot as plt 

from sklearn.feature_selection import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


from sklearn.base import TransformerMixin
from sklearn.metrics import *

In [2]:
X = np.loadtxt("data/processed/filled.np.gz")

In [3]:
with open("data/processed/processed_data.json") as jfile:
    df = json.load(jfile)["df"]
    df = pd.DataFrame(df)

print(df)

      Age Gender Status                                           features
0      28      F      H  [-22.657711933297467, 30.769295879945037, -23....
1      24      M      S  [182.05834145671884, 20.138809568394194, -6.23...
2      29      M      N  [-21.308495839973528, -12.517586239957701, -9....
3      28      M      H  [-22.320687052787786, 22.945827273719978, -22....
4      25      M      N  [-22.65158385249504, 25.13058078215149, -11.95...
...   ...    ...    ...                                                ...
2600   21      F      N  [-21.756444948496828, -33.34037201123824, -2.2...
2601   24      F      H  [-20.832185096880092, 1.4025807238407926, -12....
2602   12      F      H  [-20.441490886122775, 14.783729334551827, -7.3...
2603   25      M      H  [-20.757076001239067, -24.569769281532093, -1....
2604    6      M      H  [-20.658551689088036, 10.331795410271129, 40.6...

[2545 rows x 4 columns]


In [4]:
with open("data/processed/indices.json") as jfile:
    indices = json.load(jfile)


# We created a set of indices for each subset, and it's important to keep 
# Test indices seperate from our hyperparameter tunings,
# Only validation is tested for feature selection and etc

train_indices = np.array(indices["train_indices"])
val_indices = np.array(indices["val_indices"])
test_indices = np.array(indices["test_indices"])

X_train = X[train_indices]
X_val = X[val_indices]
X_test = X[test_indices]


feature_dims = np.array(indices["feature_dims"])
features = "f1 f2 f3 f4 f5".split()


# Feature Ranking

In [5]:
gender_encoder = LabelEncoder().fit(df["Gender"])
status_encoder = LabelEncoder().fit(df["Status"])

genders = gender_encoder.transform(df["Gender"])
status = status_encoder.transform(df["Status"])

gender_onehot = OneHotEncoder().fit_transform(genders.reshape(-1, 1))
status_onehot = OneHotEncoder().fit_transform(status.reshape(-1, 1))


In [6]:
# Boundaries in the X matrix, each feature set starts from a start_column and ends in a end_column
feature_boundaries = {}
start_idx = 0
for f, feature_len in zip(features, feature_dims):
    feature_boundaries[f] = (start_idx, start_idx + feature_len)
    start_idx += feature_len
feature_boundaries

{'f1': (0, 512),
 'f2': (512, 2048),
 'f3': (2048, 4352),
 'f4': (4352, 6912),
 'f5': (6912, 7116)}

## Feature Set Importance Using Random Forests


We use a random forest clf to classify the most important features for each label, starting with age.

First we calculate a "leave feature out" error rate, to compare, then we calculate direct feature importances on a PCA projected set of each original set, and show some metrics. 

First we define some custom helper classes.

In [7]:
# Defining a custom pca for cross validation

class CustomPCA(TransformerMixin):
    def __init__(self, indices, explained_var=0.8):
        super().__init__()
        self.explained_var = explained_var
        self.indices = indices
        self.pcas = {}

    def fit(self, X, y=None, **kwargs):
        for k in self.indices:
            start_col, end_col = self.indices[k]
            self.pcas[k] = Pipeline(
                [
                    ("scaler", StandardScaler()),
                    ("pca", PCA(self.explained_var))
                ]
            )
            self.pcas[k].fit_transform(X[:, start_col:end_col])
        return self
        
    def transform(self, X, **kwargs):
        X_transform = []
        for k in self.indices:
            start_col, end_col = self.indices[k]
            X_transform.append(self.pcas[k].transform(X[:, start_col:end_col]))
        return np.concatenate(X_transform, axis=-1)

In [8]:
class ProbaScorer:
    def __init__(self):
        pass

    def __call__(self, clf, X, y):
        '''
            calls predict_proba function of clf and computes
            a bayesian mse for the clf
        '''
        probs = clf.predict_proba(X)
        n_class = len(np.unique(y))
        one_hot = np.zeros((len(X), n_class))
        one_hot[:, y] = 1
        mse = ((probs - one_hot) ** 2).mean()
        return mse

In [9]:
class FeatureImportanceScorer:
    def __init__(self, indices):
        self.indices = indices
    
    def __call__(self, clf, X=None, y=None):
        '''
            reporting mean, max and std of importances of random forest
        '''
        result = []
        for f in self.indices:
            start_col, end_col = self.indices[f]
            importances = clf.feature_importances_[start_col:end_col]
            result.append({
                "feature set": f,
                "max_importance": importances.max(),
                "mean_importance": importances.mean(),
                "std_importance": importances.std(),
            })
        return result


In [10]:
# Projecting from high dimensional data to each feature set's PC space
pca = CustomPCA(feature_boundaries).fit(X[train_indices])
X_train_transform = pca.transform(X_train)
X_val_transform = pca.transform(X_val)

In [11]:
# The previous indices does not work, we need to find the new indices for each pc set
transformed_feature_boundaries = {}
start = 0
for k in pca.pcas:
    end = start + pca.pcas[k]["pca"].n_components_
    transformed_feature_boundaries[k] = (start, end)
    start = end
transformed_feature_boundaries

{'f1': (0, 27), 'f2': (27, 63), 'f3': (63, 88), 'f4': (88, 95), 'f5': (95, 98)}

#### Gender Importances

In [16]:
gender_except_info = []

clf = RandomForestClassifier(n_estimators=100, )

for f in features:
    # all sets except one
    indices = [i for k, (start, end) in transformed_feature_boundaries.items() for i in range(start, end) if k != f]
    curr_X_train = X_train_transform[..., indices]
    curr_X_val = X_val_transform[..., indices]
    clf.fit(curr_X_train, genders[train_indices])
    preds = clf.predict(curr_X_val)
    target = genders[val_indices]
    gender_except_info.append(
        {
            "except_feature": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )

gender_only_info = []
for f in features:
    # all sets except one
    start, end = transformed_feature_boundaries[f]
    curr_X_train = X_train_transform[:, start:end]
    curr_X_val = X_val_transform[:, start:end]
    clf.fit(curr_X_train, genders[train_indices])
    preds = clf.predict(curr_X_val)
    target = genders[val_indices]
    gender_only_info.append(
        {
            "feature_set": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )



clf.fit(X_train_transform, genders[train_indices])
gender_set_importances = FeatureImportanceScorer(transformed_feature_boundaries)(clf)


print("####\n#### Gender ####\n####")

print(f"Error rate for leave one feature out:\n\n{pd.DataFrame(gender_except_info)}\n\n")
print(f"Error rate for training with only one feature set:\n\n{pd.DataFrame(gender_only_info)}\n\n")

print(f"Collective Feature Importances:\n\n{pd.DataFrame(gender_set_importances)}\n\n")

####
#### Gender ####
####
Error rate for leave one feature out:

  except_feature  accuracy  bayes_mse
0             f1  0.941176   0.434471
1             f2  0.976471   0.444887
2             f3  0.972549   0.447153
3             f4  0.980392   0.448348
4             f5  0.972549   0.445493


Error rate for training with only one feature set:

  feature_set  accuracy  bayes_mse
0          f1  0.929412   0.449166
1          f2  0.933333   0.432584
2          f3  0.929412   0.432437
3          f4  0.686275   0.379724
4          f5  0.745098   0.386273


Collective Feature Importances:

  feature set  max_importance  mean_importance  std_importance
0          f1        0.188727         0.013064        0.034564
1          f2        0.101787         0.009525        0.018949
2          f3        0.076356         0.009877        0.015386
3          f4        0.004589         0.003548        0.000770
4          f5        0.017329         0.010873        0.004905




#### Status Importances

In [17]:
status_except_info = []

clf = RandomForestClassifier(n_estimators=100, )

for f in features:
    # all sets except one
    indices = [i for k, (start, end) in transformed_feature_boundaries.items() for i in range(start, end) if k != f]
    curr_X_train = X_train_transform[..., indices]
    curr_X_val = X_val_transform[..., indices]
    clf.fit(curr_X_train, status[train_indices])
    preds = clf.predict(curr_X_val)
    target = status[val_indices]
    status_except_info.append(
        {
            "except_feature": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )

status_only_info = []
for f in features:
    # all sets except one
    start, end = transformed_feature_boundaries[f]
    curr_X_train = X_train_transform[..., start:end]
    curr_X_val = X_val_transform[..., start:end]
    clf.fit(curr_X_train, status[train_indices])
    preds = clf.predict(curr_X_val)
    target = status[val_indices]
    status_only_info.append(
        {
            "feature_set": f,
            "accuracy": accuracy_score(preds, target),
            "bayes_mse": ProbaScorer()(clf, curr_X_train, target),
        }
    )



clf.fit(X_train_transform, status[train_indices])
status_set_importances = FeatureImportanceScorer(transformed_feature_boundaries)(clf)


print("####\n#### Status ####\n####")

print(f"Error rate for leave one feature out:\n\n{pd.DataFrame(status_except_info)}\n\n")
print(f"Error rate for training with only one feature set:\n\n{pd.DataFrame(status_only_info)}\n\n")

print(f"Collective Feature Importances:\n\n{pd.DataFrame(status_set_importances)}\n\n")

####
#### Status ####
####
Error rate for leave one feature out:

  except_feature  accuracy  bayes_mse
0             f1  0.376471   0.541008
1             f2  0.376471   0.541616
2             f3  0.352941   0.541872
3             f4  0.333333   0.542971
4             f5  0.360784   0.543356


Error rate for training with only one feature set:

  feature_set  accuracy  bayes_mse
0          f1  0.278431   0.538276
1          f2  0.368627   0.542200
2          f3  0.423529   0.541988
3          f4  0.313725   0.538201
4          f5  0.333333   0.535299


Collective Feature Importances:

  feature set  max_importance  mean_importance  std_importance
0          f1        0.012077         0.010459        0.000688
1          f2        0.012180         0.010081        0.000936
2          f3        0.013172         0.009986        0.001380
3          f4        0.012031         0.010652        0.000786
4          f5        0.010715         0.010160        0.000623




In [19]:
regressor = RandomForestRegressor(n_estimators=100)

age = df["Age"].to_numpy()

age_except_info = []


for f in features:
    # all sets except one
    indices = [i for k, (start, end) in transformed_feature_boundaries.items() for i in range(start, end) if k != f]
    curr_X_train = X_train_transform[..., indices]
    curr_X_val = X_val_transform[..., indices]
    regressor.fit(curr_X_train, age[train_indices])
    preds = regressor.predict(curr_X_val)
    target = age[val_indices]
    age_except_info.append(
        {
            "feature": f,
            "mae": mean_squared_error(target, preds),
            "mse": mean_absolute_error(target, preds),
        }
    )

age_only_info = []
for f in features:
    # all sets except one
    start, end = transformed_feature_boundaries[f]
    curr_X_train = X_train_transform[..., start:end]
    curr_X_val = X_val_transform[..., start:end]
    regressor.fit(curr_X_train, age[train_indices])
    preds = regressor.predict(curr_X_val)
    target = age[val_indices]
    age_only_info.append(
        {
            "feature": f,
            "mae": mean_squared_error(target, preds),
            "mse": mean_absolute_error(target, preds),
        }
    )



regressor.fit(X_train_transform, age[train_indices])
age_set_importances = FeatureImportanceScorer(transformed_feature_boundaries)(clf)


print("####\n#### Age ####\n####")

print(f"Error rate for leave one feature out:\n\n{pd.DataFrame(age_except_info)}\n\n")
print(f"Error rate for training with only one feature set:\n\n{pd.DataFrame(age_only_info)}\n\n")

print(f"Collective Feature Importances:\n\n{pd.DataFrame(age_set_importances)}\n\n")

####
#### Age ####
####
Error rate for leave one feature out:

  feature        mae       mse
0      f1  51.890446  4.474479
1      f2  40.151473  3.567261
2      f3  40.394874  3.667981
3      f4  36.454900  3.554640
4      f5  38.545130  3.681453


Error rate for training with only one feature set:

  feature         mae        mse
0      f1   74.863858   4.935777
1      f2   56.130875   4.701605
2      f3   55.557789   4.796693
3      f4  174.091490  10.092094
4      f5  191.070681  10.631193


Collective Feature Importances:

  feature set  max_importance  mean_importance  std_importance
0          f1        0.012077         0.010459        0.000688
1          f2        0.012180         0.010081        0.000936
2          f3        0.013172         0.009986        0.001380
3          f4        0.012031         0.010652        0.000786
4          f5        0.010715         0.010160        0.000623


