In [31]:
import numpy as np
import pandas as pd
import os

from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from src.feature_selector import BaseFeatureSelector
from src.settings import DATA_DIR
from src.experiment_utils import perform_experiments, find_best_experiments
from src.experiment import Experiment
from src.custom_feature_selectors.boruta import Boruta
from src.custom_feature_selectors.after_boruta_selectors import PermutationImportance, Impurity
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from skfeature.function.similarity_based.fisher_score import fisher_score

In [2]:
# Load the training matrix, the training labels and the test matrix from DATA_DIR as NumPy arrays.
def _read_array(file_name, **read_csv_kwargs):
    """Read a header-less text file located in DATA_DIR and return it as a NumPy array."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name), header=None, **read_csv_kwargs).to_numpy()


X = _read_array('x_train.txt', sep=' ')
# The label file is parsed as a table; only its first column is kept, as a flat vector.
y = _read_array('y_train.txt')[:, 0]
X_test = _read_array('x_test.txt', sep=' ')

In [3]:
from sklearn.model_selection import train_test_split
from src.train import calculate_score
from boruta import BorutaPy


# Fixed seed: later cells hard-code positional indices into the Boruta output,
# so the selection has to be reproducible across kernel restarts.
BORUTA_SEED = 42
# A column is dropped when its absolute correlation with an earlier column exceeds this value.
CORRELATION_THRESHOLD = 0.9

df = pd.DataFrame(X)
corr_matrix = df.corr().abs()
# Keep only the strict upper triangle so every pair of columns is inspected once
# and a column is never compared with itself.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if (upper[column] > CORRELATION_THRESHOLD).any()]
# Rebind instead of mutating in place; `df` keeps the original column labels of
# the surviving features.
df = df.drop(columns=to_drop)

X_reduced = df.to_numpy()

model = RandomForestClassifier(
    n_jobs=-1,
    class_weight='balanced',
    max_depth=5,
    random_state=BORUTA_SEED,
)

feat_selector = BorutaPy(
    verbose=2,
    estimator=model,
    n_estimators='auto',
    max_iter=10,
    random_state=BORUTA_SEED,
)

feat_selector.fit(X_reduced, y)

Iteration: 	1 / 10
Confirmed: 	0
Tentative: 	496
Rejected: 	0
Iteration: 	2 / 10
Confirmed: 	0
Tentative: 	496
Rejected: 	0
Iteration: 	3 / 10
Confirmed: 	0
Tentative: 	496
Rejected: 	0
Iteration: 	4 / 10
Confirmed: 	0
Tentative: 	496
Rejected: 	0
Iteration: 	5 / 10
Confirmed: 	0
Tentative: 	496
Rejected: 	0
Iteration: 	6 / 10
Confirmed: 	0
Tentative: 	496
Rejected: 	0
Iteration: 	7 / 10
Confirmed: 	0
Tentative: 	496
Rejected: 	0
Iteration: 	8 / 10
Confirmed: 	0
Tentative: 	15
Rejected: 	481
Iteration: 	9 / 10
Confirmed: 	12
Tentative: 	3
Rejected: 	481


BorutaPy finished running.

Iteration: 	10 / 10
Confirmed: 	12
Tentative: 	0
Rejected: 	481


In [4]:
# Apply the fitted Boruta mask. The input is taken from `df` (the
# correlation-filtered frame Boruta was fitted on) instead of from `X_reduced`
# itself, so re-running this cell does not feed the already-reduced matrix back
# into the selector.
X_reduced = feat_selector.transform(df.to_numpy())

# `df` was built with default integer column labels and only had columns
# dropped, so its labels are the column positions in the original X. Reading
# them through the Boruta support mask gives the exact mapping, without
# comparing column contents (which is slow and ambiguous when X contains
# identical columns).
chosen_columns = df.columns[feat_selector.support_].tolist()

In [6]:
# Show which column indices of the original X are present in X_reduced.
chosen_columns

[0, 1, 2, 3, 4, 5, 100, 101, 102, 103, 104, 105]

In [None]:
def discretize_dataset(X, bins=10):
    """Return a copy of ``X`` in which every column is replaced by its bin codes.

    Each column is cut independently with ``pd.cut(..., labels=False)``, so the
    values written back are the integer positions of the bins. The result keeps
    the dtype of ``X`` because the codes are written into a copy of it.

    Args:
        X: 2-D array; columns are binned one at a time.
        bins: Forwarded to ``pd.cut`` as its ``bins`` argument.

    Returns:
        A new array of the same shape as ``X``; ``X`` itself is not modified.
    """
    binned = np.copy(X)
    for col_idx in range(X.shape[1]):
        column_codes = pd.cut(X[:, col_idx], bins=bins, labels=False)
        binned[:, col_idx] = column_codes

    return binned

In [9]:
class ChiSquared(BaseFeatureSelector):
    """Feature selector that keeps the ``n_feats`` best features by chi-squared score.

    The input is first converted to bin codes with ``discretize_dataset`` and
    then scored with ``SelectKBest(chi2, ...)``.
    """

    def __init__(self, n_feats=3) -> None:
        """Store the number of features to keep; nothing is selected until ``fit``."""
        super().__init__()
        self.n_feats = n_feats
        self.chi_chosen = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Score the discretized features against ``y`` and remember the selected columns.

        Args:
            X: 2-D feature matrix.
            y: Target labels, one per row of ``X``.
        """
        X_cat = discretize_dataset(X)
        # Honour the configured feature count instead of a fixed k.
        chi2_features = SelectKBest(chi2, k=self.n_feats)
        chi2_features.fit(X_cat, y)

        # Ask the selector which columns it kept rather than searching for
        # matching column contents: discretized columns can be identical, which
        # makes content matching ambiguous.
        self.chi_chosen = chi2_features.get_support(indices=True).tolist()

    def get_support(self, indices: bool = True) -> np.ndarray:
        """Return the selected column indices in ascending order.

        ``indices`` is accepted for interface compatibility and is not
        consulted: integer indices are always returned, as a list of ints.
        Returns ``None`` when ``fit`` has not been called yet.
        """
        return self.chi_chosen

In [94]:
class Fisher(BaseFeatureSelector):
    """Feature selector that keeps the ``n_feats`` top-ranked features by Fisher score."""

    def __init__(self, n_feats=3) -> None:
        """Store the number of features to keep; nothing is ranked until ``fit``."""
        super().__init__()
        self.n_feats = n_feats
        # Snapshot of the selected indices taken at fit time.
        self.fisher_chosen = None
        # Full ranking of feature indices; created here so the unfitted state
        # is explicit instead of a missing attribute.
        self.fisher_results = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Rank all features of ``X`` by descending ``fisher_score`` value.

        Args:
            X: 2-D feature matrix.
            y: Target labels, one per row of ``X``.
        """
        # NOTE(review): this assumes fisher_score returns one score per feature
        # (higher = better). Some skfeature releases return a ranking by
        # default - confirm against the installed version.
        self.fisher_results = np.argsort(-fisher_score(X, y))
        self.fisher_chosen = self.fisher_results[:self.n_feats]

    def get_support(self, indices: bool = True) -> np.ndarray:
        """Return the indices of the first ``n_feats`` features of the ranking.

        ``indices`` is accepted for interface compatibility and is not
        consulted. The slice uses the current value of ``n_feats``.

        Raises:
            NotFittedError: if ``fit`` has not been called. It subclasses
                ``AttributeError``, which is what the unfitted call raised before.
        """
        if self.fisher_results is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError("Fisher selector is not fitted yet; call fit(X, y) first.")
        return self.fisher_results[:self.n_feats]

In [39]:
from sklearn.neural_network import MLPClassifier

In [42]:
# List of experiments handed to perform_experiments. Each Experiment names a
# classifier class with its keyword arguments and, optionally, a feature
# selector class with its keyword arguments.
# Only the MLPClassifier entry (run without a feature selector) is active; all
# other entries are commented out.
experiment_config = [
    # Experiment(
    #     classifier=RandomForestClassifier,
    #     classifier_config={
    #         "max_depth": 3
    #     },
    #     feature_selector=SelectFromModel,
    #     feature_selector_config={
    #         "estimator": LogisticRegression(max_iter=1000)
    #     }
    # ),
    Experiment(
        classifier=MLPClassifier,
        classifier_config={
            "learning_rate_init": 0.01,
        },
        feature_selector=None,
        # feature_selector_config={
        #     "n_feats": 10,
        #     "max_depth": 5
        # }
    ),
    # Experiment(
    #     classifier=RandomForestClassifier,
    #     classifier_config={
    #         "max_depth": 5,
    #         "n_estimators": 100
    #     },
    #     feature_selector=PermutationImportance,
    #     feature_selector_config={
    #         "n_feats": 2,
    #     }
    # ),
    # Experiment(
    #     classifier=QDA,
    #     feature_selector=Boruta,
    #     feature_selector_config={
    #         "additional_feat_selector": SequentialFeatureSelector(
    #             estimator=RandomForestClassifier(max_depth=5),
    #             n_features_to_select="auto",
    #             tol= 0.001,
    #             direction="forward"
    #         ),
    #         "model_n_estimators": 100,
    #         "model_max_depth": 5,
    #         "boruta_n_estimators": "auto",
    #         "boruta_max_iter": 10,
    #     }
    # ),
]

In [7]:
# Original-X column indices of the entries at positions 6, 8 and 9 of
# chosen_columns; the same three positions are used to slice X_reduced for
# perform_experiments.
# NOTE(review): presumably the best subset found when running without a
# feature selector - confirm where [6, 8, 9] comes from.
np.array(chosen_columns)[[6, 8, 9]]

array([100, 102, 103])

In [43]:
# Run every experiment in experiment_config on columns 6, 8 and 9 of X_reduced only.
# NOTE(review): the meaning of the two returned values is defined in
# src.experiment_utils and is not visible here; the names suggest per-experiment
# scores and selected feature indices - confirm.
scores, indices = perform_experiments(X_reduced[:, [6, 8, 9]], y, experiment_config)

Experiment exp_mlpc_non_4d22d2 in progress...
Using 3 features, we properly classified 146/200 clients.
Using 3 features, we properly classified 146/200 clients.
Using 3 features, we properly classified 147/200 clients.
Using 3 features, we properly classified 151/200 clients.
Using 3 features, we properly classified 149/200 clients.


In [18]:
# NOTE(review): the saved output of this cell is keyed by 'exp_gnb_pi_13df8e',
# while the perform_experiments run logged 'exp_mlpc_non_4d22d2'; the output
# appears to come from an earlier execution - re-run to refresh it.
indices

{'exp_gnb_pi_13df8e': [array([4, 5, 1], dtype=int64),
  array([0, 5, 1], dtype=int64),
  array([0, 1, 4], dtype=int64),
  array([5, 4, 3], dtype=int64),
  array([0, 5, 1], dtype=int64)]}

In [237]:
# Rebuild the classifier and feature selector of a previously recorded experiment.
# NOTE(review): index 1 is the second element of the returned collection -
# confirm the ordering of find_best_experiments and that this is the intended entry.
best_experiments = find_best_experiments()
best_experiment = best_experiments[1]

# Instantiate the classifier from the stored class and keyword arguments.
model = best_experiment.classifier(**best_experiment.classifier_config)
# NOTE(review): this call assumes the experiment has a feature selector class;
# an experiment configured with feature_selector=None cannot be rebuilt this way.
feature_selector = best_experiment.feature_selector(
    **best_experiment.feature_selector_config
)

# feature_selector.fit(X, y)
# X_train_reduced = feature_selector.transform(X)
# model.fit(X_train_reduced, y)

In [25]:
# Fit the rebuilt feature selector on X_reduced and keep the transformed matrix.
# NOTE(review): X_final is not read by any later cell visible here.
feature_selector.fit(X_reduced, y)
X_final = feature_selector.transform(X_reduced)

In [252]:
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Hold out 20% of the training rows for validation, using columns 6 onward of
# X_reduced. The hold-out features are named X_val so that the X_test matrix
# loaded from x_test.txt is not overwritten; y_test keeps its name because a
# later cell reads it. The split is seeded so the reported numbers are
# reproducible.
X_train, X_val, y_train, y_test = train_test_split(
    X_reduced[:, 6:], y, test_size=0.2, random_state=42
)
# model = xgb.XGBClassifier(tree_method="hist")
model.fit(X_train, y_train)
# Class probabilities are kept for the ranking computed in the next cell.
preds_proba = model.predict_proba(X_val)
preds = model.predict(X_val)
print(accuracy_score(y_test, preds))
print(calculate_score(model, X_train, X_val, y_train, y_test))

0.699
Using 6 features, we properly classified 162/200 clients.
6900.0


In [135]:
# Sum of the hold-out labels over the 200 rows with the highest predicted
# probability in column 1 of preds_proba.
# NOTE(review): this equals the number of positives among those rows only if
# the labels are 0/1 - confirm.
np.sum(y_test[np.argsort(-preds_proba[:, 1])[:200]])

158

In [None]:
# Rank the test rows by predicted probability (column 1 of predict_proba) and
# show the row positions of the 1000 highest, in ascending order of probability.
# NOTE(review): confirm that X_test here is the matrix loaded from x_test.txt
# and that it has gone through the same column reduction as the data
# feature_selector and model were fitted on; this cell has no execution count.
X_test_reduced = feature_selector.transform(X_test)
proba_preds = model.predict_proba(X_test_reduced)
np.argsort(proba_preds[:, 1])[-1000:]