In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mutual_info_score
from sklearn.tree import DecisionTreeClassifier

####################################################################
#                          Read data                               #
####################################################################

prefix = ""

# The .data files are space-separated with no header row. The column at
# position 500 is discarded right after loading (presumably an empty column
# produced by a trailing separator on each line -- TODO confirm).
_test_x = pd.read_table(prefix + "artificial_test.data", sep=" ", header=None)
_test_x = _test_x.drop(columns=_test_x.columns[500])

# Labels: one value per row, no header.
_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)

_train_x = pd.read_table(prefix + "artificial_train.data", sep=" ", header=None)
_train_x = _train_x.drop(columns=_train_x.columns[500])

In [24]:
def get_train_and_validation_data(split: int = 400):
    """Split the module-level training frames into train and validation parts.

    The first ``split`` rows of ``_train_x`` / ``_train_y`` form the
    validation set and the remaining rows form the training set. Rows are
    taken by position and keep their original index labels, so the features
    and labels of each part stay aligned. The shape of every part is printed.

    Args:
        split: Number of leading rows to hold out for validation.

    Returns:
        Tuple ``(train_x, train_y, valid_x, valid_y)``.
    """
    # .iloc makes the positional intent explicit (independent of index labels).
    train_x, valid_x = _train_x.iloc[split:], _train_x.iloc[:split]
    train_y, valid_y = _train_y.iloc[split:], _train_y.iloc[:split]
    print("train_x.shape: ", train_x.shape)
    print("train_y.shape: ", train_y.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("valid_y.shape: ", valid_y.shape)
    return train_x, train_y, valid_x, valid_y

In [25]:
# Build the train/validation split from the loaded frames (shapes are printed).
train_x, train_y, valid_x, valid_y = get_train_and_validation_data()

train_x.shape:  (1600, 500)
train_y.shape:  (1600, 1)
valid_x.shape:  (400, 500)
valid_y.shape:  (400, 1)


In [26]:
# 1. Remove Highly Correlated Columns
def remove_highly_correlated_features(train_x, valid_x, threshold=0.95):
    """Drop features that are strongly correlated with an earlier feature.

    Absolute pairwise correlations are computed on ``train_x`` only. A column
    is dropped when its absolute correlation with any column that precedes it
    exceeds ``threshold``. The same columns are removed from ``valid_x``.

    Args:
        train_x: Training features; the correlation matrix is computed here.
        valid_x: Validation features; receives the same column drops.
        threshold: Absolute-correlation cut-off above which a column is dropped.

    Returns:
        Tuple ``(train_x, valid_x)`` without the dropped columns.
    """
    abs_corr = train_x.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and
    # the diagonal (self-correlation) is ignored; masked cells become NaN.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    pairwise = abs_corr.where(above_diagonal)
    # NaN > threshold is False, so masked cells never flag a column.
    flagged = (pairwise > threshold).any(axis=0)
    redundant = list(flagged.index[flagged])
    return train_x.drop(columns=redundant), valid_x.drop(columns=redundant)

In [27]:
# Correlations are computed on train_x; the same columns are dropped from valid_x.
train_x, valid_x = remove_highly_correlated_features(train_x, valid_x)

In [28]:
# Shape after the correlation filter (recorded run: 490 of 500 columns remain).
train_x.shape

(1600, 490)

In [29]:
# 2. Remove Low Variance Columns
def remove_low_variance_features(train_x, valid_x, threshold=(0.8 * (1 - 0.8))):
    """Keep only the features that pass a variance threshold on the training set.

    A ``VarianceThreshold`` selector is fitted on ``train_x`` alone. The
    columns it keeps are then selected *by label* from both frames, so
    ``valid_x`` always ends up with exactly the same features as ``train_x``
    even if its columns are ordered differently.

    Args:
        train_x: Training features; the selector is fitted on this frame.
        valid_x: Validation features; must contain the kept column labels.
        threshold: Variance threshold passed to ``VarianceThreshold``. The
            default is ``p * (1 - p)`` for ``p = 0.8``, i.e. the variance of
            a Bernoulli variable with that probability.

    Returns:
        Tuple ``(train_x, valid_x)`` restricted to the kept columns.
    """
    sel = VarianceThreshold(threshold=threshold)
    sel.fit(train_x)
    # Resolve the selector's positional support to train_x's column labels
    # once, and reuse those labels for both frames.
    kept_columns = train_x.columns[sel.get_support(indices=True)]
    return train_x[kept_columns], valid_x[kept_columns]

In [30]:
# Apply the variance filter with its default threshold.
train_x, valid_x = remove_low_variance_features(train_x, valid_x)

In [31]:
# Shape after the variance filter (recorded run: still 490 columns, none removed).
train_x.shape

(1600, 490)

In [35]:
# 3. Remove Random Columns (Optional)
# This step is an approximation and should be tailored to your specific needs
# Here we use a Decision Tree to estimate feature importance
def remove_random_features(
    train_x: pd.DataFrame,
    train_y: pd.DataFrame,
    valid_x: pd.DataFrame,
    importance=0.005,
):
    """Keep only features a decision tree considers important enough.

    A ``DecisionTreeClassifier`` (``random_state=0``) is fitted on the
    training data and every feature whose ``feature_importances_`` value is
    not greater than ``importance`` is treated as "random" and removed. The
    surviving columns are selected *by label* in ``valid_x`` so both frames
    end up with the same features regardless of column order.

    Args:
        train_x: Training features used to fit the tree.
        train_y: Training labels.
        valid_x: Validation features; must contain the kept column labels.
        importance: Importance cut-off; features must exceed it to be kept.
            This threshold can be adjusted based on domain knowledge.

    Returns:
        Tuple ``(train_x, valid_x)`` restricted to the kept columns.
    """
    tree: DecisionTreeClassifier = DecisionTreeClassifier(random_state=0)
    tree.fit(train_x, train_y)
    importances = tree.feature_importances_

    # Assume columns with very low importance are "random".
    keep_mask = importances > importance
    train_x = train_x.loc[:, keep_mask]
    # Select by the labels kept from train_x rather than by position.
    valid_x = valid_x.loc[:, train_x.columns]
    return train_x, valid_x

In [36]:
# Drop low-importance columns, using the default importance cut-off.
train_x, valid_x = remove_random_features(
    train_x=train_x, train_y=train_y, valid_x=valid_x
)

In [37]:
# Shape after the importance filter (recorded run: 54 columns remain).
train_x.shape

(1600, 54)

In [38]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif


def anove_filter(
    train_x: pd.DataFrame,
    train_y: pd.DataFrame,
    valid_x: pd.DataFrame,
    k: int = 50,
):
    """Select the ``k`` best features according to the ANOVA F-test.

    ``SelectKBest(f_classif, k=k)`` is fitted on the training data only. The
    selected columns are returned as DataFrames that keep their original
    column labels and row index, like the other filter functions in this
    notebook, so the result can be concatenated with the label column.

    Args:
        train_x: Training features used to fit the selector.
        train_y: Training labels (a single column).
        valid_x: Validation features; must contain the selected column labels.
        k: Number of top-scoring features to keep.

    Returns:
        Tuple ``(train_x, valid_x)`` restricted to the selected columns.
    """
    selector = SelectKBest(f_classif, k=k)
    # Pass labels as a 1-D array; a single-column frame is otherwise
    # flattened by scikit-learn with a conversion warning.
    selector.fit(train_x, np.ravel(train_y))

    # Subset by label instead of selector.transform(), which would return
    # bare numpy arrays without column names or index.
    selected_features = train_x.columns[selector.get_support()]
    return train_x.loc[:, selected_features], valid_x.loc[:, selected_features]

In [39]:
# train_x, valid_x = anove_filter(train_x=train_x, train_y=train_y, valid_x=valid_x)

In [40]:
label = "class"
# The label frames were read without a header, so their only column is named 0;
# give it a readable name in both splits.
train_y, valid_y = (
    labels.rename(columns={0: label}) for labels in (train_y, valid_y)
)

In [41]:
# Attach the label column to the selected features; concat(axis=1) aligns rows on the index.
train_data = pd.concat([train_x, train_y[label]], axis=1)

In [42]:
# sanity check: the feature filters must not have changed which rows belong to
# each split, and features must still line up row-for-row with their labels.
(
    original_train_x,
    original_train_y,
    original_valid_x,
    original_valid_y,
) = get_train_and_validation_data()
for y, original_y in zip([train_y, valid_y], [original_train_y, original_valid_y]):
    assert y.shape == original_y.shape
# pd.concat(axis=1) aligns on the index, so a mismatch between features and
# labels would silently produce NaN-padded rows instead of raising.
for x, y in zip([train_x, valid_x], [train_y, valid_y]):
    assert x.index.equals(y.index), "features and labels are not aligned"
assert train_x.index.equals(original_train_x.index)
assert valid_x.index.equals(original_valid_x.index)
assert len(train_data) == len(train_x)

train_x.shape:  (1600, 500)
train_y.shape:  (1600, 1)
valid_x.shape:  (400, 500)
valid_y.shape:  (400, 1)


In [43]:
from autogluon.tabular import TabularPredictor

save_path = "some_path"
# Fit with a 20-minute time budget (time_limit is in seconds) using the
# "best_quality" preset and the default hyperparameter configuration,
# optimising balanced accuracy on a binary problem.
predictor = TabularPredictor(
    label=label,
    path=save_path,
    eval_metric="balanced_accuracy",
    problem_type="binary",
).fit(
    train_data,
    time_limit=60 * 20,
    presets="best_quality",
    hyperparameters="default",
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 1200 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: some_path/ds_sub_fit/sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.
Spend 208 seconds for the sub-fit(s) during dynamic stacking.
Time left for full fit of AutoGluon: 992 seconds.
Starting full fit now with num_stack_levels 0.
Beginning AutoGluon training ... Time limit = 992s
AutoGluon will save models to "some_path"
AutoGl

In [44]:
# Show the per-model leaderboard of the fitted predictor.
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L1,0.84925,balanced_accuracy,0.023005,25.830919,0.023005,25.830919,1,True,7
1,WeightedEnsemble_L2,0.84925,balanced_accuracy,0.026011,26.691,0.003006,0.860081,2,True,14
2,XGBoost_BAG_L1,0.841764,balanced_accuracy,0.049008,6.374003,0.049008,6.374003,1,True,11
3,LightGBMLarge_BAG_L1,0.837537,balanced_accuracy,0.047004,10.807577,0.047004,10.807577,1,True,13
4,LightGBM_BAG_L1,0.82988,balanced_accuracy,0.037004,4.248002,0.037004,4.248002,1,True,4
5,LightGBMXT_BAG_L1,0.775309,balanced_accuracy,0.035962,4.128309,0.035962,4.128309,1,True,3
6,RandomForestGini_BAG_L1,0.754196,balanced_accuracy,0.127038,1.319147,0.127038,1.319147,1,True,5
7,KNeighborsDist_BAG_L1,0.753204,balanced_accuracy,0.031,0.007998,0.031,0.007998,1,True,2
8,KNeighborsUnif_BAG_L1,0.753204,balanced_accuracy,0.034993,0.008001,0.034993,0.008001,1,True,1
9,RandomForestEntr_BAG_L1,0.751047,balanced_accuracy,0.122036,1.20192,0.122036,1.20192,1,True,6


In [45]:
# Evaluate the fitted predictor on the held-out validation split, which was
# not part of train_data.
print(valid_x.shape)
print(valid_y.shape)
valid_data = pd.concat([valid_x, valid_y[label]], axis=1)
predictor.evaluate(valid_data)
# NOTE(review): the recorded output of this cell shows balanced_accuracy of
# about 0.836; an earlier note here quoted 0.86 for WeightedEnsemble_L2,
# presumably from a different run -- confirm before citing either number.

(400, 54)
(400, 1)


{'balanced_accuracy': 0.8362155388471177,
 'accuracy': 0.8375,
 'mcc': 0.6740293579627241,
 'roc_auc': 0.9091478696741856,
 'f1': 0.8257372654155496,
 'precision': 0.8415300546448088,
 'recall': 0.8105263157894737}