In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mutual_info_score
from sklearn.tree import DecisionTreeClassifier

####################################################################
#                          Read data                               #
####################################################################

# Path prefix prepended to every data file name (empty string = relative to
# the current working directory).
prefix = ""

# The feature files are read as space-separated tables without a header row.
# The column at position 500 is removed right after loading
# (presumably an artefact of a trailing separator on each line — TODO confirm).
_test_x = pd.read_table(prefix + "artificial_test.data", sep=" ", header=None)
_test_x = _test_x.drop(columns=_test_x.columns[500])

# Labels are read with the default separator and no header row.
_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)

_train_x = pd.read_table(prefix + "artificial_train.data", sep=" ", header=None)
_train_x = _train_x.drop(columns=_train_x.columns[500])

In [2]:
def get_train_data():
    """Return independent copies of the loaded training features and labels.

    Returns:
        tuple: ``(features, labels)`` — copies of the module-level
        ``_train_x`` and ``_train_y``, so callers can modify them without
        altering the originals.
    """
    features = _train_x.copy()
    labels = _train_y.copy()
    return features, labels

In [3]:
# Work on copies so the feature-selection steps below never modify the
# originally loaded _train_x / _train_y.
train_x, train_y = get_train_data()

In [4]:
# 1. Remove Highly Correlated Columns
# Absolute pairwise correlations between all feature columns.
corr_matrix = train_x.corr().abs()

# Keep only the entries strictly above the diagonal (k=1) so every pair of
# columns is inspected once; all other entries become NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is dropped when its correlation with any earlier column exceeds 0.95.
to_drop = upper.columns[(upper > 0.95).any(axis=0).to_numpy()].tolist()
train_x = train_x.drop(columns=to_drop)

In [5]:
# Shape after dropping the highly correlated columns.
train_x.shape

(2000, 490)

In [6]:
# 2. Remove Low Variance Columns
# The cutoff 0.8 * (1 - 0.8) is only an example value; adjust it to the data.
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))
sel.fit(train_x)
# NOTE: later cells index the result NumPy-style (train_x[:, ...]), i.e. the
# transformed data is treated as a plain array without column labels.
train_x = sel.transform(train_x)

In [7]:
# Shape after the variance filter (compare with the shape before it to see
# whether any column was removed).
train_x.shape

(2000, 490)

In [8]:
# 3. Remove Random Columns (Optional)
# Heuristic step that should be tailored to the problem: a single decision
# tree is fitted on the current features and its feature_importances_ are
# used as a rough relevance score.
tree = DecisionTreeClassifier(random_state=0)
importances = tree.fit(train_x, train_y).feature_importances_

# Columns whose importance does not exceed 0.01 are assumed to be "random";
# only the positions of the remaining columns are kept. The cutoff can be
# adjusted based on domain knowledge.
important_indices = np.flatnonzero(importances > 0.01).tolist()
train_x = train_x[:, important_indices]

In [9]:
# Shape after keeping only the columns the decision tree rated as important.
train_x.shape

(2000, 12)

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

# Using ANOVA F-test to select features
# k is capped at the number of available columns: earlier steps may leave
# fewer than 50 features, and asking SelectKBest for more features than exist
# is rejected (or warned about, depending on the scikit-learn version).
selector = SelectKBest(f_classif, k=min(50, np.shape(train_x)[1]))  # Change 50 to select the number of features you want
# np.ravel turns the single-column label frame into the 1-D target that
# scikit-learn expects.
selector.fit(train_x, np.ravel(train_y))

# Get F-values and p-values for each feature
f_values = selector.scores_
p_values = selector.pvalues_

# Selecting features (you can use a threshold or select top k features)
# train_x can be a plain NumPy array at this point (it has no .columns), so
# the column labels are taken from a DataFrame view of it; this works for
# both arrays and DataFrames.
selected_features = pd.DataFrame(train_x).columns[selector.get_support()]

# Transforming train_x to include only the selected features
train_x_selected = selector.transform(train_x)

In [None]:
# Continue the pipeline with the reduced feature set.
train_x = train_x_selected
# The shape is the final expression of the cell so the notebook displays it
# (an expression in the middle of a cell produces no output).
train_x.shape

In [10]:
# Name of the target column; the same name is passed to the predictor below.
label = "class"

# Wrap the (possibly array-valued) features in a DataFrame and give the single
# label column its name so both can be combined into one table.
train_x = pd.DataFrame(train_x)
train_y = train_y.set_axis([label], axis=1)

In [11]:
# Put features and the target column side by side in one training table
# (rows are aligned on the index).
train_data = pd.concat(
    [train_x, train_y[label]],
    axis="columns",
)

In [12]:
from autogluon.tabular import TabularPredictor

# Directory in which AutoGluon stores the trained models.
save_path = "some_path"

# Configure the predictor first, then train it on the combined table; models
# are evaluated with balanced accuracy.
predictor = TabularPredictor(
    label=label,
    path=save_path,
    eval_metric="balanced_accuracy",
)
predictor = predictor.fit(train_data)

  from .autonotebook import tqdm as notebook_tqdm
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "some_path"
AutoGluon Version:  1.0.0
Python Version:     3.8.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
CPU Count:          8
Memory Avail:       6.02 GB / 15.71 GB (38.3%)
Disk Space Avail:   86.47 GB / 357.30 GB (24.2%)
Train Data Rows:    2000
Train D

[1000]	valid_set's binary_logloss: 0.399732	valid_set's balanced_accuracy: 0.8575
[2000]	valid_set's binary_logloss: 0.464121	valid_set's balanced_accuracy: 0.87


	0.87	 = Validation score   (balanced_accuracy)
	3.03s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBM ...
	0.8675	 = Validation score   (balanced_accuracy)
	0.75s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.855	 = Validation score   (balanced_accuracy)
	0.74s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.8525	 = Validation score   (balanced_accuracy)
	0.72s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: CatBoost ...
	0.8725	 = Validation score   (balanced_accuracy)
	4.11s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.8475	 = Validation score   (balanced_accuracy)
	0.74s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.855	 = Validation score   (balanced_accuracy)
	0.64s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
Metric balanced_ac

In [14]:
predictor.leaderboard()
# Best model in the recorded run: WeightedEnsemble_L2, validation balanced
# accuracy 0.89 (see score_val in the output; the value changes between runs).

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.89,balanced_accuracy,0.251109,8.697992,0.001011,0.531049,2,True,14
1,XGBoost,0.875,balanced_accuracy,0.007,1.208777,0.007,1.208777,1,True,11
2,CatBoost,0.8725,balanced_accuracy,0.004002,4.109244,0.004002,4.109244,1,True,7
3,LightGBMLarge,0.87,balanced_accuracy,0.005997,1.721323,0.005997,1.721323,1,True,13
4,LightGBMXT,0.87,balanced_accuracy,0.015993,3.027059,0.015993,3.027059,1,True,3
5,LightGBM,0.8675,balanced_accuracy,0.005996,0.752002,0.005996,0.752002,1,True,4
6,NeuralNetTorch,0.8575,balanced_accuracy,0.004,4.864398,0.004,4.864398,1,True,12
7,ExtraTreesEntr,0.855,balanced_accuracy,0.078253,0.636105,0.078253,0.636105,1,True,9
8,RandomForestGini,0.855,balanced_accuracy,0.079995,0.740787,0.079995,0.740787,1,True,5
9,RandomForestEntr,0.8525,balanced_accuracy,0.080849,0.716876,0.080849,0.716876,1,True,6
