In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mutual_info_score
from sklearn.tree import DecisionTreeClassifier

####################################################################
#                          Read data                               #
####################################################################

# Path prefix prepended (by plain string concatenation) to every data file name.
prefix = ""

# Feature files are parsed with a single-space separator and no header row.
# The column at position 500 is discarded right after loading.
# NOTE(review): presumably that column is an artefact of a trailing separator
# on each line -- confirm against the raw files.
_test_x = pd.read_table(prefix + "artificial_test.data", sep=" ", header=None)
_test_x = _test_x.drop(columns=_test_x.columns[500])

_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)

_train_x = pd.read_table(prefix + "artificial_train.data", sep=" ", header=None)
_train_x = _train_x.drop(columns=_train_x.columns[500])

In [None]:
def get_train_and_validation_data(split=400):
    """Split the loaded training set into training and validation parts.

    The first ``split`` rows of the module-level ``_train_x`` / ``_train_y``
    frames form the validation set; the remaining rows form the training set.
    The shape of each of the four parts is printed.

    Parameters
    ----------
    split : int, default 400
        Number of leading rows held out for validation.

    Returns
    -------
    tuple
        ``(train_x, train_y, valid_x, valid_y)``.
    """
    # Positional row slicing: rows [0, split) -> validation, the rest -> train.
    train_x, valid_x = _train_x.iloc[split:], _train_x.iloc[:split]
    train_y, valid_y = _train_y.iloc[split:], _train_y.iloc[:split]
    print("train_x.shape: ", train_x.shape)
    print("train_y.shape: ", train_y.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("valid_y.shape: ", valid_y.shape)
    return train_x, train_y, valid_x, valid_y

In [None]:
# Build the train/validation split that the feature-selection cells below refine.
train_x, train_y, valid_x, valid_y = get_train_and_validation_data()

In [None]:
# 1. Remove Highly Correlated Columns
def remove_highly_correlated_features(train_x, valid_x, threshold=0.95):
    """Drop columns that are strongly correlated with an earlier column.

    Absolute pairwise correlations are computed on ``train_x`` only. A column
    is dropped when its absolute correlation with any column positioned before
    it exceeds ``threshold``. The same columns are dropped from ``valid_x``.

    Returns the reduced ``(train_x, valid_x)`` pair.
    """
    abs_corr = train_x.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and
    # the diagonal (self-correlation) is ignored.
    strict_upper = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(strict_upper)
    # Column-wise test: does any earlier column correlate above the threshold?
    exceeds = (pairwise > threshold).any(axis=0)
    redundant = [name for name, flagged in exceeds.items() if flagged]
    return train_x.drop(columns=redundant), valid_x.drop(columns=redundant)

In [None]:
# Apply the correlation filter with its default threshold, then display the
# resulting training shape.
train_x, valid_x = remove_highly_correlated_features(train_x, valid_x)
train_x.shape

In [None]:
# 2. Remove Low Variance Columns
def remove_low_variance_features(train_x, valid_x, threshold=(0.8 * (1 - 0.8))):
    """Keep only the columns retained by a ``VarianceThreshold`` selector.

    The selector is fitted on ``train_x`` alone; the positions it retains are
    then used to pick columns from both ``train_x`` and ``valid_x``.

    Returns the reduced ``(train_x, valid_x)`` pair.
    """
    selector = VarianceThreshold(threshold=threshold).fit(train_x)
    kept_positions = selector.get_support(indices=True)
    # Positions are resolved against each frame's own column index, so both
    # frames are expected to share the same column order.
    train_kept = train_x[train_x.columns[kept_positions]]
    valid_kept = valid_x[valid_x.columns[kept_positions]]
    return train_kept, valid_kept

In [None]:
# Apply the variance filter with its default threshold, then display the
# resulting training shape.
train_x, valid_x = remove_low_variance_features(train_x, valid_x)
train_x.shape

In [None]:
# 3. Remove Random Columns (Optional)
# This step is an approximation and should be tailored to your specific needs
# Here we use a Decision Tree to estimate feature importance
def remove_random_features(
    train_x: pd.DataFrame,
    train_y: pd.DataFrame,
    valid_x: pd.DataFrame,
    importance=0.005,
):
    """Keep the columns a decision tree considers important enough.

    A ``DecisionTreeClassifier`` (``random_state=0``) is fitted on the training
    data. Columns whose feature importance is strictly greater than
    ``importance`` are kept in both ``train_x`` and ``valid_x``; every other
    column is treated as "random" and dropped.

    Returns the reduced ``(train_x, valid_x)`` pair.
    """
    fitted_tree = DecisionTreeClassifier(random_state=0)
    fitted_tree.fit(train_x, train_y)

    # Positional indices of the columns that clear the importance cut-off.
    # The cut-off can be adjusted based on domain knowledge.
    keep_positions = np.flatnonzero(fitted_tree.feature_importances_ > importance)
    return train_x.iloc[:, keep_positions], valid_x.iloc[:, keep_positions]

In [None]:
# Apply the tree-importance filter with its default cut-off, then display the
# resulting training shape.
train_x, valid_x = remove_random_features(
    train_x=train_x, train_y=train_y, valid_x=valid_x
)
train_x.shape

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif


def anova_filter(
    train_x: pd.DataFrame,
    train_y: pd.DataFrame,
    valid_x: pd.DataFrame,
    k: int = 50,
):
    """Keep the ``k`` columns with the highest ANOVA F-score.

    A ``SelectKBest(f_classif)`` selector is fitted on the training data only
    and the columns it selects are kept in both frames.

    Parameters
    ----------
    train_x, valid_x : pd.DataFrame
        Feature frames sharing the same column labels.
    train_y : pd.DataFrame
        Training labels; flattened to one dimension before fitting.
    k : int, default 50
        Number of columns to keep. Capped at the number of columns available,
        because earlier filters may already have left fewer than ``k``.

    Returns
    -------
    tuple of pd.DataFrame
        ``(train_x, valid_x)`` restricted to the selected columns. DataFrames
        (not bare arrays) are returned, like the other filters in this
        notebook, so column labels and the row index survive and the result
        can still be concatenated with the label column.
    """
    k = min(k, train_x.shape[1])
    selector = SelectKBest(f_classif, k=k)
    # Flatten the single-column label frame to the 1-D shape the scorer expects.
    selector.fit(train_x, np.ravel(train_y))

    selected_features = train_x.columns[selector.get_support()]
    return train_x[selected_features], valid_x[selected_features]

In [None]:
# train_x, valid_x = anova_filter(train_x=train_x, train_y=train_y, valid_x=valid_x)
# train_x.shape

In [None]:
# Name given to the target column.
label = "class"
# The label frames were read without a header, so their single column is 0.
train_y, valid_y = (
    labels.rename(columns={0: label}) for labels in (train_y, valid_y)
)
# Features and target side by side in one frame.
train_data = pd.concat([train_x, train_y[label]], axis=1)

In [None]:
# Sanity check: feature selection may remove columns but must never remove
# rows, and the label frames must keep their original shape.
(
    original_train_x,
    original_train_y,
    original_valid_x,
    original_valid_y,
) = get_train_and_validation_data()
for name, current, original in [
    ("train_x", train_x, original_train_x),
    ("valid_x", valid_x, original_valid_x),
]:
    assert len(current) == len(original), f"{name}: row count changed"
    # The selected features must still line up with the labels row for row.
assert len(train_x) == len(train_y), "train_x and train_y row counts differ"
assert len(valid_x) == len(valid_y), "valid_x and valid_y row counts differ"
for y, original_y in zip([train_y, valid_y], [original_train_y, original_valid_y]):
    assert y.shape == original_y.shape

In [None]:
# from autogluon.tabular import TabularPredictor

# save_path = "some_path"
# # train for 10 minutes (time_limit below is given in seconds)
# predictor = TabularPredictor(
#     label=label, path=save_path, eval_metric="balanced_accuracy",  problem_type="binary"
# ).fit(train_data, time_limit=60 * 10,presets = "best_quality", hyperparameters ="default")

In [None]:
# predictor.leaderboard()

In [None]:
# # check on validation data
# print(valid_x.shape)
# print(valid_y.shape)
# valid_data = pd.concat([valid_x, valid_y[label]], axis=1)
# predictor.evaluate(valid_data)
# # best model WeightedEnsemble_L2 score 0.86

In [None]:
# !pip install mljar-supervised

In [None]:
from supervised.automl import AutoML  # mljar-supervised

# Configure the mljar AutoML search, then fit it on the selected features.
automl_options = dict(
    mode="Compete",
    ml_task="binary_classification",
    total_time_limit=60 * 10,
    eval_metric="f1",
)
automl = AutoML(**automl_options)
automl.fit(train_x, train_y)

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Score the fitted mljar model on the validation split.
predictions = automl.predict(valid_x)
balanced_accuracy_score(y_true=valid_y, y_pred=predictions)

In [None]:
# !pip install auto-sklearn
# !pip install ydata-profiling
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.metrics import balanced_accuracy 

In [None]:
# Edit the settings to try in both AutoSklearn1 and AutoSklearn2
# Possibilities https://automl.github.io/auto-sklearn/master/api.html

#------------------------- edit code here
settings = dict(
    time_left_for_this_task=120,  # seconds
    seed=42,
    metric=balanced_accuracy,
    n_jobs=4,
)

# This will only be used by autosklearn 1 while autosklearn 2 will automatically
# select a strategy
resampling_strategy = "holdout"

#-------------------------

In [None]:
# Create and train an ensemble with AutoML
# Auto-sklearn will ingest the pandas dataframe and detects column types
# AutoSklearn2Classifier selects its own resampling strategy and takes no
# `resampling_strategy` argument, so only the shared settings are passed.
askl2 = AutoSklearn2Classifier(**settings)
# Fit on the feature-selected training split built earlier in this notebook
# (the names X_train / y_train are never defined here).
askl2.fit(train_x, train_y)

In [None]:
# Leaderboard restricted to ensemble members (ensemble_only=True), sorted by
# model id.
leaderboard = askl2.leaderboard(sort_by="model_id", ensemble_only=True)
print(leaderboard)

In [None]:
# Calculate balanced accuracy on the validation data. The validation split
# (valid_x / valid_y) is used because this notebook defines no X_test / y_test.
predictions = askl2.predict(valid_x)
balanced_accuracy_score(valid_y, predictions)
