In [31]:
import numpy as np
import pandas as pd
import os
from pydantic import BaseModel

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

from src.settings import DATA_DIR
from src.experiment_utils import perform_experiments, find_best_experiments

In [2]:
# Load the training features, training labels and test features from DATA_DIR
# and convert each to a NumPy array. All files are read with header=None, so
# the first row of every file is treated as data.
x_train_path = os.path.join(DATA_DIR, 'x_train.txt')
y_train_path = os.path.join(DATA_DIR, 'y_train.txt')
x_test_path = os.path.join(DATA_DIR, 'x_test.txt')

# Feature files are space-separated.
X = pd.read_csv(x_train_path, sep=' ', header=None).to_numpy()

# Labels: keep only the first column so that y is a 1-D vector.
y = pd.read_csv(y_train_path, header=None).to_numpy()[:, 0]

X_test = pd.read_csv(x_test_path, sep=' ', header=None).to_numpy()

In [3]:
# Experiment registry. Each entry stores a classifier class with its
# constructor kwargs and a feature-selector class with its constructor kwargs.
# NOTE(review): how perform_experiments consumes these entries is not visible
# here — presumably it instantiates them as class(**config); confirm.

# Selector kwargs for exp_1: model-based selection driven by logistic regression.
select_from_model_config = {"estimator": LogisticRegression(max_iter=1000)}

# Selector kwargs for exp_2: forward sequential selection; with
# n_features_to_select="auto" the tol value governs when adding features stops.
sequential_selector_config = dict(
    estimator=LogisticRegression(max_iter=1000),
    n_features_to_select="auto",
    tol=0.01,
    direction="forward",
)

experiment_dict = {
    "exp_1": dict(
        model=RandomForestClassifier,
        model_config={},
        feature_selector=SelectFromModel,
        feature_selector_config=select_from_model_config,
    ),
    "exp_2": dict(
        model=QDA,
        model_config={},
        feature_selector=SequentialFeatureSelector,
        feature_selector_config=sequential_selector_config,
    ),
}

In [None]:
# Hand the training data and the experiment registry to the project helper.
# NOTE(review): perform_experiments comes from src.experiment_utils; what it
# evaluates, records, or returns is not visible in this notebook — confirm
# that find_best_experiments() below actually picks up the results of this run.
perform_experiments(X, y, experiment_dict)

In [None]:
# Fetch the stored experiment results and take the entry at index/key 1.
# NOTE(review): the return type of find_best_experiments() is not visible
# here. If it is a 0-indexed sequence ordered best-first, [1] selects the
# second entry rather than the best one — confirm this is intended.
best_experiments = find_best_experiments()
best_experiment = best_experiments[1]

# Rebuild the classifier and the feature selector from their stored
# class + constructor-kwargs pairs.
model_cls = best_experiment["model"]
model = model_cls(**best_experiment["model_config"])

selector_cls = best_experiment["feature_selector"]
feature_selector = selector_cls(**best_experiment["feature_selector_config"])

# Fit the selector on the full training set only, then apply the same
# fitted transform to both the training and the test features.
feature_selector.fit(X, y)
X_train_reduced = feature_selector.transform(X)
X_test_reduced = feature_selector.transform(X_test)

# Train the classifier on the reduced training features.
model.fit(X_train_reduced, y)

In [None]:
# Predicted class probabilities for the reduced test set.
proba_preds = model.predict_proba(X_test_reduced)

# Rank test rows by the probability in column 1
# (presumably the positive class — verify against model.classes_).
column_one_scores = proba_preds[:, 1]
ascending_order = np.argsort(column_one_scores)

# Display the indices of the (up to) 1000 highest-scoring rows; within this
# slice the indices run from lowest to highest score.
ascending_order[-1000:]