In [19]:
import os

import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [20]:
# Load the filtered feature table: the "class" column is the target,
# and all remaining columns form the feature matrix.
csv_path = os.path.join("data", "tabular", "feature_extraction_filtered.csv")
df = pl.read_csv(csv_path)

X = df.drop("class").to_numpy()
y = df.get_column("class").to_list()

# Keep 25% of the rows aside for the final evaluation; the fixed seed makes
# the shuffled split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=462,
)

In [21]:
# Three-stage pipeline: polynomial feature expansion (no bias column),
# standardisation, then an L2-penalised logistic regression.
# make_pipeline names each step after its lowercased class name.
poly_step = PolynomialFeatures(include_bias=False)
scale_step = StandardScaler()
clf_step = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    max_iter=2000,
    random_state=462,
)

model = make_pipeline(poly_step, scale_step, clf_step)

In [22]:
# Candidate hyper-parameters, keyed as "<pipeline step>__<parameter>".
param_grid = dict(
    polynomialfeatures__degree=[1, 2],
    logisticregression__C=[0.01, 0.05, 0.1, 0.5, 1.0],
)

# Exhaustive search over the grid with 5-fold cross-validation;
# n_jobs=-1 asks for all available processors.
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

Best params: {'logisticregression__C': 0.1, 'polynomialfeatures__degree': 2}
Best CV score: 0.9862620758707505


In [23]:
# Take the best pipeline found by the grid search and score it once on
# the test split (X_test / y_test).
best_model = grid.best_estimator_
test_acc = best_model.score(X=X_test, y=y_test)
print("Test accuracy:", test_acc)

Test accuracy: 0.9862681744749596


In [24]:
# Tabulate the cross-validation results and show only the summary columns
# for each parameter combination.
summary_columns = ["params", "mean_test_score", "std_test_score", "rank_test_score"]
results = pl.DataFrame(grid.cv_results_)
display(results.select(summary_columns))

params,mean_test_score,std_test_score,rank_test_score
struct[2],f64,f64,i32
"{0.01,1}",0.929154,0.009394,10
"{0.01,2}",0.983299,0.005423,5
"{0.05,1}",0.959325,0.00665,9
"{0.05,2}",0.985993,0.00367,2
"{0.1,1}",0.969292,0.005846,8
"{0.1,2}",0.986262,0.003445,1
"{0.5,1}",0.981684,0.005742,6
"{0.5,2}",0.984106,0.003749,3
"{1.0,1}",0.983839,0.005445,4
"{1.0,2}",0.981412,0.003342,7
