In [8]:
import os

import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [9]:
# Load the filtered feature table. The path is assembled from its components
# so the separator is chosen by the operating system.
csv_path = os.path.join("data", "tabular", "feature_extraction_filtered.csv")
df = pl.read_csv(csv_path)

# The "class" column is the target; every remaining column is used as a feature.
y = df.get_column("class").to_list()
X = df.drop("class").to_numpy()

# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# shuffled split repeatable.
# NOTE(review): the split is not stratified on y — confirm the classes are
# balanced enough for that to be acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=462,
    shuffle=True,
)

In [10]:
# Settings for the final classifier: L2-penalised logistic regression fitted
# with lbfgs, a raised iteration cap, and a fixed seed.
logreg_kwargs = {
    "penalty": "l2",
    "solver": "lbfgs",
    "max_iter": 2000,
    "random_state": 462,
}

# Pipeline stages, in execution order: polynomial feature expansion (without
# the constant bias column), standardisation, then the classifier.
expand_step = PolynomialFeatures(include_bias=False)
scale_step = StandardScaler()
classify_step = LogisticRegression(**logreg_kwargs)

# make_pipeline names the steps automatically; the grid search addresses them
# as "polynomialfeatures" and "logisticregression".
model = make_pipeline(expand_step, scale_step, classify_step)

In [11]:
# Candidate values for the two tuned hyper-parameters: the polynomial degree
# of the feature expansion and the regularisation constant C of the classifier.
degree_candidates = [1, 2]
c_candidates = [0.01, 0.05, 0.1, 0.5, 1.0]

# Keys follow the "<step name>__<parameter>" convention of the pipeline.
param_grid = {
    "polynomialfeatures__degree": degree_candidates,
    "logisticregression__C": c_candidates,
}

# Exhaustive search over every (degree, C) pair with 5-fold cross-validation,
# using all available cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

Best params: {'logisticregression__C': 0.1, 'polynomialfeatures__degree': 2}
Best CV score: 0.9099999999999999


In [12]:
# Take the pipeline that the grid search selected as best.
best_model = grid.best_estimator_

# Score it once on the held-out split, which was not used during the search.
test_acc = best_model.score(X=X_test, y=y_test)
print("Test accuracy:", test_acc)

Test accuracy: 0.926


In [13]:
# Turn the raw cross-validation results into a polars frame and show only the
# columns needed to compare the parameter combinations.
results = pl.DataFrame(grid.cv_results_)

summary_columns = ["params", "mean_test_score", "std_test_score", "rank_test_score"]
display(results.select(summary_columns))

params,mean_test_score,std_test_score,rank_test_score
struct[2],f64,f64,i32
"{0.01,1}",0.828,0.01259,10
"{0.01,2}",0.891,0.013472,5
"{0.05,1}",0.8575,0.002739,9
"{0.05,2}",0.9085,0.016477,2
"{0.1,1}",0.8645,0.003674,8
"{0.1,2}",0.91,0.01917,1
"{0.5,1}",0.881,0.007,7
"{0.5,2}",0.902,0.010654,3
"{1.0,1}",0.888,0.007969,6
"{1.0,2}",0.9,0.013509,4
