In [1]:
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score

In [2]:
data = pd.read_csv('../merged.csv')

X = data.drop(columns=["label"])  # Drop the target column
y = data["label"]  # Target column

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [4]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [5]:
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

In [6]:
# Grid search with class balancing
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

svc = LinearSVC(class_weight='balanced', max_iter=5000)

grid_search = GridSearchCV(svc, param_grid, cv=5, scoring=scoring, refit='f1')

# Fit and evaluate
grid_search.fit(X_train_scaled, y_train)



GridSearchCV(cv=5, estimator=LinearSVC(class_weight='balanced', max_iter=5000),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10]}, refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score),
                      'precision': make_scorer(precision_score),
                      'recall': make_scorer(recall_score)})

In [7]:
print(f"Best params: {grid_search.best_params_}")
print("Best F1 score:", grid_search.best_score_)

Best params: {'C': 0.001}
Best F1 score: 0.6390486235976847


In [9]:
print("All scores:")
results = grid_search.cv_results_
for metric in scoring.keys():
    best_index = results['rank_test_f1'].argmin()
    print(f"{metric}: {results[f'mean_test_{metric}'][best_index]}")

print(results)

All scores:
accuracy: 0.599258064516129
precision: 0.566699016893004
recall: 0.7325942139322421
f1: 0.6390486235976847
{'mean_fit_time': array([117.87480445, 259.49073534, 266.568857  , 274.72352257,
       291.25771928]), 'std_fit_time': array([31.28487113,  6.28246574,  2.4846649 ,  6.82423121,  2.08482377]), 'mean_score_time': array([0.03184657, 0.03265009, 0.03753333, 0.03794708, 0.03620186]), 'std_score_time': array([0.0089645 , 0.00409881, 0.00484189, 0.00176386, 0.00493349]), 'param_C': masked_array(data=[0.001, 0.01, 0.1, 1, 10],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 0.001}, {'C': 0.01}, {'C': 0.1}, {'C': 1}, {'C': 10}], 'split0_test_accuracy': array([0.59928571, 0.60875576, 0.6125576 , 0.61294931, 0.61635945]), 'split1_test_accuracy': array([0.60016129, 0.61191244, 0.61569124, 0.61698157, 0.62069124]), 'split2_test_accuracy': array([0.59700461, 0.61029954, 0.61541475, 0.61707373, 0.61921659]), 