In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
df = pd.read_csv("reduced_dataset.csv")

In [4]:
df['target'] = (df['num'] > 0).astype(int)
df = df.drop(columns=['num'])

In [5]:
X = df.drop(columns=['target'])
y = df['target']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Define Models & Hyperparameter Grids

In [7]:
# parameter grids
param_grids = {
    "Logistic Regression": {
      "C" : [0.01, 0.1, 1, 10],
        "penalty": ["l1","l2","elasticnet", None],
        "solver": ["saga","lbfgs"]
    },

    "Decision Tree" : {
        "max_depth" : [3,5,10,None],
        "min_samples_split" : [2,5,10],
        "criterion" : ["gini", "entropy"]
    },

    "Random Forest" : {
        "n_estimators" : [50,100,200],
        "max_depth" : [5,10,None],
        "min_samples_split" : [2, 5, 10]
    },

    "SVM" : {
        "C" : [0.1, 1, 10],
        "kernel" : ["linear", "rbf", "poly"],
        "gamma" : ["scale", "auto"]
    }
}

In [None]:
# RandomizedSearchCV 

In [10]:
best_models_random = {}

for name, params in param_grids.items():
    if name == "Logistic Regression":
        model = LogisticRegression(max_iter = 5000)
    elif name == "Decision Tree" : 
        model = DecisionTreeClassifier()
    elif name == "Random Forest" : 
        model = RandomForestClassifier()
    else:
        model = SVC(probability=True)

    random_search = RandomizedSearchCV(
        estimator = model,
        param_distributions = params,
        n_iter = 10,
        cv = 5,
        scoring = "f1",
        n_jobs = -1
    )

    random_search.fit(X_train, y_train)
    best_models_random[name] = random_search.best_estimator_

    print(f"{name} Randomized best params: {random_search.best_params_}")

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Lenovo\miniconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\miniconda3\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Lenovo\miniconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1228, in fit
    raise ValueError("l1_ratio must be specified when penalty is elasticnet.")
ValueError: l1_ratio must be specif

Logistic Regression Randomized best params: {'solver': 'lbfgs', 'penalty': None, 'C': 10}
Decision Tree Randomized best params: {'min_samples_split': 10, 'max_depth': 10, 'criterion': 'gini'}
Random Forest Randomized best params: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 5}
SVM Randomized best params: {'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}


In [None]:
# GridSearchCV

In [11]:
best_models_grid = {}

for name, model in best_models_random.items():
    params = param_grids[name]

    grid_search = GridSearchCV(
        estimator = model,
        param_grid = {k: [v] if not isinstance(v, list) else v for k, v in params.items()},
        cv = 5,
        scoring = "f1",
        n_jobs = -1
    )

    grid_search.fit(X_train, y_train)
    best_models_grid[name] = grid_search.best_estimator_

    print(f"{name} Grid best params: {grid_search.best_params_}")

60 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Lenovo\miniconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\miniconda3\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Lenovo\miniconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1218, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Lenovo\miniconda3\Lib\site-pack

Logistic Regression Grid best params: {'C': 0.01, 'penalty': None, 'solver': 'saga'}
Decision Tree Grid best params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10}
Random Forest Grid best params: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}
SVM Grid best params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


In [None]:
# Evaluate Tuned Model 

In [None]:
results = []

for name, model in best_models_grid.items():
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score()y_test, y_pred)
    rec = recall_score()