# Model Research

Highlights from my attempts at finding the best possible model to predict who is most likely to subscribe to the newsletter.

### Summary

&emsp;[Simple Logistic Regression](#round-1) \
&emsp;[Where I tried to find out why I wasn't seeing incremental F1-score improvements, but jumps and plateaus](#round-4) \
&emsp;[Simple Random Forest Classifier](#round-8) \
&emsp;[Where I started out automating the use of GridSearchCV](#round-10) \
&emsp;[np.log() columns](#round-11) \
&emsp;[Back to Logistic Regression, with Polynomial features](#round-13)

### Tracking Server

[The MLflow tracking server](https://aengusbl-conversion-rate-tracking-server.hf.space)

Please note that the "test_r2" column name is a mistake I made when setting up the MLflow logging: every score in that column is an F1 score. I could not rename the column after the fact, and having two score columns in the tracking server would have been more confusing, so I kept the name for the sake of consistency.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score
from dotenv import load_dotenv


import plotly.express as px
import pandas as pd
import numpy as np

import mlflow, os

In [None]:
# Load the training CSV and display its column names.
data = pd.read_csv('conversion_data_train.csv')
data.columns

Index(['country', 'age', 'new_user', 'source', 'total_pages_visited',
       'converted'],
      dtype='object')

### Round 1:

```python
model = LogisticRegression(random_state=444719, n_jobs=-1)

param_grid = {
    "penalty": ['l1', 'l2'],
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "max_iter": [10, 50, 100, 150, 200],
    "solver": ["liblinear", "saga"]
}
```

In [None]:
# Feature groups: categorical columns get one-hot encoded, numerical ones get
# standardised.
categorical_cols = ["country", "source"]
numerical_cols = ["age", "total_pages_visited"]

# new_user is already a 0/1 flag, so it is neither scaled nor encoded; it
# reaches the model through the transformer's "passthrough" remainder.
X = data[[*categorical_cols, *numerical_cols, "new_user"]]
y = data["converted"]

# 75/25 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=444719, stratify=y,
)

onehot_encoder = OneHotEncoder(drop="first", handle_unknown="ignore")
standard_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    [
        ("categorical", onehot_encoder, categorical_cols),
        ("numerical", standard_scaler, numerical_cols),
    ],
    remainder="passthrough",
)

# Fit the transformers on the training part only, then reuse them on the test part.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# F1-based scorer shared by every grid search in this notebook.
scorer = make_scorer(f1_score)

In [None]:
# Base estimator for this round's grid search; the fixed seed keeps runs repeatable.
model = LogisticRegression(random_state=444719, n_jobs=-1)

In [52]:
# Search space for this round: every combination of the values below is tried.
param_grid = {
    "penalty": ["l1", "l2"],
    "C": [0.001, 0.01, 0.1, 1, 10],
    "max_iter": [10, 50, 100, 150, 200],
    "solver": ["lbfgs", "sag", "saga", "newton-cholesky", "newton-cg"],
}

In [None]:
# Round 1 run: grid-search the logistic regression, then log the best
# parameters, the test F1 score and the refit model to the MLflow server.
load_dotenv()  # read variables from a local .env file, if one exists

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

EXPERIMENT_NAME = "conversion-rate"
mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="Logistic regression"):
    grid_search.fit(X_train, y_train)

    mlflow.log_params(grid_search.best_params_)

    y_pred = grid_search.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    # The metric key stays "test_r2" so this run lines up with the earlier
    # runs on the server; the value stored under it is an F1 score.
    mlflow.log_metric("test_r2", f1)

    # Both numbers are F1 scores (the scorer is make_scorer(f1_score)), so
    # they are labelled as such instead of "R²".
    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation F1:", grid_search.best_score_)
    print("Test F1:", f1)

    # One transformed training row is stored with the model as its input example.
    input_example = X_train[:1]
    mlflow.sklearn.log_model(
        grid_search.best_estimator_,
        "model",
        input_example=input_example,
        registered_model_name="conversion-rate-model"
    )

### Round 2

```python
model = LogisticRegression(random_state=444719, n_jobs=-1)

param_grid = {
    "penalty": ['l1', 'l2', "elasticnet"],
    "C": [10, 20, 30, 40, 50, 60, 70],
    "max_iter": [40, 50, 60, 70, 80, 90],
    "solver": ["lbfgs", "sag", "saga", "newton-cholesky", "newton-cg"]
}
```

In [57]:
# Fresh, unfitted logistic regression for the Round 2 search (same seed as before).
model = LogisticRegression(random_state=444719, n_jobs=-1)

In [None]:
# Round 2 search space: larger C values and a tighter max_iter window.
# The elastic-net penalty is spelled "elasticnet" in scikit-learn; "elastic"
# is not a recognised option.
# NOTE(review): scikit-learn only fits "elasticnet" with solver="saga" and an
# explicit l1_ratio; no l1_ratio is searched here, so confirm that those
# candidates actually train.
param_grid = {
    "penalty": ["l1", "l2", "elasticnet"],
    "C": [10, 20, 30, 40, 50, 60, 70],
    "max_iter": [40, 50, 60, 70, 80, 90],
    "solver": ["lbfgs", "sag", "saga", "newton-cholesky", "newton-cg"],
}

In [None]:
# Round 2 run: same procedure as Round 1, using the current `model` and `param_grid`.
load_dotenv()  # read variables from a local .env file, if one exists

# 5-fold cross-validated search, scored with the F1 scorer.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

EXPERIMENT_NAME="conversion-rate"
mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

with mlflow.start_run(experiment_id = experiment.experiment_id, run_name="Logistic regression"):
    grid_search.fit(X_train, y_train)

    mlflow.log_params(grid_search.best_params_)

    # Score the winning configuration on the held-out test split.
    y_pred = grid_search.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    # Logged under "test_r2" for continuity with earlier runs; the value is an F1 score.
    mlflow.log_metric("test_r2", f1)

    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation F1:", grid_search.best_score_)
    print("Test F1:", f1)

    # One transformed training row is stored with the model as its input example.
    input_example = X_train[:1]
    mlflow.sklearn.log_model(
        grid_search.best_estimator_,
        "model",
        input_example=input_example,
        registered_model_name="conversion-rate-model"
    )

### Round 3

```python
model = LogisticRegression(random_state=444719, n_jobs=-1)

param_grid = {
    "penalty": ['l1', 'l2', "elasticnet", None],
    "C": [70, 90, 110, 120],
    "max_iter": [30, 35, 40, 45],
    "solver": ["lbfgs", "sag", "saga", "newton-cholesky", "liblinear"]
}
```

In [60]:
# Fresh, unfitted logistic regression for the Round 3 search (same seed as before).
model = LogisticRegression(random_state=444719, n_jobs=-1)

In [61]:
# Round 3 search space: weaker regularisation (higher C), fewer iterations,
# and the unpenalised model (None) added to the penalty options.
param_grid = {
    "penalty": ["l1", "l2", "elasticnet", None],
    "C": [70, 90, 110, 120],
    "max_iter": [30, 35, 40, 45],
    "solver": ["lbfgs", "sag", "saga", "newton-cholesky", "liblinear"],
}

In [None]:
# Round 3 run: grid-search the current `param_grid` and log the outcome to MLflow.
load_dotenv()  # read variables from a local .env file, if one exists

# 5-fold cross-validated search, scored with the F1 scorer.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

EXPERIMENT_NAME="conversion-rate"
mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

with mlflow.start_run(experiment_id = experiment.experiment_id, run_name="Logistic regression"):
    grid_search.fit(X_train, y_train)

    mlflow.log_params(grid_search.best_params_)

    # Score the winning configuration on the held-out test split.
    y_pred = grid_search.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    # Logged under "test_r2" for continuity with earlier runs; the value is an F1 score.
    mlflow.log_metric("test_r2", f1)

    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation F1:", grid_search.best_score_)
    print("Test F1:", f1)

    # One transformed training row is stored with the model as its input example.
    input_example = X_train[:1]
    mlflow.sklearn.log_model(
        grid_search.best_estimator_,
        "model",
        input_example=input_example,
        registered_model_name="conversion-rate-model"
    )

### Round 4

```python
model = LogisticRegression(random_state=444719, n_jobs=-1)

param_grid = {
    "penalty": ['l2'],
    "C": [65, 70, 75],
    "max_iter": [25, 26, 27, 27, 28, 29, 30, 31, 32, 33, 34, 35],
    "solver": ["lbfgs"]
}
```

In [5]:
# Fresh, unfitted logistic regression for the Round 4 search (same seed as before).
model = LogisticRegression(random_state=444719, n_jobs=-1)

In [4]:
# Round 4 search space: l2 + lbfgs only, sweeping max_iter one step at a time.
# Each max_iter value appears once; a repeated value would only make
# GridSearchCV fit the same candidate again.
param_grid = {
    "penalty": ["l2"],
    "C": [65, 70, 75],
    "max_iter": [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
    "solver": ["lbfgs"],
}

In [None]:
# Round 4 run: grid-search the current `param_grid` and log the outcome to MLflow.
load_dotenv()  # read variables from a local .env file, if one exists

# 5-fold cross-validated search, scored with the F1 scorer.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

EXPERIMENT_NAME="conversion-rate"
mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

with mlflow.start_run(experiment_id = experiment.experiment_id, run_name="Logistic regression"):
    grid_search.fit(X_train, y_train)

    mlflow.log_params(grid_search.best_params_)

    # Score the winning configuration on the held-out test split.
    y_pred = grid_search.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    # Logged under "test_r2" for continuity with earlier runs; the value is an F1 score.
    mlflow.log_metric("test_r2", f1)

    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation F1:", grid_search.best_score_)
    print("Test F1:", f1)

    # One transformed training row is stored with the model as its input example.
    input_example = X_train[:1]
    mlflow.sklearn.log_model(
        grid_search.best_estimator_,
        "model",
        input_example=input_example,
        registered_model_name="conversion-rate-model"
    )

Here, I was wondering why the f1 scores were stagnating for a while and then jumping to a higher score, despite GridSearchCV clearly finding ways to make small, incremental improvements. \
I tried to manually fit a single set of hyperparameters that GridSearchCV had rejected, and I did indeed get a score I hadn't seen before, that was a little lower than the best score in the last round. This proved that I probably wasn't doing anything wrong, and that there somehow were specific scores that I could only jump between. This phenomenon kept going for the whole duration of the project.

In [10]:
# Sanity check: fit one hand-picked configuration outside GridSearchCV and
# print its F1 score on the test split.
test_params = {"C": 10, "max_iter": 50, "penalty": "l1", "solver": "saga"}

test_model = LogisticRegression(random_state=444719, n_jobs=-1, **test_params)
test_model.fit(X_train, y_train)

test_y_pred = test_model.predict(X_test)
f1 = f1_score(y_test, test_y_pred)
print(f1)

0.7613526570048309


### Round 7

```python
model = LogisticRegression(random_state=444719, n_jobs=-1)

param_grid = {
    "penalty": ['l2'],
    "C": [i/100 for i in range(1000, 2000)],
    "max_iter": [20],
    "solver": ["lbfgs"]
}
```

In [18]:
# Fresh, unfitted logistic regression for the Round 7 search (same seed as before).
model = LogisticRegression(random_state=444719, n_jobs=-1)

In [19]:
# Round 7 search space: a fine sweep of C from 10.00 to 19.99 in steps of 0.01,
# with everything else held fixed.
param_grid = {
    "penalty": ["l2"],
    "C": [hundredths / 100 for hundredths in range(1000, 2000)],
    "max_iter": [20],
    "solver": ["lbfgs"],
}

In [None]:
# Round 7 run: grid-search the current `param_grid` and log the outcome to MLflow.
load_dotenv()  # read variables from a local .env file, if one exists

# 5-fold cross-validated search, scored with the F1 scorer.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

EXPERIMENT_NAME="conversion-rate"
mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

with mlflow.start_run(experiment_id = experiment.experiment_id, run_name="Logistic regression"):
    grid_search.fit(X_train, y_train)

    mlflow.log_params(grid_search.best_params_)

    # Score the winning configuration on the held-out test split.
    y_pred = grid_search.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    # Logged under "test_r2" for continuity with earlier runs; the value is an F1 score.
    mlflow.log_metric("test_r2", f1)

    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation F1:", grid_search.best_score_)
    print("Test F1:", f1)

    # One transformed training row is stored with the model as its input example.
    input_example = X_train[:1]
    mlflow.sklearn.log_model(
        grid_search.best_estimator_,
        "model",
        input_example=input_example,
        registered_model_name="conversion-rate-model"
    )

### Round 8

```python
model = RandomForestClassifier(random_state=444719, n_jobs=-1, oob_score=f1_score)

param_grid = {
    "n_estimators": [50, 100],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [20, 50],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [5, 7],
    "max_samples": [None, 500],
    "min_impurity_decrease": [0.0, 0.2]
}
```

In [None]:
# Switch the base estimator to a random forest; the seed matches the earlier rounds.
model = RandomForestClassifier(random_state=444719, n_jobs=-1)

In [5]:
# Round 8 search space for the random forest: two or three options per setting.
param_grid = {
    "n_estimators": [50, 100],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [20, 50],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [5, 7],
    "max_samples": [None, 500],
    "min_impurity_decrease": [0.0, 0.2],
}

In [None]:
# Round 8 run: grid-search the random forest and log the outcome to MLflow.
load_dotenv()  # read variables from a local .env file, if one exists

# 5-fold cross-validated search, scored with the F1 scorer.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

EXPERIMENT_NAME="conversion-rate"
mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

with mlflow.start_run(experiment_id = experiment.experiment_id, run_name="Random forest classification"):
    grid_search.fit(X_train, y_train)

    mlflow.log_params(grid_search.best_params_)

    # Score the winning configuration on the held-out test split.
    y_pred = grid_search.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    # Logged under "test_r2" for continuity with earlier runs; the value is an F1 score.
    mlflow.log_metric("test_r2", f1)

    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation F1:", grid_search.best_score_)
    print("Test F1:", f1)

    # One transformed training row is stored with the model as its input example.
    input_example = X_train[:1]
    mlflow.sklearn.log_model(
        grid_search.best_estimator_,
        "model",
        input_example=input_example,
        registered_model_name="conversion-rate-model"
    )



Best parameters: {'criterion': 'gini', 'max_depth': 20, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 7, 'min_samples_split': 5, 'n_estimators': 50}
Best cross-validation F1: 0.7602809054260993
Test F1: 0.7474014986705342


Registered model 'conversion-rate-model' already exists. Creating a new version of this model...
2025/08/25 21:43:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: conversion-rate-model, version 8
Created version '8' of model 'conversion-rate-model'.


🏃 View run Random forest classification at: https://aengusbl-conversion-rate-tracking-server.hf.space/#/experiments/1/runs/9369b4d870214edab0ecd6185f0dfa79
🧪 View experiment at: https://aengusbl-conversion-rate-tracking-server.hf.space/#/experiments/1


### Round 10

Running grid searches automatically.
The automation script gets improved in subsequent rounds.

In [None]:
def tree_train(params, model=model):
    """Run one grid-search round, log it to MLflow and derive the next grid.

    Args:
        params: Mapping of hyperparameter name to the list of candidate values
            for this round. The refinement treats index 0 as the low end and
            index 2 as the high end, so lists should be in ascending order.
        model: Estimator to tune. Defaults to the notebook-level ``model`` as
            it was when this cell was executed.

    Returns:
        A new grid with one entry per tuned hyperparameter, built around this
        round's best values.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=5,
        scoring=scorer,
        n_jobs=-1
    )

    EXPERIMENT_NAME = "conversion-rate"
    mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
    mlflow.set_experiment(EXPERIMENT_NAME)
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

    with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="Random forest classification"):
        grid_search.fit(X_train, y_train)

        mlflow.log_params(grid_search.best_params_)

        y_pred = grid_search.predict(X_test)
        f1 = f1_score(y_test, y_pred)

        # Logged under "test_r2" for continuity with earlier runs; the value is an F1 score.
        mlflow.log_metric("test_r2", f1)

        print("Best parameters:", grid_search.best_params_)
        print("Best cross-validation F1:", grid_search.best_score_)
        print("Test F1:", f1)

        input_example = X_train[:1]
        mlflow.sklearn.log_model(
            grid_search.best_estimator_,
            "model",
            input_example=input_example,
            registered_model_name="conversion-rate-model"
        )

    new_params = dict()
    for param_name, val in grid_search.best_params_.items():
        options = params[param_name]
        if len(options) == 1:
            # Already narrowed down: keep the value as is.
            new_params[param_name] = [val]
            continue
        best_param_index = options.index(val)
        if best_param_index == 2:
            # Best value sits at the top of the range: explore upwards.
            # The step is made a little irregular to avoid retrying rejected
            # values in later rounds; the `+ 1` keeps int() from rounding to 0.
            step = val - options[best_param_index - 1]
            new_params[param_name] = [val, (val + int((step / 2) + 1)), (val + (step * 2))]
        elif best_param_index == 1:
            # Best value sits in the middle: halve the step around it.
            try:
                step = int((options[best_param_index + 1] - val) / 2)
            except IndexError:
                # Two-value grid: there is no upper neighbour, use the lower one.
                step = int((val - options[best_param_index - 1]) / 2)
            if step <= 1:
                new_params[param_name] = [val]
            else:
                new_params[param_name] = [(val - step), val, (val + step)]
        elif best_param_index == 0:
            # Best value sits at the bottom of the range: explore downwards,
            # dropping candidates that are not strictly positive.
            step = options[best_param_index + 1] - val
            new_params[param_name] = [i for i in [abs(int(val - (step * 1.5))), abs(val - step), val] if i > 0]
        else:
            # Grids longer than three values have no refinement rule; pin the
            # best value rather than silently dropping the parameter.
            new_params[param_name] = [val]

    return new_params



# NOTE(review): scikit-learn requires an integer min_samples_split to be at
# least 2, so candidates using 1 are expected to fail to fit. Confirm whether
# the 1 should be dropped from this starting grid.
param_grid = {'max_depth': [48, 49, 50], 'min_samples_leaf': [10, 11, 12], 'min_samples_split': [1, 2, 3], 'n_estimators': [53]}

# Keep refining until every hyperparameter has been narrowed to a single value
# (independent of how many hyperparameters the grid contains).
while any(len(options) > 1 for options in param_grid.values()):
    print(f"Current params: {param_grid}")
    param_grid = tree_train(params=param_grid)

### Round 11

Running grid searches, but with the log of the numerical values too. \
I tried with np.exp() as well, but too many values were going to infinity. Maybe next time I should use a standard scaler and only then use np.exp() on the values.

In [28]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,age,new_user,total_pages_visited,converted
count,284580.0,284580.0,284580.0,284580.0
mean,30.564203,0.685452,4.873252,0.032258
std,8.266789,0.464336,3.341995,0.176685
min,17.0,0.0,1.0,0.0
25%,24.0,0.0,2.0,0.0
50%,30.0,1.0,4.0,0.0
75%,36.0,1.0,7.0,0.0
max,123.0,1.0,29.0,1.0


In [None]:
categorical_cols = ['country', 'source']
numerical_cols = ['age', 'total_pages_visited']

log_data = data.copy()

# Holds the original numerical columns plus their log-transformed copies.
log_numerical_cols = numerical_cols.copy()

for col in numerical_cols:
    log_col_name = "log_" + col
    values = log_data[col]
    # Values equal to 0, 1 or -1 are shifted by 0.1 before taking the log
    # (same rule as before, applied to the whole column at once instead of
    # row by row).
    needs_offset = (values == 0) | (values.abs() == 1)
    log_data[log_col_name] = np.log(values.mask(needs_offset, values + 0.1))
    log_numerical_cols.append(log_col_name)

X = log_data[categorical_cols + log_numerical_cols + ["new_user"]] # new_user shouldn't be normalised, nor does it need to be one-hot encoded (0s and 1s already)
y = log_data["converted"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.75,
    random_state=444719,
    stratify=y
)

onehot_encoder = OneHotEncoder(drop="first", handle_unknown="ignore")
standard_scaler = StandardScaler()

# Only the original numerical columns are standardised; the log_* columns are
# not listed here, so they go through the "passthrough" remainder unscaled.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", onehot_encoder, categorical_cols),
        ("numerical", standard_scaler, numerical_cols)
    ],
    remainder="passthrough"
)

# Fit the transformers on the training part only, then reuse them on the test part.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

scorer = make_scorer(f1_score)

In [None]:
def tree_train(params, model=model):
    """Run one grid-search round on the log-augmented data and derive the next grid.

    Args:
        params: Mapping of hyperparameter name to the list of candidate values
            for this round. The refinement treats index 0 as the low end and
            index 2 as the high end, so numeric lists should be ascending.
        model: Estimator to tune. Defaults to the notebook-level ``model`` as
            it was when this cell was executed.

    Returns:
        A new grid built around this round's best values. "criterion" is
        always reset to all three options.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=5,
        scoring=scorer,
        n_jobs=-1
    )

    EXPERIMENT_NAME = "conversion-rate"
    mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
    mlflow.set_experiment(EXPERIMENT_NAME)
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

    with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="With log cols: Random forest classification"):
        grid_search.fit(X_train, y_train)

        mlflow.log_params(grid_search.best_params_)

        y_pred = grid_search.predict(X_test)
        f1 = f1_score(y_test, y_pred)

        # Logged under "test_r2" for continuity with earlier runs; the value is an F1 score.
        mlflow.log_metric("test_r2", f1)

        print("Best parameters:", grid_search.best_params_)
        print("Best cross-validation F1:", grid_search.best_score_)
        print("Test F1:", f1)

        input_example = X_train[:1]
        mlflow.sklearn.log_model(
            grid_search.best_estimator_,
            "model",
            input_example=input_example,
            registered_model_name="conversion-rate-model"
        )

    new_params = dict()
    for param_name, val in grid_search.best_params_.items():
        if param_name == "criterion":
            # Not numeric, so it cannot be stepped: offer every criterion again.
            new_params[param_name] = ["gini", "entropy", "log_loss"]
            continue
        options = params[param_name]
        if len(options) == 1:
            # Already narrowed down: keep the value as is.
            new_params[param_name] = [val]
            continue
        best_param_index = options.index(val)
        if best_param_index == 2:
            # Best value sits at the top of the range: explore upwards with an
            # uneven step; the `+ 1` keeps int() from rounding the step to 0.
            step = val - options[best_param_index - 1]
            new_params[param_name] = [val, (val + int((step / 2) + 1)), (val + (step * 2))]
        elif best_param_index == 1:
            # Best value sits in the middle: halve the step around it.
            try:
                step = int((options[best_param_index + 1] - val) / 2)
            except IndexError:
                # Two-value grid: there is no upper neighbour, use the lower one.
                step = int((val - options[best_param_index - 1]) / 2)
            if step <= 1:
                new_params[param_name] = [val]
            else:
                new_params[param_name] = [(val - step), val, (val + step)]
        elif best_param_index == 0:
            # Best value sits at the bottom of the range: explore downwards,
            # dropping candidates that are not strictly positive.
            step = options[best_param_index + 1] - val
            new_params[param_name] = [i for i in [abs(int(val - (step * 1.5))), abs(val - step), val] if i > 0]
        else:
            # Grids longer than three values have no refinement rule; pin the
            # best value rather than silently dropping the parameter.
            new_params[param_name] = [val]

    return new_params



# Wide starting grid: three widely spaced values per numeric hyperparameter.
param_grid = {
    "max_depth": [2, 50, 1000],
    "min_samples_leaf": [2, 50, 1000],
    "min_samples_split": [2, 50, 1000],
    "n_estimators": [2, 50, 1000],
    "max_samples": [50, 1000, 10_000],
    "criterion": ["gini", "entropy", "log_loss"],
}

# tree_train offers all three criteria again on every round, so "criterion"
# never shrinks and is left out of the stopping test. Refinement stops once
# every other hyperparameter is down to a single value. The previous test
# (total number of candidates > 4) could never become false for this
# six-entry grid.
while any(len(options) > 1 for name, options in param_grid.items() if name != "criterion"):
    print(f"Current params: {param_grid}")
    param_grid = tree_train(params=param_grid)

#### Last attempt at Random forest with log columns

I removed a lot of my failed attempts that timed out on Google Colab and LightningAI or failed in a different way. This is simply a representative example of what I did to get the third-best performing model.

In [None]:
categorical_cols = ['country', 'source']
numerical_cols = ['age', 'total_pages_visited']

log_data = data.copy()

# Holds the original numerical columns plus their log-transformed copies.
log_numerical_cols = numerical_cols.copy()

for col in numerical_cols:
    log_col_name = "log_" + col
    values = log_data[col]
    # Values equal to 0, 1 or -1 are shifted by 0.1 before taking the log
    # (same rule as before, applied to the whole column at once instead of
    # row by row).
    needs_offset = (values == 0) | (values.abs() == 1)
    log_data[log_col_name] = np.log(values.mask(needs_offset, values + 0.1))
    log_numerical_cols.append(log_col_name)

X = log_data[categorical_cols + log_numerical_cols + ["new_user"]] # new_user shouldn't be normalised, nor does it need to be one-hot encoded (0s and 1s already)
y = log_data["converted"]

# Only 30% of the rows are used for training here (test_size=0.70).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.70,
    random_state=444719,
    stratify=y
)

onehot_encoder = OneHotEncoder(drop="first", handle_unknown="ignore")
standard_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", onehot_encoder, categorical_cols),
        ("numerical", standard_scaler, numerical_cols) # I left the new log columns out of the scaler by mistake, but it worked well so I left it like that
    ],
    remainder="passthrough"
)

# Fit the transformers on the training part only, then reuse them on the test part.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

scorer = make_scorer(f1_score)

In [None]:
def tree_train(params, model=model):
    """Run one 3-fold grid-search round, log it to MLflow and derive the next grid.

    Args:
        params: Mapping of hyperparameter name to the list of candidate values
            for this round. The refinement treats index 0 as the low end and
            index 2 as the high end, so numeric lists should be ascending.
        model: Estimator to tune. Defaults to the notebook-level ``model`` as
            it was when this cell was executed.

    Returns:
        A new grid built around this round's best values. "criterion" is
        always pinned to "entropy".

    Raises:
        KeyError: If the experiment has to be created and ARTIFACT_STORE_URI
            is not set in the environment.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=3,
        scoring=scorer,
        n_jobs=-1,
        verbose=2
    )

    EXPERIMENT_NAME = "conversion-rate"
    mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
    mlflow.set_experiment(EXPERIMENT_NAME)
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

    if experiment is None:
        # Create the experiment under the name that was just looked up, so the
        # run lands where every other round is logged.
        experiment_id = mlflow.create_experiment(
            EXPERIMENT_NAME,
            artifact_location=os.environ["ARTIFACT_STORE_URI"]
        )
    else:
        experiment_id = experiment.experiment_id

    with mlflow.start_run(experiment_id=experiment_id, run_name="With log cols and 0.70 test size (for speed): Random forest classification"):
        grid_search.fit(X_train, y_train)

        mlflow.log_params(grid_search.best_params_)

        y_pred = grid_search.predict(X_test)
        f1 = f1_score(y_test, y_pred)

        # Logged under "test_r2" for continuity with earlier runs; the value is an F1 score.
        mlflow.log_metric("test_r2", f1)

        print("Best parameters:", grid_search.best_params_)
        print("Best cross-validation F1:", grid_search.best_score_)
        print("Test F1:", f1)

        input_example = X_train[:1]
        mlflow.sklearn.log_model(
            grid_search.best_estimator_,
            "model",
            input_example=input_example,
            registered_model_name="conversion-rate-model"
        )

    # I was having issues with some automatically-generated values to try out sometimes being the same ([5, 5, 5]) or negative (due to the calculated step being bigger than the lower value),
    # so there are a lot of checks and little operations to try and mitigate those issues.

    new_params = dict()
    for param_name, val in grid_search.best_params_.items():
        if param_name == "criterion":
            new_params[param_name] = ["entropy"]
            continue
        options = params[param_name]
        if len(options) == 1:
            # Already narrowed down: keep the value as is.
            new_params[param_name] = [val]
            continue
        best_param_index = options.index(val)
        if best_param_index == 2:
            # Best value sits at the top of the range: explore upwards with an
            # uneven step; the `+ 1` keeps int() from rounding the step to 0.
            step = val - options[best_param_index - 1]
            new_params[param_name] = [val, (val + int((step / 2) + 1)), (val + (step * 2))]
        elif best_param_index == 1:
            # Best value sits in the middle: halve the step around it.
            try:
                step = int((options[best_param_index + 1] - val) / 2)
            except IndexError:
                # Two-value grid: there is no upper neighbour, use the lower one.
                step = int((val - options[best_param_index - 1]) / 2)
            if step <= 1:
                new_params[param_name] = [val]
            else:
                new_params[param_name] = [i for i in [(val - step), val, (val + step)] if i > 0]
        elif best_param_index == 0:
            # Best value sits at the bottom of the range: explore downwards,
            # dropping candidates that are not strictly positive.
            step = options[best_param_index + 1] - val
            new_params[param_name] = [i for i in [abs(int(val - (step * 1.5))), abs(val - step), val] if i > 0]
        else:
            # Grids longer than three values have no refinement rule; pin the
            # best value rather than silently dropping the parameter.
            new_params[param_name] = [val]

    return new_params



param_grid = {
    "max_depth": [50, 100],
    "max_samples": [28_000, 30_000, 50_000],
    "min_samples_leaf": [2, 5, 10],
    "min_samples_split": [2, 5, 10],
    "n_estimators": [2900, 5000, 7000],
    "criterion": ["entropy"],
}

# Stop once every hyperparameter has been narrowed to a single value. The
# previous test (total number of candidates > 4) could never become false for
# this six-entry grid, so the loop kept re-running the final configuration.
while any(len(options) > 1 for options in param_grid.values()):
    print(f"Current params: {param_grid}")
    param_grid = tree_train(params=param_grid)

### Round 13:
Logistic regression with polynomial features. \
Polynomial expansion is a built-in way to engineer non-linear features in scikit-learn.

In [None]:
# Feature groups: categorical columns get one-hot encoded; numerical ones are
# both standardised and expanded into polynomial terms.
categorical_cols = ["country", "source"]
numerical_cols = ["age", "total_pages_visited"]

# new_user is already a 0/1 flag, so it is neither scaled nor encoded; it
# reaches the model through the transformer's "passthrough" remainder.
X = data[[*categorical_cols, *numerical_cols, "new_user"]]
y = data["converted"]

# 75/25 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=444719, stratify=y,
)

onehot_encoder = OneHotEncoder(drop="first", handle_unknown="ignore")
standard_scaler = StandardScaler()
poly = PolynomialFeatures(degree=3, include_bias=False)

# The scaler and the polynomial expansion each receive the raw numerical
# columns, so the polynomial terms are built from unscaled values.
preprocessor = ColumnTransformer(
    [
        ("categorical", onehot_encoder, categorical_cols),
        ("numerical_sc", standard_scaler, numerical_cols),
        ("numerical_poly", poly, numerical_cols),
    ],
    remainder="passthrough",
)

# Fit the transformers on the training part only, then reuse them on the test part.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

scorer = make_scorer(f1_score)

In [49]:
# Show the first transformed training row to inspect the preprocessor's output layout.
X_train[:1]

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  1.73126840e-01,
        -2.60463485e-01,  3.20000000e+01,  4.00000000e+00,
         1.02400000e+03,  1.28000000e+02,  1.60000000e+01,
         3.27680000e+04,  4.09600000e+03,  5.12000000e+02,
         6.40000000e+01,  1.00000000e+00]])

In [50]:
# Back to an unfitted logistic regression, now fed the polynomial features.
model = LogisticRegression(random_state=444719, n_jobs=-1)

In [51]:
# Round 13 search space: l2 + lbfgs only, sweeping C and max_iter.
param_grid = {
    "penalty": ["l2"],
    "C": [0.001, 0.01, 0.1, 1, 10],
    "max_iter": [10, 50, 100, 150, 200],
    "solver": ["lbfgs"],
}

In [None]:
# Round 13 run: grid-search the logistic regression on the polynomial
# features, then log the best parameters, the test F1 score and the refit
# model to the MLflow server.
load_dotenv()  # read variables from a local .env file, if one exists

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1,
    verbose=1
)

EXPERIMENT_NAME = "conversion-rate"
mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="Polynomial expansion: Logistic regression"):
    grid_search.fit(X_train, y_train)

    mlflow.log_params(grid_search.best_params_)

    y_pred = grid_search.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    # The metric key stays "test_r2" so this run lines up with the earlier
    # runs on the server; the value stored under it is an F1 score.
    mlflow.log_metric("test_r2", f1)

    # Both numbers are F1 scores (the scorer is make_scorer(f1_score)), so
    # they are labelled as such instead of "R²".
    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation F1:", grid_search.best_score_)
    print("Test F1:", f1)

    # One transformed training row is stored with the model as its input example.
    input_example = X_train[:1]
    mlflow.sklearn.log_model(
        grid_search.best_estimator_,
        "model",
        input_example=input_example,
        registered_model_name="conversion-rate-model"
    )

Fitting 5 folds for each of 25 candidates, totalling 125 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10).
You might also want to scale 

Best parameters: {'C': 10, 'max_iter': 200, 'penalty': 'l2', 'solver': 'lbfgs'}
Best cross-validation R²: 0.7163369717760867
Test R²: 0.7128911138923655


Registered model 'conversion-rate-model' already exists. Creating a new version of this model...
2025/08/26 19:53:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: conversion-rate-model, version 45
Created version '45' of model 'conversion-rate-model'.


🏃 View run Polynomial expansion: Logistic regression at: https://aengusbl-conversion-rate-tracking-server.hf.space/#/experiments/1/runs/78f8dfba17d04fed9b54368410bf2d2e
🧪 View experiment at: https://aengusbl-conversion-rate-tracking-server.hf.space/#/experiments/1


In [None]:
def tree_train(params, model=model):
    """Run one grid-search round on the polynomial features and derive the next grid.

    Unlike the random-forest versions, steps are kept as floats so that "C"
    can be refined; "max_iter" is converted back to integers at the end.

    Args:
        params: Mapping of hyperparameter name to the list of candidate values
            for this round. The refinement treats index 0 as the low end and
            index 2 as the high end, so numeric lists should be ascending.
        model: Estimator to tune. Defaults to the notebook-level ``model`` as
            it was when this cell was executed.

    Returns:
        A new grid built around this round's best values. "penalty" and
        "solver" are pinned to the best option found.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=5,
        scoring=scorer,
        n_jobs=-1,
        verbose=1
    )

    EXPERIMENT_NAME = "conversion-rate"
    mlflow.set_tracking_uri("https://aengusbl-conversion-rate-tracking-server.hf.space")
    mlflow.set_experiment(EXPERIMENT_NAME)
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

    with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="Polynomial expansion and L1: Logistic regression"):
        grid_search.fit(X_train, y_train)

        mlflow.log_params(grid_search.best_params_)

        y_pred = grid_search.predict(X_test)
        f1 = f1_score(y_test, y_pred)

        # Logged under "test_r2" for continuity with earlier runs; the value is an F1 score.
        mlflow.log_metric("test_r2", f1)

        print("Best parameters:", grid_search.best_params_)
        print("Best cross-validation F1:", grid_search.best_score_)
        print("Test F1:", f1)

        input_example = X_train[:1]
        mlflow.sklearn.log_model(
            grid_search.best_estimator_,
            "model",
            input_example=input_example,
            registered_model_name="conversion-rate-model"
        )

    new_params = dict()
    for param_name, val in grid_search.best_params_.items():
        if param_name in ["penalty", "solver"]:
            # Not numeric, so they cannot be stepped: keep the winning option.
            new_params[param_name] = [val]
            continue
        options = params[param_name]
        if len(options) == 1:
            # Already narrowed down: keep the value as is.
            new_params[param_name] = [val]
            continue
        best_param_index = options.index(val)
        if best_param_index == 2:
            # Best value sits at the top of the range: explore upwards with an
            # uneven step.
            step = val - options[best_param_index - 1]
            new_params[param_name] = [val, (val + (step / 2) + 1), (val + (step * 2))]
        elif best_param_index == 1:
            # Best value sits in the middle: halve the step around it.
            try:
                step = (options[best_param_index + 1] - val) / 2
            except IndexError:
                # Two-value grid: there is no upper neighbour, use the lower one.
                step = (val - options[best_param_index - 1]) / 2
            if step <= 1:
                new_params[param_name] = [val]
            else:
                new_params[param_name] = [i for i in [(val - step), val, (val + step)] if i > 0]
        elif best_param_index == 0:
            # Best value sits at the bottom of the range: explore downwards,
            # dropping candidates that are not strictly positive.
            step = options[best_param_index + 1] - val
            new_params[param_name] = [i for i in [abs(val - (step * 1.5)), abs(val - step), val] if i > 0]
        else:
            # Grids longer than three values have no refinement rule; pin the
            # best value rather than silently dropping the parameter.
            new_params[param_name] = [val]

        if param_name == "max_iter":
            # max_iter must be an integer. Deduplicate after truncation and
            # sort, because the next round relies on ascending order and a
            # plain set has no defined order.
            new_params[param_name] = sorted({int(i) for i in new_params[param_name]})

    return new_params

# max_iter is listed in ascending order: tree_train treats index 0 as the low
# end and index 2 as the high end when it computes the next step.
param_grid = {'C': [1, 10, 100], 'max_iter': [10000, 12375, 13562], 'penalty': ['l1'], 'solver': ['liblinear', "saga"]}

# Keep refining until every hyperparameter has been narrowed to a single value.
while any(len(options) > 1 for options in param_grid.values()):
    print(f"Current params: {param_grid}")
    param_grid = tree_train(params=param_grid)