In [19]:
# !npm install -g localtunnel
# !pip install optuna==2.10.0 numpyencoder==0.3.0 -q


In [1]:
#!pip install mlflow==1.23.1 -q
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, precision_recall_fscore_support
import random
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import json
from collections import Counter
from sklearn.model_selection import train_test_split
import nlpaug.augmenter.word as naw
from sklearn.feature_extraction.text import TfidfVectorizer
import optuna

In [2]:
from argparse import Namespace
import mlflow
from pathlib import Path


In [3]:
# Specify arguments
args = Namespace(
    lower=True,
    stem=False,
    analyzer="char",
    ngram_max_range=7,
    alpha=1e-4,
    learning_rate=1e-1,
    power_t=0.1,
    num_epochs=100
)


In [4]:
# Set tracking URI
MODEL_REGISTRY = Path("experiments")
Path(MODEL_REGISTRY).mkdir(exist_ok=True) # create experiments dir
mlflow.set_tracking_uri("file://" + str(MODEL_REGISTRY.absolute()))


In [5]:
def set_seeds(seed=42):
    """Set seeds for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)
    
nltk.download("stopwords")
# stopwords are word which are generally filtered out, because they don't cary much information.
STOPWORDS = stopwords.words("english")
# lemmatization is normalization process which converts every word to its base root mode
stemmer = PorterStemmer()

def clean_text(text, lower=True, stem=False, stopwords=STOPWORDS):
    """Clean raw text."""
    # Lower
    if lower:
        text = text.lower()

    # Remove stopwords
    if len(stopwords):
        pattern = re.compile(r'\b(' + r"|".join(stopwords) + r")\b\s*")
        text = pattern.sub('', text)

    # Spacing and filters
    text = re.sub(
        r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text
    )  # add spacing between objects to be filtered
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()  # strip white space at the ends

    # Remove links
    text = re.sub(r"http\S+", "", text)

    # Stemming
    if stem:
        text = " ".join([stemmer.stem(word, to_lowercase=lower) for word in text.split(" ")])

    return text

# Accepted tags (external constraint)
ACCEPTED_TAGS = ["natural-language-processing", "computer-vision", "mlops", "graph-learning"]


def preprocess(df, lower, stem, min_freq):
    """Preprocess the data."""
    df["text"] = df.title + " " + df.description  # feature engineering
    df.text = df.text.apply(clean_text, lower=lower, stem=stem)  # clean text

    # Replace OOS tags with `other`
    oos_tags = [item for item in df.tag.unique() if item not in ACCEPTED_TAGS]
    df.tag = df.tag.apply(lambda x: "other" if x in oos_tags else x)

    # Replace tags below min_freq with `other`
    tags = Counter(df.tag.values)
    tags_above_freq = Counter(tag for tag in tags.elements()
                            if (tags[tag] >= min_freq))
    df.tag = df.tag.apply(lambda tag: tag if tag in tags_above_freq else None)
    df.tag = df.tag.fillna("other")

    return df

def get_data_splits(X, y, train_size=0.7):
    """Generate balanced data splits."""
    X_train, X_, y_train, y_ = train_test_split(
        X, y, train_size=train_size, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(
        X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test

class LabelEncoder(object):
    """Encode labels into unique indices"""
    def __init__(self, class_to_index={}):
        self.class_to_index = class_to_index or {}  # mutable defaults ;)
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        classes = np.unique(y)
        for i, class_ in enumerate(classes):
            self.class_to_index[class_] = i
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self

    def encode(self, y):
        encoded = np.zeros((len(y)), dtype=int)
        for i, item in enumerate(y):
            encoded[i] = self.class_to_index[item]
        return encoded

    def decode(self, y):
        classes = []
        for i, item in enumerate(y):
            classes.append(self.index_to_class[item])
        return classes
#  Do not understand
    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {"class_to_index": self.class_to_index}
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)
    
def custom_predict(y_prob, threshold, index):
    """Custom predict function that defaults
    to an index if conditions are not met."""
    y_pred = [np.argmax(p) if max(p) > threshold else index for p in y_prob]
    return np.array(y_pred)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/princychahal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def train(args, trial=None):
    """Train model on data."""

    # Setup
    set_seeds()
    df = pd.read_csv("labeled_projects.csv")
    df = df.sample(frac=1).reset_index(drop=True)
    df = preprocess(df, lower=True, stem=False, min_freq=75)
    label_encoder = LabelEncoder().fit(df.tag)
    X_train, X_val, X_test, y_train, y_val, y_test = \
        get_data_splits(X=df.text.to_numpy(), y=label_encoder.encode(df.tag))

    # Tf-idf
    vectorizer = TfidfVectorizer(analyzer=args.analyzer, ngram_range=(2,args.ngram_max_range))  # char n-grams
    X_train = vectorizer.fit_transform(X_train)
    X_val = vectorizer.transform(X_val)
    X_test = vectorizer.transform(X_test)

    # Oversample
#     oversample = RandomOverSampler(sampling_strategy="all")
#     X_over, y_over = oversample.fit_resample(X_train, y_train)

    # Model
    model = SGDClassifier(
        loss="log", penalty="l2", alpha=args.alpha, max_iter=1,
        learning_rate="constant", eta0=args.learning_rate, power_t=args.power_t,
        warm_start=True)

    # Training
    for epoch in range(args.num_epochs):
        model.fit(X_train, y_train)
        train_loss = log_loss(y_train, model.predict_proba(X_train))
        val_loss = log_loss(y_val, model.predict_proba(X_val))
        if not epoch%10:
            print(
                f"Epoch: {epoch:02d} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}"
            )

        # Log
        if not trial:
            mlflow.log_metrics({"train_loss": train_loss, "val_loss": val_loss}, step=epoch)

        # Pruning (for optimization in next section)
        if trial:
            trial.report(val_loss, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

    # Threshold
    y_pred = model.predict(X_val)
    y_prob = model.predict_proba(X_val)
    args.threshold = np.quantile(
        [y_prob[i][j] for i, j in enumerate(y_pred)], q=0.25)  # Q1

    # Evaluation
    other_index = label_encoder.class_to_index["other"]
    y_prob = model.predict_proba(X_test)
    y_pred = custom_predict(y_prob=y_prob, threshold=args.threshold, index=other_index)
    metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
    performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
    print (json.dumps(performance, indent=2))

    return {
        "args": args,
        "label_encoder": label_encoder,
        "vectorizer": vectorizer,
        "model": model,
        "performance": performance
    }


In [7]:
##Tracking

In [10]:
import joblib
import tempfile


In [11]:
# Set experiment
mlflow.set_experiment(experiment_name="baselines")


<Experiment: artifact_location='file:///Users/princychahal/Documents/github/mlops/experiments/0', experiment_id='0', lifecycle_stage='active', name='baselines', tags={}>

In [12]:
def save_dict(d, filepath):
    """Save dict to a json file."""
    with open(filepath, "w") as fp:
        json.dump(d, indent=2, sort_keys=False, fp=fp)


In [10]:
# Tracking
with mlflow.start_run(run_name="sgd"):

    # Train & evaluate
    artifacts = train(args=args,trial = 1)

    # Log key metrics
    mlflow.log_metrics({"precision": artifacts["performance"]["precision"]})
    mlflow.log_metrics({"recall": artifacts["performance"]["recall"]})
    mlflow.log_metrics({"f1": artifacts["performance"]["f1"]})

    # Log artifacts
    with tempfile.TemporaryDirectory() as dp:
        artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
        joblib.dump(artifacts["vectorizer"], Path(dp, "vectorizer.pkl"))
        joblib.dump(artifacts["model"], Path(dp, "model.pkl"))
        save_dict(artifacts["performance"], Path(dp, "performance.json"))
        mlflow.log_artifacts(dp)

    # Log parameters
    mlflow.log_params(vars(artifacts["args"]))




Epoch: 00 | train_loss: 1.18299, val_loss: 1.20148
Epoch: 10 | train_loss: 0.54027, val_loss: 0.67864




Epoch: 20 | train_loss: 0.37319, val_loss: 0.55959
Epoch: 30 | train_loss: 0.29271, val_loss: 0.50606




Epoch: 40 | train_loss: 0.24549, val_loss: 0.47539
Epoch: 50 | train_loss: 0.21513, val_loss: 0.45581
Epoch: 60 | train_loss: 0.19445, val_loss: 0.44276




Epoch: 70 | train_loss: 0.17978, val_loss: 0.43350
Epoch: 80 | train_loss: 0.16901, val_loss: 0.42670




Epoch: 90 | train_loss: 0.16055, val_loss: 0.42135
{
  "precision": 0.9116161616161617,
  "recall": 0.7569444444444444,
  "f1": 0.7917810016494227
}


In [11]:
def load_dict(filepath):
    """Load a dict from a json file."""
    with open(filepath, "r") as fp:
        d = json.load(fp)
    return d


In [18]:
# Load all runs from experiment
experiment_id = mlflow.get_experiment_by_name("baselines").experiment_id
all_runs = mlflow.search_runs(experiment_ids=experiment_id, order_by=["metrics.val_loss ASC"])
print (all_runs)
print(all_runs['artifact_uri'][0])

                             run_id experiment_id    status  \
0  dae8bef1167342128f2e65e6979e292c             0  FINISHED   
1  7923ecd9c75e4ea9abc6d132108af3b6             0  FINISHED   
2  6734388052354e19a99b079834b02bb1             0  FINISHED   

                                        artifact_uri  \
0  file:///Users/princychahal/Documents/github/ml...   
1  file:///Users/princychahal/Documents/github/ml...   
2  file:///Users/princychahal/Documents/github/ml...   

                        start_time                         end_time  \
0 2022-12-08 19:53:56.644000+00:00 2022-12-08 19:53:58.482000+00:00   
1 2022-12-08 19:08:11.808000+00:00 2022-12-08 19:08:13.682000+00:00   
2 2022-12-08 18:59:30.392000+00:00 2022-12-08 18:59:32.255000+00:00   

   metrics.val_loss  metrics.train_loss  metrics.precision  metrics.recall  \
0          0.417754            0.154866           0.911616        0.756944   
1          0.417754            0.154866           0.911616        0.756944   
2  

In [13]:
# Best run
best_run_id = all_runs.iloc[0].run_id
best_run = mlflow.get_run(run_id=best_run_id)
client = mlflow.tracking.MlflowClient()
with tempfile.TemporaryDirectory() as dp:
    client.download_artifacts(run_id=best_run_id, path="", dst_path=dp)
    vectorizer = joblib.load(Path(dp, "vectorizer.pkl"))
    label_encoder = LabelEncoder.load(fp=Path(dp, "label_encoder.json"))
    model = joblib.load(Path(dp, "model.pkl"))
    performance = load_dict(filepath=Path(dp, "performance.json"))


In [14]:
print (json.dumps(performance, indent=2))


{
  "precision": 0.9116161616161617,
  "recall": 0.7569444444444444,
  "f1": 0.7917810016494227
}


In [18]:
from numpyencoder import NumpyEncoder
from optuna.integration.mlflow import MLflowCallback


In [19]:
# for optimization
def objective(args, trial):
    """Objective function for optimization trials."""
    # Parameters to tune
    args.analyzer = trial.suggest_categorical("analyzer", ["word", "char", "char_wb"])
    args.ngram_max_range = trial.suggest_int("ngram_max_range", 3, 10)
    args.learning_rate = trial.suggest_loguniform("learning_rate", 1e-2, 1e0)
    args.power_t = trial.suggest_uniform("power_t", 0.1, 0.5)

    # Train & evaluate
    artifacts = train(args=args, trial=trial)

    # Set additional attributes
    performance = artifacts["performance"]
    print(json.dumps(performance, indent=2))
    trial.set_user_attr("precision", performance["precision"])
    trial.set_user_attr("recall", performance["recall"])
    trial.set_user_attr("f1", performance["f1"])

    return performance["f1"]


In [20]:
NUM_TRIALS = 20
# Optimize
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
study = optuna.create_study(study_name="optimization", direction="maximize", pruner=pruner)
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(), metric_name="f1")
study.optimize(lambda trial: objective(args, trial),
            n_trials=NUM_TRIALS,
            callbacks=[mlflow_callback])

[32m[I 2022-12-08 15:57:45,084][0m A new study created in memory with name: optimization[0m
  mlflow_callback = MLflowCallback(


Epoch: 00 | train_loss: 1.36600, val_loss: 1.36697
Epoch: 10 | train_loss: 1.18279, val_loss: 1.19557




Epoch: 20 | train_loss: 1.04319, val_loss: 1.07195
Epoch: 30 | train_loss: 0.93640, val_loss: 0.98176




Epoch: 40 | train_loss: 0.85202, val_loss: 0.91300
Epoch: 50 | train_loss: 0.78344, val_loss: 0.85865




Epoch: 60 | train_loss: 0.72641, val_loss: 0.81452
Epoch: 70 | train_loss: 0.67811, val_loss: 0.77790




Epoch: 80 | train_loss: 0.63659, val_loss: 0.74700
Epoch: 90 | train_loss: 0.60042, val_loss: 0.72055


[32m[I 2022-12-08 15:57:47,528][0m Trial 0 finished with value: 0.7658999612930025 and parameters: {'analyzer': 'char', 'ngram_max_range': 10, 'learning_rate': 0.011504911760670478, 'power_t': 0.3945248225795931}. Best is trial 0 with value: 0.7658999612930025.[0m


{
  "precision": 0.8588441477653365,
  "recall": 0.7569444444444444,
  "f1": 0.7658999612930025
}
{
  "precision": 0.8588441477653365,
  "recall": 0.7569444444444444,
  "f1": 0.7658999612930025
}




Epoch: 00 | train_loss: 1.25415, val_loss: 1.26254
Epoch: 10 | train_loss: 0.66705, val_loss: 0.77143




Epoch: 20 | train_loss: 0.47534, val_loss: 0.63235
Epoch: 30 | train_loss: 0.37597, val_loss: 0.56547




Epoch: 40 | train_loss: 0.31478, val_loss: 0.52589




Epoch: 50 | train_loss: 0.27374, val_loss: 0.49981
Epoch: 60 | train_loss: 0.24466, val_loss: 0.48169




Epoch: 70 | train_loss: 0.22324, val_loss: 0.46838
Epoch: 80 | train_loss: 0.20698, val_loss: 0.45831


[32m[I 2022-12-08 15:57:49,615][0m Trial 1 finished with value: 0.7766752854988149 and parameters: {'analyzer': 'char', 'ngram_max_range': 9, 'learning_rate': 0.07295435719252887, 'power_t': 0.22755187491953419}. Best is trial 1 with value: 0.7766752854988149.[0m


Epoch: 90 | train_loss: 0.19414, val_loss: 0.45037
{
  "precision": 0.898955773955774,
  "recall": 0.7430555555555556,
  "f1": 0.7766752854988149
}
{
  "precision": 0.898955773955774,
  "recall": 0.7430555555555556,
  "f1": 0.7766752854988149
}




Epoch: 00 | train_loss: 1.11503, val_loss: 1.14374
Epoch: 10 | train_loss: 0.54408, val_loss: 0.66651
Epoch: 20 | train_loss: 0.39522, val_loss: 0.55362
Epoch: 30 | train_loss: 0.32137, val_loss: 0.50396
Epoch: 40 | train_loss: 0.27494, val_loss: 0.47453
Epoch: 50 | train_loss: 0.24329, val_loss: 0.45522


[32m[I 2022-12-08 15:57:50,121][0m Trial 2 finished with value: 0.7787553070767359 and parameters: {'analyzer': 'char', 'ngram_max_range': 3, 'learning_rate': 0.08140740713591195, 'power_t': 0.18317544750952433}. Best is trial 2 with value: 0.7787553070767359.[0m


Epoch: 60 | train_loss: 0.22109, val_loss: 0.44269
Epoch: 70 | train_loss: 0.20480, val_loss: 0.43371
Epoch: 80 | train_loss: 0.19216, val_loss: 0.42657
Epoch: 90 | train_loss: 0.18119, val_loss: 0.42045
{
  "precision": 0.8780590314203761,
  "recall": 0.75,
  "f1": 0.7787553070767359
}
{
  "precision": 0.8780590314203761,
  "recall": 0.75,
  "f1": 0.7787553070767359
}




Epoch: 00 | train_loss: 0.89749, val_loss: 0.95693
Epoch: 10 | train_loss: 0.26548, val_loss: 0.48807
Epoch: 20 | train_loss: 0.18208, val_loss: 0.43471




Epoch: 30 | train_loss: 0.15429, val_loss: 0.41796
Epoch: 40 | train_loss: 0.14141, val_loss: 0.40988
Epoch: 50 | train_loss: 0.13485, val_loss: 0.40554




Epoch: 60 | train_loss: 0.13133, val_loss: 0.40366
Epoch: 70 | train_loss: 0.12947, val_loss: 0.40256
Epoch: 80 | train_loss: 0.12844, val_loss: 0.40188




[32m[I 2022-12-08 15:57:51,497][0m Trial 3 finished with value: 0.7970605944290153 and parameters: {'analyzer': 'char', 'ngram_max_range': 7, 'learning_rate': 0.32165021458591214, 'power_t': 0.21789198503884}. Best is trial 3 with value: 0.7970605944290153.[0m


Epoch: 90 | train_loss: 0.12670, val_loss: 0.40082
{
  "precision": 0.9125514403292182,
  "recall": 0.7638888888888888,
  "f1": 0.7970605944290153
}
{
  "precision": 0.9125514403292182,
  "recall": 0.7638888888888888,
  "f1": 0.7970605944290153
}




Epoch: 00 | train_loss: 1.10769, val_loss: 1.13940
Epoch: 10 | train_loss: 0.46478, val_loss: 0.61873
Epoch: 20 | train_loss: 0.31908, val_loss: 0.51465
Epoch: 30 | train_loss: 0.25179, val_loss: 0.47015




Epoch: 40 | train_loss: 0.21290, val_loss: 0.44493
Epoch: 50 | train_loss: 0.18837, val_loss: 0.42914
Epoch: 60 | train_loss: 0.17205, val_loss: 0.41902
Epoch: 70 | train_loss: 0.16079, val_loss: 0.41204


[32m[I 2022-12-08 15:57:52,367][0m Trial 4 finished with value: 0.8134950981677184 and parameters: {'analyzer': 'char', 'ngram_max_range': 5, 'learning_rate': 0.1181034228082681, 'power_t': 0.46754476893544983}. Best is trial 4 with value: 0.8134950981677184.[0m


Epoch: 80 | train_loss: 0.15265, val_loss: 0.40697
Epoch: 90 | train_loss: 0.14604, val_loss: 0.40284
{
  "precision": 0.9155773420479303,
  "recall": 0.7847222222222222,
  "f1": 0.8134950981677184
}
{
  "precision": 0.9155773420479303,
  "recall": 0.7847222222222222,
  "f1": 0.8134950981677184
}




Epoch: 00 | train_loss: 1.34600, val_loss: 1.34793
Epoch: 10 | train_loss: 1.03180, val_loss: 1.06216




Epoch: 20 | train_loss: 0.84522, val_loss: 0.90755
Epoch: 30 | train_loss: 0.72196, val_loss: 0.81114




Epoch: 40 | train_loss: 0.63346, val_loss: 0.74472
Epoch: 50 | train_loss: 0.56634, val_loss: 0.69601




Epoch: 60 | train_loss: 0.51346, val_loss: 0.65879
Epoch: 70 | train_loss: 0.47060, val_loss: 0.62938




Epoch: 80 | train_loss: 0.43514, val_loss: 0.60556
Epoch: 90 | train_loss: 0.40521, val_loss: 0.58580


[32m[I 2022-12-08 15:57:54,759][0m Trial 5 finished with value: 0.7888911795247694 and parameters: {'analyzer': 'char', 'ngram_max_range': 10, 'learning_rate': 0.022930706751417954, 'power_t': 0.38370932931545365}. Best is trial 4 with value: 0.8134950981677184.[0m


{
  "precision": 0.8788973922902493,
  "recall": 0.7638888888888888,
  "f1": 0.7888911795247694
}
{
  "precision": 0.8788973922902493,
  "recall": 0.7638888888888888,
  "f1": 0.7888911795247694
}




Epoch: 00 | train_loss: 1.09245, val_loss: 1.12315
Epoch: 10 | train_loss: 0.45138, val_loss: 0.59328
Epoch: 20 | train_loss: 0.31188, val_loss: 0.49524
Epoch: 30 | train_loss: 0.24763, val_loss: 0.45401




Epoch: 40 | train_loss: 0.21051, val_loss: 0.43089
Epoch: 50 | train_loss: 0.18697, val_loss: 0.41632
Epoch: 60 | train_loss: 0.17130, val_loss: 0.40703
Epoch: 70 | train_loss: 0.16056, val_loss: 0.40067




[32m[I 2022-12-08 15:57:55,726][0m Trial 6 finished with value: 0.8184118875913987 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 8, 'learning_rate': 0.12111490505257488, 'power_t': 0.14309669740156716}. Best is trial 6 with value: 0.8184118875913987.[0m


Epoch: 80 | train_loss: 0.15268, val_loss: 0.39599
Epoch: 90 | train_loss: 0.14627, val_loss: 0.39210
{
  "precision": 0.9064725783475782,
  "recall": 0.7916666666666666,
  "f1": 0.8184118875913987
}
{
  "precision": 0.9064725783475782,
  "recall": 0.7916666666666666,
  "f1": 0.8184118875913987
}




Epoch: 00 | train_loss: 0.91122, val_loss: 0.97013
Epoch: 10 | train_loss: 0.33448, val_loss: 0.51158
Epoch: 20 | train_loss: 0.23372, val_loss: 0.44968
Epoch: 30 | train_loss: 0.19403, val_loss: 0.42864
Epoch: 40 | train_loss: 0.17151, val_loss: 0.41653
Epoch: 50 | train_loss: 0.15705, val_loss: 0.40810


[32m[I 2022-12-08 15:57:56,250][0m Trial 7 finished with value: 0.7963120908382346 and parameters: {'analyzer': 'char', 'ngram_max_range': 3, 'learning_rate': 0.20897976578372726, 'power_t': 0.3386457261362341}. Best is trial 6 with value: 0.8184118875913987.[0m


Epoch: 60 | train_loss: 0.14949, val_loss: 0.40500
Epoch: 70 | train_loss: 0.14478, val_loss: 0.40305
Epoch: 80 | train_loss: 0.14116, val_loss: 0.40021
Epoch: 90 | train_loss: 0.13671, val_loss: 0.39805
{
  "precision": 0.8817861611611613,
  "recall": 0.7708333333333334,
  "f1": 0.7963120908382346
}
{
  "precision": 0.8817861611611613,
  "recall": 0.7708333333333334,
  "f1": 0.7963120908382346
}


[32m[I 2022-12-08 15:57:56,418][0m Trial 8 pruned. [0m


Epoch: 00 | train_loss: 0.54718, val_loss: 0.64763




Epoch: 00 | train_loss: 1.10578, val_loss: 1.12858
Epoch: 10 | train_loss: 0.41741, val_loss: 0.59280




Epoch: 20 | train_loss: 0.27766, val_loss: 0.50226
Epoch: 30 | train_loss: 0.21880, val_loss: 0.46579




Epoch: 40 | train_loss: 0.18758, val_loss: 0.44646
Epoch: 50 | train_loss: 0.16925, val_loss: 0.43504




Epoch: 60 | train_loss: 0.15774, val_loss: 0.42816
Epoch: 70 | train_loss: 0.15025, val_loss: 0.42365




Epoch: 80 | train_loss: 0.14521, val_loss: 0.42062
Epoch: 90 | train_loss: 0.14126, val_loss: 0.41825


[32m[I 2022-12-08 15:57:58,407][0m Trial 9 finished with value: 0.7864446715494237 and parameters: {'analyzer': 'char', 'ngram_max_range': 9, 'learning_rate': 0.17169756349104826, 'power_t': 0.4208365017560811}. Best is trial 6 with value: 0.8184118875913987.[0m


{
  "precision": 0.9107142857142857,
  "recall": 0.75,
  "f1": 0.7864446715494237
}
{
  "precision": 0.9107142857142857,
  "recall": 0.75,
  "f1": 0.7864446715494237
}
Epoch: 00 | train_loss: 1.36642, val_loss: 1.37222
Epoch: 10 | train_loss: 1.18176, val_loss: 1.24579




Epoch: 20 | train_loss: 1.03341, val_loss: 1.15048
Epoch: 30 | train_loss: 0.91589, val_loss: 1.07889
Epoch: 40 | train_loss: 0.82128, val_loss: 1.02373
Epoch: 50 | train_loss: 0.74386, val_loss: 0.98024
Epoch: 60 | train_loss: 0.67962, val_loss: 0.94528
Epoch: 70 | train_loss: 0.62569, val_loss: 0.91669




[32m[I 2022-12-08 15:57:58,964][0m Trial 10 finished with value: 0.730200194864454 and parameters: {'analyzer': 'word', 'ngram_max_range': 7, 'learning_rate': 0.028387821220702193, 'power_t': 0.1099041304211229}. Best is trial 6 with value: 0.8184118875913987.[0m


Epoch: 80 | train_loss: 0.57997, val_loss: 0.89295
Epoch: 90 | train_loss: 0.54082, val_loss: 0.87296
{
  "precision": 0.7953998729121278,
  "recall": 0.7083333333333334,
  "f1": 0.730200194864454
}
{
  "precision": 0.7953998729121278,
  "recall": 0.7083333333333334,
  "f1": 0.730200194864454
}




Epoch: 00 | train_loss: 1.11836, val_loss: 1.14563
Epoch: 10 | train_loss: 0.49859, val_loss: 0.62260
Epoch: 20 | train_loss: 0.35129, val_loss: 0.51185
Epoch: 30 | train_loss: 0.28039, val_loss: 0.46360




Epoch: 40 | train_loss: 0.23767, val_loss: 0.43578
Epoch: 50 | train_loss: 0.20959, val_loss: 0.41785
Epoch: 60 | train_loss: 0.19030, val_loss: 0.40608
Epoch: 70 | train_loss: 0.17658, val_loss: 0.39779


[32m[I 2022-12-08 15:57:59,720][0m Trial 11 finished with value: 0.8367377522223858 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 5, 'learning_rate': 0.0945227268767435, 'power_t': 0.4623076563782805}. Best is trial 11 with value: 0.8367377522223858.[0m


Epoch: 80 | train_loss: 0.16621, val_loss: 0.39150
Epoch: 90 | train_loss: 0.15764, val_loss: 0.38625
{
  "precision": 0.9103780864197533,
  "recall": 0.8125,
  "f1": 0.8367377522223858
}
{
  "precision": 0.9103780864197533,
  "recall": 0.8125,
  "f1": 0.8367377522223858
}




Epoch: 00 | train_loss: 1.24709, val_loss: 1.25959
Epoch: 10 | train_loss: 0.71643, val_loss: 0.80248
Epoch: 20 | train_loss: 0.53907, val_loss: 0.65504
Epoch: 30 | train_loss: 0.44230, val_loss: 0.57896




Epoch: 40 | train_loss: 0.37934, val_loss: 0.53206
Epoch: 50 | train_loss: 0.33469, val_loss: 0.50016
Epoch: 60 | train_loss: 0.30143, val_loss: 0.47741
Epoch: 70 | train_loss: 0.27571, val_loss: 0.46024


[32m[I 2022-12-08 15:58:00,478][0m Trial 12 finished with value: 0.8124846987019413 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 5, 'learning_rate': 0.0425405786625699, 'power_t': 0.28719700844713314}. Best is trial 11 with value: 0.8367377522223858.[0m


Epoch: 80 | train_loss: 0.25517, val_loss: 0.44681
Epoch: 90 | train_loss: 0.23806, val_loss: 0.43575
{
  "precision": 0.9155773420479303,
  "recall": 0.7847222222222222,
  "f1": 0.8124846987019413
}
{
  "precision": 0.9155773420479303,
  "recall": 0.7847222222222222,
  "f1": 0.8124846987019413
}




Epoch: 00 | train_loss: 0.71560, val_loss: 0.79602
Epoch: 10 | train_loss: 0.20656, val_loss: 0.41589
Epoch: 20 | train_loss: 0.15119, val_loss: 0.38227
Epoch: 30 | train_loss: 0.13664, val_loss: 0.37523




Epoch: 40 | train_loss: 0.12974, val_loss: 0.37127
Epoch: 50 | train_loss: 0.12510, val_loss: 0.36738
Epoch: 60 | train_loss: 0.12382, val_loss: 0.36738
Epoch: 70 | train_loss: 0.12363, val_loss: 0.36749


[32m[I 2022-12-08 15:58:01,234][0m Trial 13 finished with value: 0.8307455962302298 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 5, 'learning_rate': 0.4347989357598529, 'power_t': 0.48110305219548255}. Best is trial 11 with value: 0.8367377522223858.[0m


Epoch: 80 | train_loss: 0.12310, val_loss: 0.36622
Epoch: 90 | train_loss: 0.12002, val_loss: 0.36451
{
  "precision": 0.9091032608695654,
  "recall": 0.8055555555555556,
  "f1": 0.8307455962302298
}
{
  "precision": 0.9091032608695654,
  "recall": 0.8055555555555556,
  "f1": 0.8307455962302298
}


[32m[I 2022-12-08 15:58:01,502][0m Trial 14 pruned. [0m


Epoch: 00 | train_loss: 0.50842, val_loss: 0.61996




Epoch: 00 | train_loss: 0.79356, val_loss: 0.86378
Epoch: 10 | train_loss: 0.23726, val_loss: 0.43902
Epoch: 20 | train_loss: 0.16744, val_loss: 0.39561
Epoch: 30 | train_loss: 0.14543, val_loss: 0.38346




Epoch: 40 | train_loss: 0.13487, val_loss: 0.37718
Epoch: 50 | train_loss: 0.12887, val_loss: 0.37296
Epoch: 60 | train_loss: 0.12615, val_loss: 0.37174
Epoch: 70 | train_loss: 0.12514, val_loss: 0.37128


[32m[I 2022-12-08 15:58:02,362][0m Trial 15 finished with value: 0.8196319317597404 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 6, 'learning_rate': 0.3473228570006808, 'power_t': 0.4943306441617931}. Best is trial 11 with value: 0.8367377522223858.[0m


Epoch: 80 | train_loss: 0.12434, val_loss: 0.37037
Epoch: 90 | train_loss: 0.12185, val_loss: 0.36882
{
  "precision": 0.9064725783475782,
  "recall": 0.7916666666666666,
  "f1": 0.8196319317597404
}
{
  "precision": 0.9064725783475782,
  "recall": 0.7916666666666666,
  "f1": 0.8196319317597404
}




Epoch: 00 | train_loss: 1.06154, val_loss: 1.14881
Epoch: 10 | train_loss: 0.31945, val_loss: 0.73341
Epoch: 20 | train_loss: 0.22449, val_loss: 0.68764
Epoch: 30 | train_loss: 0.19573, val_loss: 0.67368
Epoch: 40 | train_loss: 0.18432, val_loss: 0.66808
Epoch: 50 | train_loss: 0.17938, val_loss: 0.66562
Epoch: 60 | train_loss: 0.17693, val_loss: 0.66440


[32m[I 2022-12-08 15:58:02,808][0m Trial 16 finished with value: 0.7404967847306796 and parameters: {'analyzer': 'word', 'ngram_max_range': 4, 'learning_rate': 0.4773776244766768, 'power_t': 0.4409189459777905}. Best is trial 11 with value: 0.8367377522223858.[0m


Epoch: 70 | train_loss: 0.17561, val_loss: 0.66366
Epoch: 80 | train_loss: 0.17516, val_loss: 0.66335
Epoch: 90 | train_loss: 0.17430, val_loss: 0.66301
{
  "precision": 0.8043623737373736,
  "recall": 0.7152777777777778,
  "f1": 0.7404967847306796
}
{
  "precision": 0.8043623737373736,
  "recall": 0.7152777777777778,
  "f1": 0.7404967847306796
}




Epoch: 00 | train_loss: 1.24217, val_loss: 1.25545
Epoch: 10 | train_loss: 0.69321, val_loss: 0.78338
Epoch: 20 | train_loss: 0.51611, val_loss: 0.63854
Epoch: 30 | train_loss: 0.42083, val_loss: 0.56553




Epoch: 40 | train_loss: 0.35941, val_loss: 0.52101
Epoch: 50 | train_loss: 0.31621, val_loss: 0.49092
Epoch: 60 | train_loss: 0.28428, val_loss: 0.46958
Epoch: 70 | train_loss: 0.25981, val_loss: 0.45355


[32m[I 2022-12-08 15:58:03,652][0m Trial 17 finished with value: 0.819450469937542 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 6, 'learning_rate': 0.04752982219058805, 'power_t': 0.35048645527678524}. Best is trial 11 with value: 0.8367377522223858.[0m


Epoch: 80 | train_loss: 0.24041, val_loss: 0.44108
Epoch: 90 | train_loss: 0.22439, val_loss: 0.43086
{
  "precision": 0.9166666666666666,
  "recall": 0.7916666666666666,
  "f1": 0.819450469937542
}
{
  "precision": 0.9166666666666666,
  "recall": 0.7916666666666666,
  "f1": 0.819450469937542
}




Epoch: 00 | train_loss: 0.91277, val_loss: 0.96701
Epoch: 10 | train_loss: 0.30906, val_loss: 0.48249
Epoch: 20 | train_loss: 0.21168, val_loss: 0.41911
Epoch: 30 | train_loss: 0.17417, val_loss: 0.39687




Epoch: 40 | train_loss: 0.15421, val_loss: 0.38483
Epoch: 50 | train_loss: 0.14250, val_loss: 0.37744
Epoch: 60 | train_loss: 0.13585, val_loss: 0.37383
Epoch: 70 | train_loss: 0.13214, val_loss: 0.37187


[32m[I 2022-12-08 15:58:04,407][0m Trial 18 finished with value: 0.8255089551909044 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 5, 'learning_rate': 0.22210874470077396, 'power_t': 0.31090652198140145}. Best is trial 11 with value: 0.8367377522223858.[0m


Epoch: 80 | train_loss: 0.12954, val_loss: 0.37016
Epoch: 90 | train_loss: 0.12642, val_loss: 0.36825
{
  "precision": 0.9078826832151301,
  "recall": 0.7986111111111112,
  "f1": 0.8255089551909044
}
{
  "precision": 0.9078826832151301,
  "recall": 0.7986111111111112,
  "f1": 0.8255089551909044
}


[32m[I 2022-12-08 15:58:04,635][0m Trial 19 pruned. [0m


Epoch: 00 | train_loss: 0.62181, val_loss: 0.71409


In [21]:
# All trials
trials_df = study.trials_dataframe()
trials_df = trials_df.sort_values(["user_attrs_f1"], ascending=False)  # sort by metric
trials_df.head()


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_analyzer,params_learning_rate,params_ngram_max_range,params_power_t,user_attrs_f1,user_attrs_precision,user_attrs_recall,state
11,11,0.836738,2022-12-08 15:57:58.975212,2022-12-08 15:57:59.720662,0 days 00:00:00.745450,char_wb,0.094523,5,0.462308,0.836738,0.910378,0.8125,COMPLETE
13,13,0.830746,2022-12-08 15:58:00.489188,2022-12-08 15:58:01.234569,0 days 00:00:00.745381,char_wb,0.434799,5,0.481103,0.830746,0.909103,0.805556,COMPLETE
18,18,0.825509,2022-12-08 15:58:03.662937,2022-12-08 15:58:04.406709,0 days 00:00:00.743772,char_wb,0.222109,5,0.310907,0.825509,0.907883,0.798611,COMPLETE
15,15,0.819632,2022-12-08 15:58:01.512494,2022-12-08 15:58:02.361691,0 days 00:00:00.849197,char_wb,0.347323,6,0.494331,0.819632,0.906473,0.791667,COMPLETE
17,17,0.81945,2022-12-08 15:58:02.819491,2022-12-08 15:58:03.652069,0 days 00:00:00.832578,char_wb,0.04753,6,0.350486,0.81945,0.916667,0.791667,COMPLETE


In [22]:
# Best trial
print (f"Best value (f1): {study.best_trial.value}")
print (f"Best hyperparameters: {json.dumps(study.best_trial.params, indent=2)}")


Best value (f1): 0.8367377522223858
Best hyperparameters: {
  "analyzer": "char_wb",
  "ngram_max_range": 5,
  "learning_rate": 0.0945227268767435,
  "power_t": 0.4623076563782805
}


In [23]:
# Save best parameter values
args = {**args.__dict__, **study.best_trial.params}
print (json.dumps(args, indent=2, cls=NumpyEncoder))


{
  "lower": true,
  "stem": false,
  "analyzer": "char_wb",
  "ngram_max_range": 5,
  "alpha": 0.0001,
  "learning_rate": 0.0945227268767435,
  "power_t": 0.4623076563782805,
  "num_epochs": 100,
  "threshold": 0.7048208881556921
}
