In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
import mlflow
import numpy as np
import re
import pandas as pd
import pickle
import time

from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

In [3]:
tags = [
    'javascript',
    'python',
    'java',
    'c_sharp',
    'php',
    'android',
    'html',
    'jquery',
    'c_plus_plus',
    'css',
    'ios',
    'sql',
    'mysql',
    'r',
    'reactjs',
    'node_js',
    'arrays',
    'c',
    'asp_net',
    'json'
]

X_tf_idf = pd.read_csv("X_tf_idf.csv")
X_word2vec = pd.read_csv("X_word2vec.csv")
X_bert = pd.read_csv("X_bert.csv")
X_use = pd.read_csv("X_use.csv")

df = pd.read_csv("Question Extraction V05 year.csv")

y = df[tags]

title_vectorizer = pickle.load(open("title_vectorizer.sav", "rb"))
body_vectorizer = pickle.load(open("body_vectorizer.sav", "rb"))


# We need this function to evaluate data drift.
def clean_for_tf_idf(document):    
    TO_CONVERT = {
        'c#': 'csharp',
        'c++': 'cplusplus',
        'asp.net': 'aspdotnet',
        'node.js': 'nodejs'
    }
    
    def convert(string, to_convert=TO_CONVERT):
        for key, value in to_convert.items():
            string = string.replace(key, value)
        return string
    
    
    def remove_html_tags(string):
        html_tags = [
            '<blockquote>',
            '</blockquote>',
            '<br>',
            '</br>',
            '<code>',
            '</code>',
            '<em>',
            '</em>',
            '<p>',
            '</p>',
            '<pre>',
            '</pre>',
            '<strong>',
            '</strong>'
        ]
        for html_tag in html_tags:
            string = string.replace(html_tag, ' ')
        return string
    
    def keep_only_wanted_chars(string):
        string = re.sub(r'[^a-zA-Z\.,:;?!\']', ' ', string)
        return string
    
    TO_CONVERT_POST_TOKENIZATION = {
        'c': 'clanguage',
        'r': 'rlanguage'
    }
    
    def convert_post_tokenization(tokens, to_convert_post_tokenization=TO_CONVERT_POST_TOKENIZATION):
        for key, value in to_convert_post_tokenization.items():
            for i, token in enumerate(tokens):
                if token == key:
                    tokens[i] = value
        return tokens
    
    document = document.lower()
    document = convert(document)
    document = remove_html_tags(document)
    document = keep_only_wanted_chars(document)
    document = word_tokenize(document)
    document = convert_post_tokenization(document)
    document = ' '.join(document)
    return document


# We need this function to evaluate data drift.
def get_X_tf_idf(df, title_vectorizer, body_vectorizer):
    cleaned_title = df['Title'].apply(clean_for_tf_idf)
    title_tf_idf_features = title_vectorizer.transform(cleaned_title)
    
    title_tf_idf_features_df = pd.DataFrame(
        data=title_tf_idf_features.toarray(),
        columns=['title_' + feature_name for feature_name in title_vectorizer.get_feature_names_out()]
    )
    
    cleaned_body = df['Body'].apply(clean_for_tf_idf)
    body_tf_idf_features = body_vectorizer.transform(cleaned_body)

    body_tf_idf_features_df = pd.DataFrame(
        data=body_tf_idf_features.toarray(),
        columns=['body_' + feature_name for feature_name in body_vectorizer.get_feature_names_out()]
    )

    X_tf_idf = pd.concat(
        [
            title_tf_idf_features_df,
            body_tf_idf_features_df
        ],
        axis=1
    )
    
    return X_tf_idf

In [4]:
test_size = .2

X_tf_idf_train,   X_tf_idf_test,   _,       __     = train_test_split(X_tf_idf,   y, test_size=test_size, random_state=0)
X_word2vec_train, X_word2vec_test, _,       __     = train_test_split(X_word2vec, y, test_size=test_size, random_state=0)
X_bert_train,     X_bert_test,     _,       __     = train_test_split(X_bert,     y, test_size=test_size, random_state=0)
X_use_train,      X_use_test,      y_train, y_test = train_test_split(X_use,      y, test_size=test_size, random_state=0)

In [5]:
URI = "http://127.0.0.1:8080"

client = mlflow.MlflowClient(tracking_uri=URI)

# Provide an Experiment description that will appear in the UI
experiment_description = (
    """Tag prediction V22. Comparing embeddings: tf-idf, Word2Vec, BERT, USE.
    Comparing models: Logistic Regression, Random Forest, XGBoost.
    """
)

# Provide searchable tabs that define characteristics of the Runs
# that will be in this Experiment
experiment_tags = {
    "project_name": "stackoverflow_tagging",
    "mlflow.note.content": experiment_description
}

# Create the Experiment, providing a unique name
experiment_name = "tag_prediction_v22"
if not client.get_experiment_by_name(experiment_name):
    client.create_experiment(name=experiment_name, tags=experiment_tags)

mlflow.set_tracking_uri(URI)

# Sets the current active experiment and
# returns the Experiment metadata
experiment = mlflow.set_experiment(experiment_name)

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.

# Define an artifact path that the model will be saved to.
artifact_path = "tag_prediction_22_artifact_path"

In [6]:
for embeddings_name in ['tf_idf', 'word2vec', 'bert', 'use']:
    for model_name in ['logistic_regression', 'random_forest', 'xgboost']:
                
        run_name = embeddings_name + '_default_' + model_name
        
        params = {'embeddings_name': embeddings_name, 'model_name': model_name}

        if embeddings_name == 'tf_idf':
            X_train, X_test = X_tf_idf_train, X_tf_idf_test
        elif embeddings_name == 'word2vec':
            X_train, X_test = X_word2vec_train, X_word2vec_test
        elif embeddings_name == 'bert':
            X_train, X_test = X_bert_train, X_bert_test
        else:
            X_train, X_test = X_use_train, X_use_test

        if model_name == 'logistic_regression':
            model = OneVsRestClassifier(LogisticRegression(random_state=0))
        elif model_name == 'random_forest':
            model = RandomForestClassifier(random_state=0)
        else:
            model = XGBClassifier(random_state=0)
        
        fit_start = time.time()
        model.fit(X_train, y_train)
        fit_duration = time.time() - fit_start
        
        pred_start = time.time()
        y_pred = model.predict(X_test)
        pred_duration = time.time() - pred_start
        
        f1 = f1_score(y_test, y_pred, average='micro')
        precision = precision_score(y_test, y_pred, average='micro')
        recall = recall_score(y_test, y_pred, average='micro')
        
        metrics = {
            'fit_duration': fit_duration, 'pred_duration': pred_duration,
            'f1': f1, 'precision': precision, 'recall': recall
        }
        
        with mlflow.start_run(run_name=run_name) as run:
            mlflow.log_params(params)
            mlflow.log_metrics(metrics)
            mlflow.sklearn.log_model(
                sk_model=model, input_example=X_test, artifact_path=artifact_path
            )
        print(f"run name: {run_name}.")
        print(metrics)

run name: tf_idf_default_logistic_regression.
{'fit_duration': 29.998541116714478, 'pred_duration': 1.1015269756317139, 'f1': 0.696149843912591, 'precision': 0.9304589707927677, 'recall': 0.5561097256857855}




run name: tf_idf_default_random_forest.
{'fit_duration': 16.30429196357727, 'pred_duration': 0.33971571922302246, 'f1': 0.6949420442571128, 'precision': 0.9489208633093525, 'recall': 0.5482128013300083}




run name: tf_idf_default_xgboost.
{'fit_duration': 661.0124881267548, 'pred_duration': 0.25377511978149414, 'f1': 0.7944954128440368, 'precision': 0.8863868986693961, 'recall': 0.7198669991687449}




run name: word2vec_default_logistic_regression.
{'fit_duration': 2.271435022354126, 'pred_duration': 0.09805178642272949, 'f1': 0.49188072040153524, 'precision': 0.8491335372069317, 'recall': 0.3462177888611804}




run name: word2vec_default_random_forest.
{'fit_duration': 23.58149814605713, 'pred_duration': 0.2214362621307373, 'f1': 0.1029874213836478, 'precision': 0.9492753623188406, 'recall': 0.054447215295095594}




run name: word2vec_default_xgboost.
{'fit_duration': 158.26194214820862, 'pred_duration': 0.039849042892456055, 'f1': 0.4904420549581841, 'precision': 0.8715498938428875, 'recall': 0.3412302576891106}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


run name: bert_default_logistic_regression.
{'fit_duration': 4.847101926803589, 'pred_duration': 0.20867085456848145, 'f1': 0.7445023804126049, 'precision': 0.8189526184538654, 'recall': 0.6824605153782212}




run name: bert_default_random_forest.
{'fit_duration': 34.03657412528992, 'pred_duration': 0.2379012107849121, 'f1': 0.13779830638953042, 'precision': 0.9322916666666666, 'recall': 0.0743973399833749}




run name: bert_default_xgboost.
{'fit_duration': 334.00201988220215, 'pred_duration': 0.07483887672424316, 'f1': 0.6625129802699896, 'precision': 0.8824343015214384, 'recall': 0.5303408146300914}




run name: use_default_logistic_regression.
{'fit_duration': 8.170687913894653, 'pred_duration': 0.4488089084625244, 'f1': 0.6956077630234934, 'precision': 0.9019867549668874, 'recall': 0.5660847880299252}




run name: use_default_random_forest.
{'fit_duration': 58.13931584358215, 'pred_duration': 0.34099888801574707, 'f1': 0.0462287104622871, 'precision': 0.95, 'recall': 0.02369077306733167}




run name: use_default_xgboost.
{'fit_duration': 767.6096248626709, 'pred_duration': 0.13119292259216309, 'f1': 0.6347402597402597, 'precision': 0.9093023255813953, 'recall': 0.48753117206982544}


In [7]:
# Best model so far is tf_idf_default_xgboost.

# Model fine-tuning

In [8]:
# We want to fine-tune XGBoost but it takes too long to run as is,
# because X has 4,000 features. We want to decrease the number of features.
# For that we look at the most important features, thanks to the
# Logistic Regression model.
# For each tag, we keep the 20 features with highest coefficient.
# This will give us a smaller X.
# We'll then be able to fine-tune XGBoost on that smaller X.

In [10]:
def get_most_important_features(lr_model, X, tags=tags, n_features_per_tag=20):
    most_important_features = set()
    for i, tag in enumerate(tags):
        coefs = lr_model.estimators_[i].coef_[0]
        feature_importance_for_that_tag = pd.Series(
            data=coefs,
            index=X.columns
        )
        
        important_features_for_that_tag = set(feature_importance_for_that_tag.sort_values(ascending=False).head(n_features_per_tag).index)
        most_important_features = most_important_features.union(important_features_for_that_tag)
    most_important_features = list(most_important_features)
    return most_important_features

In [11]:
lr_model_path = 'mlflow-artifacts:/894393323650292179/46163d3705ac467f8a1b2681ddeddfc0/artifacts/tag_prediction_22_artifact_path'

In [12]:
lr_model = mlflow.sklearn.load_model(lr_model_path)

Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2024/05/03 18:56:57 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [13]:
most_important_features = get_most_important_features(
    lr_model=lr_model,
    X=X_tf_idf
)

In [14]:
X_tf_idf_20_train = X_tf_idf_train[most_important_features]
X_tf_idf_20_test = X_tf_idf_test[most_important_features]

In [16]:
for embeddings_name in ['tf_idf_top_20']:
    for model_name in ['xgboost']:
        for eta in [.1, .3]:
            for colsample_bytree in [.5, 1]:
                for max_depth in [5, 6, 7]:
                            
                    run_name = (
                        embeddings_name + '_' + model_name + '_eta_' + str(eta) +
                        '_colsample_bytree_' + str(colsample_bytree) + '_max_depth_' + str(max_depth)
                    )
                    
                    params = {
                        'embeddings_name': embeddings_name, 'model_name': model_name,
                        'eta': eta, 'colsample_bytree': colsample_bytree, 'max_depth': max_depth
                    }
            
                    X_train, X_test = X_tf_idf_20_train, X_tf_idf_20_test
            
                    model = XGBClassifier(
                        eta=eta, colsample_bytree=colsample_bytree, max_depth=max_depth,
                        random_state=0
                    )
                    
                    fit_start = time.time()
                    model.fit(X_train, y_train)
                    fit_duration = time.time() - fit_start
                    
                    pred_start = time.time()
                    y_pred = model.predict(X_test)
                    pred_duration = time.time() - pred_start
                    
                    f1 = f1_score(y_test, y_pred, average='micro')
                    precision = precision_score(y_test, y_pred, average='micro')
                    recall = recall_score(y_test, y_pred, average='micro')
                    
                    metrics = {
                        'fit_duration': fit_duration, 'pred_duration': pred_duration,
                        'f1': f1, 'precision': precision, 'recall': recall
                    }
                    
                    with mlflow.start_run(run_name=run_name) as run:
                        mlflow.log_params(params)
                        mlflow.log_metrics(metrics)
                        mlflow.sklearn.log_model(
                            sk_model=model, input_example=X_test, artifact_path=artifact_path
                        )

                    print(f"run name: {run_name}.")
                    print(metrics)



run name: tf_idf_top_20_xgboost_eta_0.1_colsample_bytree_0.5_max_depth_5.
{'fit_duration': 25.53102421760559, 'pred_duration': 0.039369821548461914, 'f1': 0.7944508670520231, 'precision': 0.8952579468473163, 'recall': 0.71404821280133}




run name: tf_idf_top_20_xgboost_eta_0.1_colsample_bytree_0.5_max_depth_6.
{'fit_duration': 32.167316913604736, 'pred_duration': 0.04053688049316406, 'f1': 0.7999080036798527, 'precision': 0.8954685890834192, 'recall': 0.7227763923524522}




run name: tf_idf_top_20_xgboost_eta_0.1_colsample_bytree_0.5_max_depth_7.
{'fit_duration': 35.078999757766724, 'pred_duration': 0.04546499252319336, 'f1': 0.800274223034735, 'precision': 0.8888324873096447, 'recall': 0.727763923524522}




run name: tf_idf_top_20_xgboost_eta_0.1_colsample_bytree_1_max_depth_5.
{'fit_duration': 48.62085318565369, 'pred_duration': 0.03552508354187012, 'f1': 0.7925653969710876, 'precision': 0.8847336065573771, 'recall': 0.7177888611803824}




run name: tf_idf_top_20_xgboost_eta_0.1_colsample_bytree_1_max_depth_6.
{'fit_duration': 58.51288199424744, 'pred_duration': 0.0395350456237793, 'f1': 0.7972540045766591, 'precision': 0.8869653767820774, 'recall': 0.7240232751454697}




run name: tf_idf_top_20_xgboost_eta_0.1_colsample_bytree_1_max_depth_7.
{'fit_duration': 68.07617998123169, 'pred_duration': 0.04442596435546875, 'f1': 0.797436484321355, 'precision': 0.8874172185430463, 'recall': 0.7240232751454697}




run name: tf_idf_top_20_xgboost_eta_0.3_colsample_bytree_0.5_max_depth_5.
{'fit_duration': 25.501012086868286, 'pred_duration': 0.03606700897216797, 'f1': 0.8048007246376813, 'precision': 0.8840796019900498, 'recall': 0.7385702410640067}




run name: tf_idf_top_20_xgboost_eta_0.3_colsample_bytree_0.5_max_depth_6.
{'fit_duration': 31.783084869384766, 'pred_duration': 0.04004812240600586, 'f1': 0.8004519774011299, 'precision': 0.8771669143140168, 'recall': 0.7360764754779717}




run name: tf_idf_top_20_xgboost_eta_0.3_colsample_bytree_0.5_max_depth_7.
{'fit_duration': 35.09420108795166, 'pred_duration': 0.04622626304626465, 'f1': 0.7963841807909605, 'precision': 0.8727092620108965, 'recall': 0.7323358270989194}




run name: tf_idf_top_20_xgboost_eta_0.3_colsample_bytree_1_max_depth_5.
{'fit_duration': 47.94222903251648, 'pred_duration': 0.03644704818725586, 'f1': 0.800996151233869, 'precision': 0.8796618597712581, 'recall': 0.7352452202826267}




run name: tf_idf_top_20_xgboost_eta_0.3_colsample_bytree_1_max_depth_6.
{'fit_duration': 59.43915104866028, 'pred_duration': 0.04077911376953125, 'f1': 0.7998186352301064, 'precision': 0.8798004987531172, 'recall': 0.7331670822942643}
run name: tf_idf_top_20_xgboost_eta_0.3_colsample_bytree_1_max_depth_7.
{'fit_duration': 66.74668788909912, 'pred_duration': 0.04399895668029785, 'f1': 0.797193300135808, 'precision': 0.8752485089463221, 'recall': 0.7319201995012469}




In [None]:
# Best model is tf_idf_top_20_xgboost_eta_0.3_colsample_bytree_0.5_max_depth_5
# with F1 0.8048

# Model for tag prediction app

In [17]:
# We're looking for a light, robust and high-performance model for the tag prediction app.

# Fine-tuning Logistic Regression on full tf_idf embeddings.

In [19]:
for embeddings_name in ['tf_idf']:
    for model_name in ['logistic_regression']:
        for penalty in ['l2', 'none']:
                            
                    run_name = (
                        embeddings_name + '_' + model_name + '_penalty_' + penalty
                    )
                    
                    params = {
                        'embeddings_name': embeddings_name, 'model_name': model_name,
                        'penalty': penalty
                    }
            
                    X_train, X_test = X_tf_idf_train, X_tf_idf_test
            
                    model = OneVsRestClassifier(LogisticRegression(penalty=penalty, random_state=0))
                    
                    fit_start = time.time()
                    model.fit(X_train, y_train)
                    fit_duration = time.time() - fit_start
                    
                    pred_start = time.time()
                    y_pred = model.predict(X_test)
                    pred_duration = time.time() - pred_start
                    
                    f1 = f1_score(y_test, y_pred, average='micro')
                    precision = precision_score(y_test, y_pred, average='micro')
                    recall = recall_score(y_test, y_pred, average='micro')
                    
                    metrics = {
                        'fit_duration': fit_duration, 'pred_duration': pred_duration,
                        'f1': f1, 'precision': precision, 'recall': recall
                    }
                    
                    with mlflow.start_run(run_name=run_name) as run:
                        mlflow.log_params(params)
                        mlflow.log_metrics(metrics)
                        mlflow.sklearn.log_model(
                            sk_model=model, input_example=X_test, artifact_path=artifact_path
                        )

                        print(f"run name: {run_name}.")
                    print(metrics)



run name: tf_idf_logistic_regression_penalty_l2.
{'fit_duration': 25.85276508331299, 'pred_duration': 1.003999948501587, 'f1': 0.696149843912591, 'precision': 0.9304589707927677, 'recall': 0.5561097256857855}




run name: tf_idf_logistic_regression_penalty_none.
{'fit_duration': 26.63776469230652, 'pred_duration': 1.0262012481689453, 'f1': 0.7937000887311446, 'precision': 0.8510941960038059, 'recall': 0.7435577722360764}


In [None]:
# Best Logistic Regression model is with no penalty
# with F1 0.7937.

In [20]:
best_lr_model_path = 'mlflow-artifacts:/894393323650292179/a4f4d7fe75ca484bbf28925bd1926cd0/artifacts/tag_prediction_22_artifact_path'

In [21]:
best_lr_model = mlflow.sklearn.load_model(best_lr_model_path)

Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2024/05/04 12:13:34 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [22]:
# Saving the model as Pickle file to use it in the tag prediction app.

In [23]:
pickle.dump(best_lr_model, open('tag_predictor.sav', "wb"))

# Evaluating model drift

In [33]:
year_range = range(2008, 2024)

year_X_y_dict = {}
for year in year_range:
    year_df = df.loc[df['year'] == year]
    
    year_X = get_X_tf_idf(
        year_df, title_vectorizer, body_vectorizer
    )
    year_y = year_df[tags]
    year_X_y_dict.update({year: {'X': year_X, 'y': year_y}})

In [34]:
for year, X_y in year_X_y_dict.items():
    print(f"year: {year}. n_rows_x: {len(X_y['X'])}. n_rows_y: {len(X_y['y'])}")

year: 2008. n_rows_x: 972. n_rows_y: 972
year: 2009. n_rows_x: 1659. n_rows_y: 1659
year: 2010. n_rows_x: 1386. n_rows_y: 1386
year: 2011. n_rows_x: 1353. n_rows_y: 1353
year: 2012. n_rows_x: 999. n_rows_y: 999
year: 2013. n_rows_x: 783. n_rows_y: 783
year: 2014. n_rows_x: 526. n_rows_y: 526
year: 2015. n_rows_x: 390. n_rows_y: 390
year: 2016. n_rows_x: 295. n_rows_y: 295
year: 2017. n_rows_x: 219. n_rows_y: 219
year: 2018. n_rows_x: 124. n_rows_y: 124
year: 2019. n_rows_x: 82. n_rows_y: 82
year: 2020. n_rows_x: 54. n_rows_y: 54
year: 2021. n_rows_x: 21. n_rows_y: 21
year: 2022. n_rows_x: 6. n_rows_y: 6
year: 2023. n_rows_x: 3. n_rows_y: 3


In [36]:
for embeddings_name in ['tf_idf']:
    for model_name in ['logistic_regression']:
        for year in year_range:
            run_name = (
                embeddings_name + '_best_' + model_name
                + '_year_' + str(year)
            )
            
            params = {
                'embeddings_name': embeddings_name, 'model_name': model_name,
                'year': year
            }
            
            X_test = year_X_y_dict[year]['X']
            y_test = year_X_y_dict[year]['y']
            
            model = best_lr_model
            
            pred_start = time.time()
            y_pred = model.predict(X_test)
            pred_duration = time.time() - pred_start
            
            f1 = f1_score(y_test, y_pred, average='micro')
            precision = precision_score(y_test, y_pred, average='micro')
            recall = recall_score(y_test, y_pred, average='micro')
            
            metrics = {
                'pred_duration': pred_duration,
                'f1': f1, 'precision': precision, 'recall': recall
            }
            
            with mlflow.start_run(run_name=run_name) as run:
                mlflow.log_params(params)
                mlflow.log_metrics(metrics)
                mlflow.sklearn.log_model(
                    sk_model=model, input_example=X_test, artifact_path=artifact_path
                )

            print(f"run name: {run_name}.")
            print(metrics)



run name: tf_idf_best_logistic_regression_year_2008.
{'pred_duration': 1.2129549980163574, 'f1': 0.9665703673132481, 'precision': 0.9758333333333333, 'recall': 0.9574816026165167}




run name: tf_idf_best_logistic_regression_year_2009.
{'pred_duration': 1.2447571754455566, 'f1': 0.9624236730859558, 'precision': 0.9724727100142383, 'recall': 0.9525801952580195}




run name: tf_idf_best_logistic_regression_year_2010.
{'pred_duration': 1.26112699508667, 'f1': 0.9621087314662274, 'precision': 0.9727928928373126, 'recall': 0.9516567083107007}




run name: tf_idf_best_logistic_regression_year_2011.
{'pred_duration': 1.2160162925720215, 'f1': 0.9596069868995633, 'precision': 0.9782971619365609, 'recall': 0.9416175682913765}




run name: tf_idf_best_logistic_regression_year_2012.
{'pred_duration': 1.274184226989746, 'f1': 0.9583489681050658, 'precision': 0.9659606656580938, 'recall': 0.9508562918838421}




run name: tf_idf_best_logistic_regression_year_2013.
{'pred_duration': 1.1256599426269531, 'f1': 0.963924963924964, 'precision': 0.9756572541382668, 'recall': 0.9524714828897338}




run name: tf_idf_best_logistic_regression_year_2014.
{'pred_duration': 1.1338510513305664, 'f1': 0.9614285714285715, 'precision': 0.9725433526011561, 'recall': 0.9505649717514124}




run name: tf_idf_best_logistic_regression_year_2015.
{'pred_duration': 1.0403571128845215, 'f1': 0.9646418857660924, 'precision': 0.9779411764705882, 'recall': 0.9516994633273703}




run name: tf_idf_best_logistic_regression_year_2016.
{'pred_duration': 0.9988622665405273, 'f1': 0.9353007945516458, 'precision': 0.9603729603729604, 'recall': 0.911504424778761}




run name: tf_idf_best_logistic_regression_year_2017.
{'pred_duration': 1.106081247329712, 'f1': 0.9612903225806452, 'precision': 0.9834983498349835, 'recall': 0.9400630914826499}




run name: tf_idf_best_logistic_regression_year_2018.
{'pred_duration': 0.9343729019165039, 'f1': 0.9659090909090909, 'precision': 0.9714285714285714, 'recall': 0.96045197740113}




run name: tf_idf_best_logistic_regression_year_2019.
{'pred_duration': 0.9455790519714355, 'f1': 0.9351851851851851, 'precision': 0.9528301886792453, 'recall': 0.9181818181818182}




run name: tf_idf_best_logistic_regression_year_2020.
{'pred_duration': 1.237131118774414, 'f1': 0.9659863945578231, 'precision': 0.9726027397260274, 'recall': 0.9594594594594594}




run name: tf_idf_best_logistic_regression_year_2021.
{'pred_duration': 1.0056202411651611, 'f1': 0.927536231884058, 'precision': 0.9696969696969697, 'recall': 0.8888888888888888}




run name: tf_idf_best_logistic_regression_year_2022.
{'pred_duration': 1.0161619186401367, 'f1': 0.7058823529411765, 'precision': 0.75, 'recall': 0.6666666666666666}




run name: tf_idf_best_logistic_regression_year_2023.
{'pred_duration': 0.9800388813018799, 'f1': 0.7499999999999999, 'precision': 1.0, 'recall': 0.6}
