In [24]:
import os

import numpy as np
import pandas as pd
from cleanlab.classification import CleanLearning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from loguru import logger
from cleanlab.count import compute_confident_joint
from cleanlab.filter import find_label_issues
from sklearn.metrics import precision_score

try:
    import mlflow
except Exception:
    logger.warning("MLflow not installed")

In [25]:
if "mlflow" in globals():
    mlflow.set_tracking_uri("https://mlflow.sidhulabs.ca")
    mlflow.set_experiment("Office Macro Labeling")

In [26]:
word_content_df = pd.read_csv(open(os.path.join(os.getcwd(), "vba_features.csv")),index_col=0)

In [27]:
word_content_df

Unnamed: 0,abs,accelerator,activate,activecell,activecodepane,activecontrol,activedocument,activesheet,activevbproject,activewindow,...,Join,LBound,Split,UBound,CurDir,Dir,FileAttr,FileDateTime,FileLen,GetAttr
4ee2939230a5962bc7937f0e54c27900595c4040ba723f6f31d84cc9e60ac3a7.macro,0.000000,0.0,0.010344,0.000000,0.0,0.0,0.000000,0.106488,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7b409025db2a30a58b42adc20bc81daf7dd6a6f8b7dfd089e90fb9fb232a5c05.macro,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.002899,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2cba5a34d84cba315019a94f15a70ba3f2b013955cd240ccca106aa9a569f827.macro,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
e406765eedf8d315750823133c94acb9d8af9a74afb25af8f983a5d4d48af5b7.macro,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.281521,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3e1b48c1b05736c3125723decf7cfe0e4e4efa51de63ad11eda1fa35e564b72e.macro,0.008199,0.0,0.008457,0.016246,0.0,0.0,0.000000,0.021371,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
f58a9e939d186b69453367e26d1123dc1a03719c8f3c0f69a5157c8a8faca814.macro,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4df4ed09ddcf6e2aa6d682bae21f3b2c243551310bdd565c2d2e0e89f05c4bd6.macro,0.000000,0.0,0.020384,0.000000,0.0,0.0,0.028581,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a5c6ab5f55851e5a4a665888b4804457a9c27b82bfdadd98185dc733b6ab6ed4.macro,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
985577430d9c17e113d00a3923df0e4e6bcf452452fdd2e3196f89f9b5cffb34.macro,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
classification_df = pd.read_csv(open(os.path.join(os.getcwd(), 'classification.csv')),index_col = 0)
df = classification_df.merge(word_content_df,left_index=True,right_index=True)

In [29]:
df_filtered = df.query("classification == 0 or classification == 2")
df_filtered.loc[df_filtered["classification"] == 2, "classification"] = 1
df_filtered

Unnamed: 0,vt_score,classification,abs,accelerator,activate,activecell,activecodepane,activecontrol,activedocument,activesheet,...,Join,LBound,Split,UBound,CurDir,Dir,FileAttr,FileDateTime,FileLen,GetAttr
00027b55ffe7329faff173bc3046f579d176c5a79091bf21f31062e17bfec922.macro,0,0,0.0,0.0,0.168157,0.058883,0.0,0.0,0.000000,0.196722,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00125528f276afcb74b5607e38b03edd41efafac58570589ef08b983cfa1231d.macro,14,0,0.0,0.0,0.013680,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001331fc0a289089ddeaab9ece4b1cf919f4852afe42b6ad64e672e0afccc588.macro,0,0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0014a559b4421bcc8f002e9a8b130f47ca04b7944ba89cf6e80524ed2912474c.macro,16,1,0.0,0.0,0.000000,0.000000,0.0,0.0,0.337177,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002d9d8f1664df94a985beff2388badaca96ae46bafa92df76eb19d18c154dcd.macro,5,0,0.0,0.0,0.028296,0.000000,0.0,0.0,0.000000,0.013241,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffd3aad8aa41d987e1640d4919f8c3cc87452cbbf4469aeb506bdd6520ef3f58.macro,39,1,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ffde3b4dfa144508fb40ff4a57d1659f57e2a3432ed2aad7fe710fd60ed6e271.macro,0,0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.687522,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ffe3c1134a7ca913889b1ce47dabd218bc45d4d8bedeb7fd8448b6cb05d93d84.macro,13,0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fff7628f553c5df11fd36add8eaf80910bb80293fb1d01ec8cb79e03108de057.macro,0,0,0.0,0.0,0.000000,0.058392,0.0,0.0,0.000000,0.022760,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
train_data, test_data = train_test_split(df_filtered, test_size=0.33)

In [31]:
features = df_filtered.columns[2:]
labels = df_filtered.columns[1]

In [32]:
def adj_recall(y_true, label_issues):
    """Ratio of samples we are able to provide a positive label for."""

    y_pred = label_issues.astype(int)

    return np.sum(np.logical_and(y_true == 0, y_pred == 1)) / np.sum(y_true == 0)

def adj_precision(y_true, label_issues, vt_positives, vt_threshold):
    """
    Ratio of samples we are able to provide a positive label for that are most likely positive.
    
    Of the samples we now labeled 1, how many are most likely 1.
    """

    y_pred = label_issues.astype(int)

    likely_pos_samples = np.where((y_true == 0) & (vt_positives > vt_threshold), 1, y_true)

    return precision_score(likely_pos_samples, y_pred)


In [36]:
def train_pu_model(model, train_data, test_data, log_to_mlflow: bool = False):
    """Train a PU model."""

    cl = CleanLearning(clf=model, pulearning=1)

    cl.fit(X=train_data[features].to_numpy(), labels=train_data[labels].to_numpy())

    predicted_proba = cl.predict_proba(test_data[features].to_numpy())

    cj = compute_confident_joint(test_data[labels].to_numpy(), predicted_proba)

    label_issues = find_label_issues(test_data[labels].to_numpy(), predicted_proba, confident_joint=cj)

    recall = adj_recall(test_data["classification"], label_issues)
    precision = adj_precision(test_data["classification"], label_issues, test_data["vt_score"], 3)

    if log_to_mlflow and "mlfow" in globals():
        with mlflow.start_run(run_name=str(model), nested=True):
            mlflow.log_metrics(
                {"adj_recall": recall, "adj_precision": precision}
            )

    return cl.get_la

In [34]:
if "mlflow" in globals():
    mlflow.start_run(run_name="default", description="First run, no params, using defaults for everything")

Exception: Run with UUID a1e20f9eb85940c081e25138abc0f35e is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

## Baseline

In [37]:
preds = train_pu_model(LogisticRegression(), train_data, test_data)

0.05964514911287278 0.8629441624365483


## Random Forest

In [38]:
preds = train_pu_model(RandomForestClassifier(), train_data, test_data)

0.01887504718761797 0.8148148148148148


## Boosting Sklearn

In [39]:
preds = train_pu_model(HistGradientBoostingClassifier(), train_data, test_data)

0.026802567006417515 0.8709677419354839


## XGBoost

In [40]:
preds = train_pu_model(XGBClassifier(), train_data, test_data)

0.02491506228765572 0.8740157480314961


In [91]:
if "mlflow" in globals():
    mlflow.end_run()