In [32]:
import os
import re
from functools import lru_cache

import matplotlib.pyplot as plt
import mlflow
import nltk
import numpy as np
import pandas as pd
from nltk.stem.snowball import EnglishStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score

In [12]:
# Input CSV files; the relative paths are resolved against the notebook's working directory.
DATASET_TRAIN_FILE = "./../data/raw-train.csv"
DATASET_TEST_FILE = "./../data/raw-test.csv"
# Seed passed as `random_state` to each model trained in this notebook.
RANDOM_STATE = 42
# Worker count passed as `n_jobs` to the random-forest model.
N_JOBS = 8

In [3]:
# Fetch the NLTK "punkt" resource (NLTK reports it as up to date when already present).
# NOTE(review): no visible cell calls an NLTK tokenizer - confirm this download is still needed.
nltk.download("punkt")
# Module-level English Snowball stemmer, shared by stem_with_cache().
stemmer = EnglishStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
@lru_cache(maxsize=10240, typed=True)
def stem_with_cache(w: str) -> str:
    """Stem a single word, memoising the result.

    Results are kept in an LRU cache of up to 10240 entries, so a word that
    repeats across documents is stemmed only once while it stays cached.

    :param w: word to stem
    :return: stem of ``w`` produced by the module-level ``stemmer``, as ``str``
    """
    stemmed = stemmer.stem(w)
    return str(stemmed)

In [9]:
# Build one cleaned frame per input file, keyed by the file path.
# Each resulting frame has two columns: "text" (stemmed tokens joined by spaces)
# and "target" (0/1 label).
prepared_data = dict()
for dataset_file in (DATASET_TRAIN_FILE, DATASET_TEST_FILE):
    # Read the file of the *current* iteration. Reading DATASET_TRAIN_FILE here
    # would make the "test" frame a copy of the training data and turn every
    # reported metric into a training-set score.
    df = pd.read_csv(dataset_file, names=["class", "title", "text"])
    # Missing values: drop rows that have neither a title nor a text. `.copy()`
    # gives an independent frame, so the column assignments below never act on
    # a slice of the raw frame.
    df = df[~(df["title"].isnull() & df["text"].isnull())].copy()
    # A surviving row may still lack one of the two fields; use "" so that the
    # concatenation below yields a string instead of NaN (on which `.lower()`
    # would fail).
    df["title"] = df["title"].apply(lambda x: "" if pd.isna(x) else x)
    df["text"] = df["text"].apply(lambda x: "" if pd.isna(x) else x)
    # Text processing - concatenating
    df["all_text"] = df["title"] + ", " + df["text"]
    del df["title"]
    del df["text"]
    # Text processing - lowering
    df["lowered_text"] = df["all_text"].apply(lambda x: x.lower())
    del df["all_text"]
    # Text processing - cleaning: keep only runs of 2+ characters made of
    # lowercase ASCII letters and hyphens, joined by single spaces.
    df["cleaned_text"] = df["lowered_text"].apply(lambda x: " ".join(re.findall("[-a-z]{2,}", x)))
    del df["lowered_text"]
    # Text processing - stemming (word by word, through the LRU-cached stemmer)
    df["text"] = df["cleaned_text"].apply(
        lambda x: " ".join(stem_with_cache(w) for w in x.split(" "))
    )
    del df["cleaned_text"]
    # Transform class to target: class 1 -> 0, any other class -> 1.
    # NOTE(review): presumably class 1 is the negative label - confirm against the dataset docs.
    df["target"] = df["class"].apply(lambda x: 0 if x == 1 else 1)
    del df["class"]
    prepared_data[dataset_file] = df

In [10]:
# --- train split -----------------------------------------------------------
# Take the frame out of `prepared_data` (the dict entry is removed, as before).
train_frame = prepared_data.pop(DATASET_TRAIN_FILE)
train_frame["text"] = train_frame["text"].fillna("")
train_target = train_frame["target"]
# The TF-IDF vocabulary (capped at 1500 features, English stop words removed)
# is fitted on the training texts only.
tfidf_vectorizer = TfidfVectorizer(max_features=1500, stop_words="english")
train_data = tfidf_vectorizer.fit_transform(train_frame["text"])
del train_frame

# --- test split ------------------------------------------------------------
# The test texts are only transformed with the already-fitted vectorizer.
test_frame = prepared_data.pop(DATASET_TEST_FILE)
test_frame["text"] = test_frame["text"].fillna("")
test_target = test_frame["target"]
test_data = tfidf_vectorizer.transform(test_frame["text"])
del test_frame

In [23]:
def conf_matrix(y_true: pd.Series, pred: np.ndarray) -> plt.Figure:  # type: ignore[name-defined, type-arg]
    """Draw the confusion matrix of ``pred`` against ``y_true`` on a new figure.

    Interactive plotting is switched off first (``plt.ioff()``), which is a
    global pyplot setting and stays off after the call. The returned figure is
    left open; closing it is up to the caller.

    :param y_true: true values
    :param pred: predictions
    :return: 5x5-inch matplotlib figure holding the titled confusion matrix
    """
    plt.ioff()
    fig, axes = plt.subplots(figsize=(5, 5))
    ConfusionMatrixDisplay.from_predictions(y_true, pred, ax=axes, colorbar=False)
    # Vertical x tick labels.
    axes.xaxis.set_tick_params(rotation=90)
    axes.set_title("Confusion Matrix")
    # `fig` is the figure just created by plt.subplots, i.e. the current one.
    fig.tight_layout()
    return fig

In [13]:
# Point the MLflow client at the tracking server on localhost:5000.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Make "amazon_reviews" the active experiment; per the recorded output, MLflow
# creates it when it does not exist yet.
amazon_reviews_experiment = mlflow.set_experiment("amazon_reviews")

2024/05/05 23:30:31 INFO mlflow.tracking.fluent: Experiment with name 'amazon_reviews' does not exist. Creating a new experiment.


In [22]:
# Process environment for artifact uploads: access key, secret key and the
# S3 endpoint URL (localhost:9000 - presumably a local S3-compatible store; confirm).
# NOTE(review): the credentials are hard-coded in the notebook. That is only
# acceptable for throwaway local defaults; real credentials should come from
# the environment or a secrets store and never be committed.
os.environ.update(
    {
        "AWS_ACCESS_KEY_ID": "mlflow",
        "AWS_SECRET_ACCESS_KEY": "password",
        "MLFLOW_S3_ENDPOINT_URL": "http://127.0.0.1:9000",
    }
)

In [26]:
# Train a logistic-regression model and record parameters, metrics, the fitted
# model and a confusion-matrix figure in one MLflow run.
run_name = "log-reg"
with mlflow.start_run(run_name=run_name) as run:
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases -
    # confirm against the installed version before upgrading.
    model_params = {
        "penalty": "l2",
        "solver": "saga",
        "max_iter": 100,
        "multi_class": "ovr",
        "random_state": RANDOM_STATE,
    }
    # Logged before fitting, so the run keeps its parameters even if training raises.
    mlflow.log_params(model_params)
    model = LogisticRegression(**model_params)
    model.fit(train_data, train_target)
    predictions = model.predict(test_data)
    metrics = {
        "accuracy": accuracy_score(test_target, predictions),
        "precision": precision_score(test_target, predictions),
        "recall": recall_score(test_target, predictions),
    }
    # One batched call instead of one request per metric.
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=f"mlflow/{run_name}/model"
    )
    fig = conf_matrix(test_target, predictions)
    mlflow.log_figure(fig, f'{run_name}_confusion_matrix.png')
    # conf_matrix() switches interactive mode off, so the figure stays
    # registered in pyplot until it is closed explicitly.
    plt.close(fig)



In [27]:
# Train a random-forest model and record parameters, metrics, the fitted model
# and a confusion-matrix figure in one MLflow run.
run_name = "random-forest"
with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "n_estimators": 100,
        "max_depth": 12,
        "n_jobs": N_JOBS,
        "random_state": RANDOM_STATE,
    }
    # Logged before fitting, so the run keeps its parameters even if training raises.
    mlflow.log_params(model_params)
    model = RandomForestClassifier(**model_params)
    model.fit(train_data, train_target)
    predictions = model.predict(test_data)
    metrics = {
        "accuracy": accuracy_score(test_target, predictions),
        "precision": precision_score(test_target, predictions),
        "recall": recall_score(test_target, predictions),
    }
    # One batched call instead of one request per metric.
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=f"mlflow/{run_name}/model"
    )
    fig = conf_matrix(test_target, predictions)
    mlflow.log_figure(fig, f'{run_name}_confusion_matrix.png')
    # conf_matrix() switches interactive mode off, so the figure stays
    # registered in pyplot until it is closed explicitly.
    plt.close(fig)



In [33]:
# Train a ridge classifier and record parameters, metrics, the fitted model and
# a confusion-matrix figure in one MLflow run.
run_name = "ridge"
with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "tol": 1e-2,
        "max_iter": 100,
        "solver": "sparse_cg",
        "random_state": RANDOM_STATE,
    }
    # Logged before fitting, so the run keeps its parameters even if training raises.
    mlflow.log_params(model_params)
    model = RidgeClassifier(**model_params)
    model.fit(train_data, train_target)
    predictions = model.predict(test_data)
    metrics = {
        "accuracy": accuracy_score(test_target, predictions),
        "precision": precision_score(test_target, predictions),
        "recall": recall_score(test_target, predictions),
    }
    # One batched call instead of one request per metric.
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=f"mlflow/{run_name}/model"
    )
    fig = conf_matrix(test_target, predictions)
    mlflow.log_figure(fig, f'{run_name}_confusion_matrix.png')
    # conf_matrix() switches interactive mode off, so the figure stays
    # registered in pyplot until it is closed explicitly.
    plt.close(fig)

