In [None]:
import os
import sys
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import rich
from rich.pretty import Pretty
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from modules.data.processeddatainstance import ProcessedDataInstance
from modules.dl.tester.utils import confusion_matrix_with_class
from modules.shared.config import load_config
from modules.shared.utils import get_repo_root

In [None]:
# Base name of this notebook (file name without the ".ipynb" extension);
# it prefixes the log files written by the evaluation cells.
notebook_name = Path("3.a.ml_single_cellfeat.ipynb").with_suffix("").name

In [None]:
# Read the analysis settings from the TOML config
config = load_config("ml_analysis.toml")

# Pull out the entries this notebook relies on
data_processed_cfg = config["data_processed"]
palmskin_result_name: Path = Path(data_processed_cfg["palmskin_result_name"])
cluster_desc: str = data_processed_cfg["cluster_desc"]
dark: int = config["SLIC"]["dark"]

# Show the complete config, fully expanded
rich.print("", Pretty(config, expand_all=True))

In [None]:
# Data instance set up from "ml_analysis.toml"
processed_di = ProcessedDataInstance()
processed_di.parse_config("ml_analysis.toml")

# src: path of the ML dataset CSV under the repository root
repo_root = get_repo_root()
# The doubled braces are literal, i.e. the name looks like "<stem>_{dark_<value>}"
slic_dirname = f"{palmskin_result_name.stem}_{{dark_{dark}}}"
ml_dataset_dir = (
    repo_root
    / "data/generated/ML"
    / processed_di.instance_name
    / cluster_desc
    / slic_dirname
)
ml_csv = ml_dataset_dir / "ml_dataset.csv"

# dst: the directory that holds the dataset CSV
dst_dir = ml_csv.parent

In [None]:
# Load the ML dataset; the "utf_8_sig" codec also drops a leading BOM if one is present
df = pd.read_csv(ml_csv, encoding="utf_8_sig")
print(f"Read ML Dataset: '{ml_csv}'")

# Display the loaded table as the cell output
df

In [None]:
# Distinct values of the "class" column in sorted order, plus each value's
# position in that order (used to encode / decode the classifier targets)
labels = sorted(set(df["class"]))
label2idx = dict(zip(labels, range(len(labels))))

rich.print(f"labels = {labels}")
rich.print(f"label2idx = {label2idx}")

In [None]:
# Rows tagged "train" or "valid" are pooled into the training frame;
# rows tagged "test" form the test frame.
is_training_row = df["dataset"].isin(["train", "valid"])
is_test_row = df["dataset"] == "test"

training_df = df[is_training_row]
test_df = df[is_test_row]

# Display the training frame as the cell output
training_df

## Training

In [None]:
# Name of the single column used as the classifier input
feature = "cell_count"

# Validate against exactly the candidate list that the error message advertises.
# Checking `feature in df.columns` alone would also accept the columns that the
# message excludes.
# NOTE(review): assumes the first three columns are not features, as the `[3:]`
# slice in the message implies — confirm against the dataset layout.
feature_candidates = list(df.columns)[3:]
assert feature in feature_candidates, f"Feature should be one of followings: {feature_candidates}"

In [None]:
# Set up the Random Forest classifier. The seed is parsed from the last
# "_"-separated token of `cluster_desc` after removing the "RND" marker.
seed_token = cluster_desc.rsplit("_", 1)[-1]
rand_seed = int(seed_token.replace("RND", ""))
random_forest = RandomForestClassifier(random_state=rand_seed, n_estimators=100)

# The single feature column becomes an (n_samples, 1) array;
# class names are encoded as their integer indices.
input_training = training_df[feature].to_numpy().reshape(-1, 1)
gt_training = [label2idx[class_name] for class_name in training_df["class"]]

# Fit the model
random_forest.fit(input_training, gt_training)

In [None]:
# Predict on the training set, then map the predicted indices back to class names
pred_train = random_forest.predict(input_training)
pred_train = [labels[c_idx] for c_idx in pred_train]

gt_train = list(training_df["class"])

# reports
cls_report = classification_report(y_true=gt_train,
                                   y_pred=pred_train, digits=5)
_, confusion_matrix = confusion_matrix_with_class(prediction=pred_train,
                                                  ground_truth=gt_train)
# display report
print("Classification Report:\n\n", cls_report)
print(f"{confusion_matrix}\n")

# log file
# The encoding is given explicitly: without it `open` falls back to the platform's
# locale encoding, so the log (which contains the class names) could differ between
# machines or fail to encode.
with open(dst_dir.joinpath(f"{notebook_name}.train.log"), mode="w", encoding="utf-8") as f_writer:
    f_writer.write("Classification Report:\n\n")
    f_writer.write(f"{cls_report}\n\n")
    f_writer.write(f"{confusion_matrix}\n")

## Test

In [None]:
# The single feature column of the test frame as an (n_samples, 1) array
input_test = test_df[feature].to_numpy()[:, None]

# Predict on the test set, then map the predicted indices back to class names
pred_test = random_forest.predict(input_test)
pred_test = [labels[c_idx] for c_idx in pred_test]

gt_test = list(test_df["class"])

# reports
cls_report = classification_report(y_true=gt_test,
                                   y_pred=pred_test, digits=5)
_, confusion_matrix = confusion_matrix_with_class(prediction=pred_test,
                                                  ground_truth=gt_test)
# display report
print("Classification Report:\n\n", cls_report)
print(f"{confusion_matrix}\n")

# log file
# The encoding is given explicitly: without it `open` falls back to the platform's
# locale encoding, so the log (which contains the class names) could differ between
# machines or fail to encode.
with open(dst_dir.joinpath(f"{notebook_name}.test.log"), mode="w", encoding="utf-8") as f_writer:
    f_writer.write("Classification Report:\n\n")
    f_writer.write(f"{cls_report}\n\n")
    f_writer.write(f"{confusion_matrix}\n")

In [None]:
# Show the predicted class names for the test set as an array
np.asarray(pred_test)

In [None]:
# Show the ground-truth class names for the test set as an array
np.asarray(gt_test)