# Notebook for Applied AI mini project

In [15]:
import pandas as pd
from sklearn import set_config
from sklearn.metrics import (
    accuracy_score,
    adjusted_rand_score,
    completeness_score,
    f1_score,
    homogeneity_score,
    precision_score,
    recall_score,
    roc_curve,
    silhouette_score,
    v_measure_score,
)
from sklearn.model_selection import train_test_split

from configuration import (
    CLASSIFIER_CONFIGS,
    DATASET_CONFIGS,
    DATASET_CONFIGS_DEMO,
    METRICS_CONFIG,
)
from dataset import Dataset
from inspect_dataset import inspect
from pipelines import Pipelines
from utils import seperate

In [16]:
# UCI repository ids of the datasets considered for this project.
DATASET_IDS = [
    # 22, # Chess (King-Rook vs. King-Pawn)
    70, # Monk's problem
    184 # Acute Inflammations
]
# Dataset used by the cells below (184 = Acute Inflammations).
DATASET_ID = 184
# Names of the supervised classifiers to build pipelines for.
CLASSIFIERS = CLASSIFIER_CONFIGS["supervised"]
# Index of the target to use when the dataset has multiple target columns.
# E.g., with target_col = ['bladder-inflammation', 'nephritis'],
# TARGET_INDEX = 0 selects 'bladder-inflammation' as the target column.
# NOTE(review): TARGET_INDEX is not passed to Dataset(...) in the cells shown
# here -- confirm how the target column is actually selected.
TARGET_INDEX = 0

## Data inspection

In [17]:
# Print the name and metadata of the selected dataset.
inspect(DATASET_ID)

----------------------------------------------------------------------------------------------------
Dataset Name
184: Acute Inflammations
----------------------------------------------------------------------------------------------------
Metaname
{'uci_id': 184, 'name': 'Acute Inflammations', 'repository_url': 'https://archive.ics.uci.edu/dataset/184/acute+inflammations', 'data_url': 'https://archive.ics.uci.edu/static/public/184/data.csv', 'abstract': 'The data was created by a medical expert as a data set to test the expert system, \nwhich will perform the presumptive diagnosis of two diseases of the urinary system.\n', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 120, 'num_features': 6, 'feature_types': ['Categorical', 'Integer'], 'demographics': [], 'target_col': ['bladder-inflammation', 'nephritis'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2003, 

In [18]:
# Load the selected dataset and pull out the feature table and target series.
dataset = Dataset(DATASET_ID)
X = dataset.X
y = dataset.y_target
# Distinct target values in order of first appearance; evaluate_scores later
# treats labels[0] as the positive class, so that choice depends on row order.
labels = y.unique()

In [19]:
# Preview the first few target values.
y.head()

0     no
1    yes
2     no
3    yes
4     no
Name: bladder-inflammation, dtype: object

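Select the target column: `Dataset` exposes the chosen target as `dataset.y_target` (here `bladder-inflammation`).

Encode target: no separate encoding step is applied in the cells below; the `yes`/`no` string labels are passed to the classifiers as-is.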

In [20]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y -- with a small dataset the
# class balance of train/test may differ; consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Column groups reported by the dataset; they are handed to the preprocessing
# pipelines built further down.
X_non_numeric_columns = dataset.categorical_columns
X_numeric_columns = dataset.numeric_columns
print(f"non_numeric_columns: {X_non_numeric_columns}")
print(f"numeric_columns: {X_numeric_columns}")

non_numeric_columns: ['nausea', 'lumbar-pain', 'urine-pushing', 'micturition-pains', 'burning-urethra']
numeric_columns: ['temperature']


## Create pipeline

In [21]:
# Build one preprocessing + classifier pipeline per configured classifier,
# keyed by the classifier's name.
pipelines = {
    classifier_name: Pipelines(
        X_non_numeric_columns, X_numeric_columns, classifier_name
    ).create()
    for classifier_name in CLASSIFIERS
}

pipelines

{'KNN': Pipeline(steps=[('preprocessing',
                  ColumnTransformer(transformers=[('categorical',
                                                   Pipeline(steps=[('imputer_frequent',
                                                                    SimpleImputer(strategy='most_frequent')),
                                                                   ('onehot',
                                                                    OneHotEncoder(handle_unknown='ignore',
                                                                                  sparse_output=False))]),
                                                   ['nausea', 'lumbar-pain',
                                                    'urine-pushing',
                                                    'micturition-pains',
                                                    'burning-urethra']),
                                                  ('numeric',
                                                

In [22]:
# Ask scikit-learn to display estimators as a diagram, then show the KNN
# pipeline (dict.get yields None if no "KNN" pipeline was built).
set_config(display="diagram")
pipelines.get("KNN")

In [23]:
# Fit every pipeline on the training split; fit() hands back the fitted
# pipeline, which is stored under the classifier's name.
models = {
    name: pipe.fit(X_train, y_train)
    for name, pipe in pipelines.items()
}

In [24]:
# Score each fitted model on the held-out split, then report one line per model.
test_scores = {name: model.score(X_test, y_test) for name, model in models.items()}
for name, score in test_scores.items():
    print(f"Model name: {name}, Score: {score}")

Model name: KNN, Score: 1.0
Model name: RandomForest, Score: 1.0
Model name: SVC, Score: 1.0


In [25]:
def unsupervised_scores(X_test, y_test, models):
    """Score fitted clustering models against ground-truth labels.

    Each reported metric is computed by the scikit-learn function of the same
    name, so the keys of the result describe the values they hold.

    Parameters
    ----------
    X_test : array-like
        Samples to assign to clusters; passed to each model's ``predict``.
    y_test : array-like
        Ground-truth labels for ``X_test``.
    models : dict
        Mapping of model name to a fitted estimator exposing ``predict``.

    Returns
    -------
    dict
        ``{model_name: {metric_name: value}}`` with the metric names
        ``homogeneity``, ``completeness``, ``v_measure``, ``rand_index``
        (adjusted Rand index) and ``silhouette``. ``silhouette`` is NaN when
        the predictions contain fewer than 2 clusters or one cluster per
        sample, because the coefficient is undefined in those cases.
    """
    scores = {}
    for name, model in models.items():
        y_pred = model.predict(X_test)

        # The silhouette coefficient is measured in feature space. For a
        # multi-step pipeline, use the features the final estimator actually
        # sees (output of all preceding steps) so that raw categorical columns
        # do not reach the distance computation.
        steps = getattr(model, "steps", None)
        if steps and len(steps) > 1:
            features = model[:-1].transform(X_test)
        else:
            features = X_test

        # silhouette_score requires 2 <= n_clusters <= n_samples - 1.
        n_clusters = len(set(y_pred))
        if 2 <= n_clusters < len(y_pred):
            silhouette = silhouette_score(features, y_pred)
        else:
            silhouette = float("nan")

        scores[name] = {
            "homogeneity": homogeneity_score(y_test, y_pred),
            "completeness": completeness_score(y_test, y_pred),
            "v_measure": v_measure_score(y_test, y_pred),
            "rand_index": adjusted_rand_score(y_test, y_pred),
            "silhouette": silhouette,
        }
    return scores

In [26]:
def evaluate_scores(labels, X_test, y_test, models):
    """Tabulate accuracy, precision, recall and F1 for each fitted model.

    Parameters
    ----------
    labels : sequence
        Class labels; the first entry is used as the positive label for
        precision, recall and F1.
    X_test : array-like
        Held-out samples passed to each model's ``predict``.
    y_test : array-like
        True labels for ``X_test``.
    models : dict
        Mapping of model name to a fitted estimator exposing ``predict``.

    Returns
    -------
    pandas.DataFrame
        One column per model, rows ``Accuracy``, ``Precision``, ``Recall``
        and ``F1``.
    """
    positive = labels[0]
    metric_rows = ["Accuracy", "Precision", "Recall", "F1"]

    def _metrics_for(fitted):
        # Predict once and reuse the predictions for all four metrics.
        predicted = fitted.predict(X_test)
        return [
            accuracy_score(y_test, predicted),
            precision_score(y_test, predicted, pos_label=positive),
            recall_score(y_test, predicted, pos_label=positive),
            f1_score(y_test, predicted, pos_label=positive),
        ]

    per_model = {name: _metrics_for(fitted) for name, fitted in models.items()}
    return pd.DataFrame(per_model, index=metric_rows)

In [27]:
# Compare the fitted classifiers on the held-out split (one column per model).
evaluate_scores(labels, X_test, y_test, models)

Unnamed: 0,KNN,RandomForest,SVC
Accuracy,1.0,1.0,1.0
Precision,1.0,1.0,1.0
Recall,1.0,1.0,1.0
F1,1.0,1.0,1.0
