# Sprawozdanie z laboratorium 4

***Autor: Adam Dąbkowski***

## 0. Importowanie niezbędnych bibliotek

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from pprint import pprint

## 1. Przygotowanie danych

In [17]:
data = pd.read_csv("./data/cardio_train.csv", sep=';')
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,988,22469,1,155,69.0,130,80,2,2,0,0,1,0
1,989,14648,1,163,71.0,110,70,1,1,0,0,1,1
2,990,21901,1,165,70.0,120,80,1,1,0,0,1,0
3,991,14549,2,165,85.0,120,80,1,1,1,1,1,0
4,992,23393,1,155,62.0,120,80,1,1,0,0,1,0


In [18]:
data.drop(["id"], axis=1, inplace=True)

In [19]:
data["age"] = data["age"].apply(lambda x: "young" if x < 20 * 365 else "middle" if x < 60 * 365 else "old")
data["height"] = data["height"].apply(lambda x: "low" if x < 165 else "middle" if x < 190 else "tall")
data["weight"] = data["weight"].apply(lambda x: "skinny" if x < 60 else "middle" if x < 100 else "fat")
data["ap_hi"] = data["ap_hi"].apply(lambda x: "low" if x < 110 else "normal" if x < 130 else "high")
data["ap_lo"] = data["ap_lo"].apply(lambda x: "low" if x < 75 else "normal" if x < 85 else "high")

In [20]:
data.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,old,1,low,middle,high,normal,2,2,0,0,1,0
1,middle,1,low,middle,normal,low,1,1,0,0,1,1
2,old,1,middle,middle,normal,normal,1,1,0,0,1,0
3,middle,2,middle,middle,normal,normal,1,1,1,1,1,0
4,old,1,low,middle,normal,normal,1,1,0,0,1,0


In [21]:
target_label = "cardio"

In [22]:
data.dropna(inplace=True)

In [23]:
y = data[target_label]
X = data.drop([target_label], axis=1)

In [24]:
X.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,old,1,low,middle,high,normal,2,2,0,0,1
1,middle,1,low,middle,normal,low,1,1,0,0,1
2,old,1,middle,middle,normal,normal,1,1,0,0,1
3,middle,2,middle,middle,normal,normal,1,1,1,1,1
4,old,1,low,middle,normal,normal,1,1,0,0,1


In [25]:
y.head()

0    0
1    1
2    0
3    0
4    0
Name: cardio, dtype: int64

In [26]:
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, train_size=0.75, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

## 2. Implementacja algorytmu

In [27]:
class ID3Solver():
    def __init__(self, depth=float('inf')):
        self.target_label = None
        self.tree = None
        self.depth = depth

    def get_parameters(self):
        return {
            "depth": self.depth
        }

    def fit(self, X, y, target_label):
        self.target_label = target_label
        X[self.target_label] = y
        X.dropna(inplace=True)
        self.tree = self.create_tree(X, self.depth)

    def get_entropy(self, feature_data):
        feature_elements, feature_counts = np.unique(feature_data, return_counts=True)
        entropy = np.sum(
            [(-feature_counts[i] / np.sum(feature_counts)) * np.log2(feature_counts[i] / np.sum(feature_counts)) for i
             in range(len(feature_elements))])
        return entropy

    def get_info_gain(self, data, decision_feature):
        total_entropy = self.get_entropy(data[self.target_label])
        feature_values, feature_counts = np.unique(data[decision_feature], return_counts=True)

        for i in range(len(feature_values)):
            probability = feature_counts[i] / np.sum(feature_counts)
            feature_data = data[data[decision_feature] == feature_values[i]][self.target_label]
            total_entropy -= self.get_entropy(feature_data) * probability

        return total_entropy

    def get_decision_feature(self, data):
        features = data.columns.drop(self.target_label)
        info_gain_values = [self.get_info_gain(data, feature) for feature in features]
        decision_feature = features[np.argmax(info_gain_values)]
        return decision_feature

    def create_tree(self, data, depth):
        if data.shape[0] == 0:
            return {}

        if depth == 0:
            return data[self.target_label].value_counts().idxmax()

        if len(np.unique(data[self.target_label])) == 1:
            return np.unique(data[self.target_label])[0]

        decision_feature = self.get_decision_feature(data)

        tree = {decision_feature: {}}

        for feature_value in np.unique(data[decision_feature]):
            subset = data[data[decision_feature] == feature_value]
            subset.pop(decision_feature)
            subtree = self.create_tree(subset, depth - 1)
            tree[decision_feature][feature_value] = subtree

        return tree

    def predict_item(self, tree, item):
        if isinstance(tree, dict):
            decision_feature = next(iter(tree))
            value = item[decision_feature]

            if value in tree[decision_feature]:
                return self.predict_item(tree[decision_feature][value], item)
            else:
                return False

        else:
            return tree

    def predict(self, X):
        prediction = []
        for i in range(X.shape[0]):
            prediction.append(self.predict_item(self.tree, X.iloc[i]))
        return prediction

## 3. Implementacja funkcji pomocniczych

In [28]:
def get_metrics(y_true, y_pred, printing=True):
    accuracy = metrics.accuracy_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    f1_score = metrics.recall_score(y_true, y_pred)

    if printing:
        print("Accuracy: ", accuracy)
        print("Precision: ", precision)
        print("Recall: ", recall)
        print("F1 score: ", f1_score)

    return [accuracy, precision, recall, f1_score]

In [29]:
def determine_the_best_model(target_label, X_train, y_train, X_val, y_val, X_test, y_test, depth_range=7, main_metric="f1_score", print_model=False):
    best_model = None
    best_metric_score = 0

    print(f"\n************************** Validation set ******************************")

    for depth in range(1, depth_range + 1):
        print("  ")
        print(f"Depth {depth}")
        print(f"--------------------------------------------------------------------------------")
        model = ID3Solver(depth)
        model.fit(X_train, y_train, target_label)
        y_pred = model.predict(X_val)

        if main_metric == "accuracy":
            metric_type = 0
        elif main_metric == "precision":
            metric_type = 1
        elif main_metric == "recall":
            metric_type = 2
        elif main_metric == "f1_score":
            metric_type = 3
        else:
            raise "Invalid metric"

        metric_score = get_metrics(y_val, y_pred)[metric_type]

        if metric_score > best_metric_score:
            best_metric_score = metric_score
            best_model = model

    print(f"\n************************** Best model (validation set) ******************************\n")

    if print_model:
        print("Best model: ")
        pprint(best_model.tree)

    print("Best depth: ", best_model.depth)
    print(f"Best {main_metric}: ", best_metric_score)

    print(f"\n************************** Test set ******************************\n")

    y_pred_test = best_model.predict(X_test)
    get_metrics(y_test, y_pred_test)[metric_type]

    return best_model

## 4. Zastosowanie algorytmu

In [None]:
best_model = determine_the_best_model(target_label=target_label,
                         X_train=X_train,
                         y_train=y_train,
                         X_val=X_val,
                         y_val=y_val,
                         X_test=X_test,
                         y_test=y_test,
                         depth_range=10,
                         main_metric='f1_score',
                         print_model=False
                         )


************************** Validation set ******************************
  
Depth 1
--------------------------------------------------------------------------------
Accuracy:  0.7156874062103198
Precision:  0.7557811618725324
Recall:  0.6266074351180734
F1 score:  0.6266074351180734
  
Depth 2
--------------------------------------------------------------------------------
Accuracy:  0.7297702874292971
Precision:  0.7479508196721312
Recall:  0.6827215337853636
F1 score:  0.6827215337853636
  
Depth 3
--------------------------------------------------------------------------------
Accuracy:  0.7289622532609951
Precision:  0.7423975873335009
Recall:  0.690671031096563
F1 score:  0.690671031096563
  
Depth 4
--------------------------------------------------------------------------------
Accuracy:  0.7296548539766824
Precision:  0.7373068432671082
Recall:  0.7028290858078092
F1 score:  0.7028290858078092
  
Depth 5
-------------------------------------------------------------------------