In [292]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import random
from collections import defaultdict

In [91]:
data = pd.read_csv('diabetes.csv')
# Nowy plik na rozmyte od 0

In [92]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [93]:
data = data[data['Glucose'] != 0]
data = data[data['BloodPressure'] != 0]
data = data[data['SkinThickness'] != 0]
data = data[data['Insulin'] != 0]
data = data[data['BMI'] != 0]
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,3.30102,122.627551,70.663265,29.145408,156.056122,33.086224,0.523046,30.864796,0.331633
std,3.211424,30.860781,12.496092,10.516424,118.84169,7.027659,0.345488,10.200777,0.471401
min,0.0,56.0,24.0,7.0,14.0,18.2,0.085,21.0,0.0
25%,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0,0.0
50%,2.0,119.0,70.0,29.0,125.5,33.2,0.4495,27.0,0.0
75%,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,1.0
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0,1.0


In [94]:
# Klasa statyczna zawierająca metody przydatne to przetwarzania danych
class ProcessingData:

    # metoda tasująca wiersze pd.DataFrame
    @staticmethod
    def shuffle(df: pd.DataFrame) -> pd.DataFrame:
        """
        Receives a DataFrame, randomly shuffles it and returns a shuffled version.
        :param df: pd.DataFrame
        :return df: pd.DataFrame
        """
        for idx in range(len(df) - 1, 0, -1):
            rand_idx = random.randint(0, idx)
            df.iloc[idx], df.iloc[rand_idx] = df.iloc[rand_idx], df.iloc[idx]
        return df.reset_index(drop=True)

    # metoda normalizująca kolumny pd.DataFrame za pomocą normalizacji z użyciem odchylenia standardowego
    @staticmethod
    def normalize_std(df: pd.DataFrame, label: str) -> pd.DataFrame:
        """
        Receives a DataFrame and a class label to skip during normalization, normalizes data using min-max normalization and returns the normalized version.
        :param label: str
        :param df: pd.DataFrame
        :return df: pd.DataFrame
        """
        normalized = df.drop(labels=label, axis=1)
        normalized = (normalized-normalized.mean())/(normalized.std())
        return normalized.join(df[label])

    # metoda normalizująca kolumny pd.DataFrame za pomocą normalizacji minmax
    @staticmethod
    def normalize_minmax(df: pd.DataFrame, label: str) -> pd.DataFrame:
        """
        Receives a DataFrame and a class label to skip during normalization, normalizes data using min-max normalization and returns the normalized version.
        :param label: str
        :param df: pd.DataFrame
        :return df: pd.DataFrame
        """
        normalized = df.drop(labels=label, axis=1)
        normalized = (normalized-normalized.min())/(normalized.max()-normalized.min())
        return normalized.join(df[label])

    # metoda dzieląca pd.DataFrame na zbiór treningowy oraz walidacyjny
    @staticmethod
    def split(df: pd.DataFrame, ratio: float) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Receives a DataFrame and returns two dataframes, split into training and validation DataFrames.
        :param df: pd.DataFrame
        :param ratio: float
        :return training_df, validation_df: tuple[pd.DataFrame, pd.DataFrame]
        """
        cutoff = int(len(df) * ratio)
        return df[:cutoff].reset_index(drop=True), df[cutoff:].reset_index(drop=True)

In [95]:
shuffled_data = ProcessingData.shuffle(data)
training_data, validation_data = ProcessingData.split(shuffled_data, 0.7)

In [389]:
class Fuzzy:
    def __init__(self):
        # słownik słowników rozmycia dla każdej kolumny
        self.antecedents: dict[str: dict[str: tuple[float]]] = {}
        # lista zawierająca nazwę etykiety klasy oraz słowników z jej rozmyciem
        self.consequent: dict[str: dict[str: tuple[float]]] = {}
        # lista słowników; każda reguła to jeden słownik
        self.rules: list[dict[str: str]] = []

    # metoda pozwalająca dodać poprzednika
    def add_antecedent(self, parameter: str, linguistic_value: str, *args) -> None:
        if parameter not in self.antecedents:
            self.antecedents[parameter] = {}
        self.antecedents[parameter][linguistic_value] = args

    # metoda pozwalająca dodać konsekwencję
    def add_consequent(self, parameter: str, linguistic_value: str, *args) -> None:
        if parameter not in self.consequent:
            self.consequent[parameter] = {}
        self.consequent[parameter][linguistic_value] = args

    # metoda pozwalająca dodać regułę
    def add_rule(self, rule: dict[str: str]) -> None:
        # assert set(rule.keys()) == set.union(set(self.antecedents.keys()), set(self.consequent.keys())), print("Nieprawidłowa liczba kategorii w regule.")
        self.rules.append(rule)
        # print(set.union(set(self.antecedents.keys()), set(self.consequent.keys())))
        # print(self.rules)

    # metoda obliczająca przynależność dla funkcji typu trójkąt
    @staticmethod
    def triangular_function(n: float, a: float, b: float, c: float) -> float:
        assert a <= b <= c
        # print(n, a, b, c)
        if n <= a:
            return 0
        if a < n < b:
            return (n - a) / (b - a)
        if n == b:
            return 1
        if b < n < c:
            return (c - n) / (c - b)
        if n >= c:
            return 0

    # metoda obliczająca przynależność dla funkcji typu trapez
    @staticmethod
    def trapezoidal_function(n: float, a: float, b: float, c: float, d: float) -> float:
        assert a <= b <= c <= d
        # print(n, a, b, c, d)
        if c == d and n >= d:
            return 1
        if a == b and n <= a:
            return 1
        if n <= a:
            return 0
        if a < n < b:
            return (n - a) / (b - a)
        if b <= n <= c:
            return 1
        if c < n < d:
            return (d - n) / (d - c)
        if n >= d:
            return 0

    # metoda wywołująca funkcję przynależności
    def membership_function(self, n: float, *args) -> float:
        # print(len(args))
        # print(*args)
        assert len(args) in (3, 4), print("Nieprawidłowa liczba argumentów")
        if len(args) == 3:
            return self.triangular_function(n, *args)
        else:
            return self.trapezoidal_function(n, *args)

    # metoda rozmywająca dane wejściowe
    def fuzzify(self, column: str, n: float) -> dict[str, float]:
        return {antecedent: self.membership_function(n, *self.antecedents[column][antecedent]) for antecedent in self.antecedents[column].keys()}

    # metoda obliczająca stopień spełnienia danej zasady
    @staticmethod
    def rule_fulfillment(rule: dict[str: str], fuzzy_values: dict[str: dict[str: float]]) -> tuple[str, float]:
        # outputs = [{name: 0} for name in self.consequent[list(self.consequent.keys())[0]]]
        label = list(rule.keys())[-1]
        minimum = float('inf')
        for name, linguistic_value in list(rule.items())[:-1]:
            maximum = float('-inf')
            for lvs in linguistic_value.split(' | '):
                maximum = max(fuzzy_values[name][lvs], maximum)
                # print(f'{fuzzy_values[name]}; {maximum:}')
            minimum = min(minimum, maximum)
        return rule[label], minimum


    # metoda obliczająca powierzchnię danego wyniku
    @staticmethod
    def area(a: float, c: float) -> float:
        return (c - a) / 2

    # metoda środka ciężkości
    @staticmethod
    def cog(a: float, b: float, c: float) -> float:
        return (a + b + c) / 3

    # metoda agregująca, wyostrzająca wartość końcową za pomocą metody środka ciężkości
    def aggregate(self, label: str, outputs: defaultdict) -> float:
        counter = []
        denominator = []
        for name, membership in outputs.items():
            a, b, c = self.consequent[label][name]
            area = self.area(a, c)
            # print(f'{area=}')
            cog = self.cog(a, b, c)
            # print(f'{cog=}')
            counter.append(membership * area * cog)
            denominator.append(membership * area)
        print(f'{counter=}\n{denominator=}')
        return round(sum(counter) / sum(denominator), 3)

    # metoda przewidująca etykietę klasy
    def compute(self, sample: pd.Series) -> tuple[str, float]:
        # rozmycie próbki
        fuzzy_values = {column: self.fuzzify(column, sample[column]) for column in self.antecedents.keys()}
        print(fuzzy_values)
        # for column in fuzzy_values:
        #     # fuzzy_values[column] = {k: v for k, v in sorted(fuzzy_values[column].items(), key=lambda x: x[1], reverse=True)}
        #     print(fuzzy_values[column])

        # obliczenie stopnia spełnienia zasad
        fulfillments = [self.rule_fulfillment(rule, fuzzy_values) for rule in self.rules]
        print(f'{fulfillments=}')
        outputs = defaultdict(lambda: 0)
        for linguistic_value, fulfillment in fulfillments:
            outputs[linguistic_value] = max(outputs[linguistic_value], fulfillment)
        print(outputs)
        label = list(self.consequent.keys())[0]
        # print(label)
        output_names = sorted(outputs.keys(), key=lambda x: (-1 * outputs[x], x), reverse=True)
        print(output_names)
        return output_names[-1], self.aggregate(label, outputs)

    # metoda pomocnicza wyświetlająca wykres rozmycia cechy
    @staticmethod
    def view(parameter: str) -> None:
        # TODO: plotting an antecedent or a consequent
        x = np.linspace(0., 7., num=101)
        y = np.array([fuzzy.membership_function(i, 1, 1, 3, 7) for i in x])

        fig, ax = plt.subplots()
        ax.plot(x, y)
        loc = plticker.MultipleLocator(base=1.0)
        ax.xaxis.set_major_locator(loc)

        plt.plot(x, y, 'r')
        plt.show()

In [403]:
# Dodawanie poprzedników i konsekwencji
fuzzy = Fuzzy()

fuzzy.add_antecedent('Pregnancies', 'low', 0, 0, 5)
fuzzy.add_antecedent('Pregnancies', 'medium', 0, 5, 10)
fuzzy.add_antecedent('Pregnancies', 'high', 5, 10, 17, 17)

fuzzy.add_antecedent('Glucose', 'low', 44, 44, 86, 99)
fuzzy.add_antecedent('Glucose', 'medium_low', 86, 99, 112)
fuzzy.add_antecedent('Glucose', 'medium', 99, 112, 125)
fuzzy.add_antecedent('Glucose', 'medium_high', 112, 125, 138)
fuzzy.add_antecedent('Glucose', 'high', 125, 138, 199, 199)

fuzzy.add_antecedent('BloodPressure', 'low', 24, 24, 75, 80)
fuzzy.add_antecedent('BloodPressure', 'medium_low', 75, 80, 85)
fuzzy.add_antecedent('BloodPressure', 'medium', 80, 85, 90)
fuzzy.add_antecedent('BloodPressure', 'medium_high', 85, 90, 95)
fuzzy.add_antecedent('BloodPressure', 'high', 125, 190, 846, 846)

fuzzy.add_antecedent('SkinThickness', 'low', 7, 7, 15, 22)
fuzzy.add_antecedent('SkinThickness', 'medium_low', 15, 22, 29)
fuzzy.add_antecedent('SkinThickness', 'medium', 22, 29, 36)
fuzzy.add_antecedent('SkinThickness', 'medium_high', 29, 36, 43)
fuzzy.add_antecedent('SkinThickness', 'high', 36, 43, 99, 99)

fuzzy.add_antecedent('Insulin', 'low', 14, 14, 76, 125)
fuzzy.add_antecedent('Insulin', 'medium', 76, 125, 190)
fuzzy.add_antecedent('Insulin', 'high', 125, 190, 846, 846)

fuzzy.add_antecedent('BMI', 'underweight', 0, 0, 16, 22)
fuzzy.add_antecedent('BMI', 'healthy_weight', 16, 22, 28)
fuzzy.add_antecedent('BMI', 'overweight', 22, 28, 34)
fuzzy.add_antecedent('BMI', 'obese', 28, 34, 67.1, 67.1)

fuzzy.add_antecedent('DiabetesPedigreeFunction', 'low', 0, 0, 0.25, 0.5)
fuzzy.add_antecedent('DiabetesPedigreeFunction', 'medium', 0.25, 0.5, 0.75)
fuzzy.add_antecedent('DiabetesPedigreeFunction', 'high', 0.5, 0.75, 2.42, 2.42)

fuzzy.add_antecedent('Age', 'low', 21, 21, 34, 45)
fuzzy.add_antecedent('Age', 'medium', 34, 45, 56)
fuzzy.add_antecedent('Age', 'high', 45, 56, 81, 81)

for elem in fuzzy.antecedents:
    print(f'{elem}: {fuzzy.antecedents[elem]}')

fuzzy.add_consequent('Outcome', 'low', 0, 0, 1)
fuzzy.add_consequent('Outcome', 'high', 0, 1, 1)
print(f'{list(fuzzy.consequent.keys())[0]}: {fuzzy.consequent["Outcome"]}')

Pregnancies: {'low': (0, 0, 5), 'medium': (0, 5, 10), 'high': (5, 10, 17, 17)}
Glucose: {'low': (44, 44, 86, 99), 'medium_low': (86, 99, 112), 'medium': (99, 112, 125), 'medium_high': (112, 125, 138), 'high': (125, 138, 199, 199)}
BloodPressure: {'low': (24, 24, 75, 80), 'medium_low': (75, 80, 85), 'medium': (80, 85, 90), 'medium_high': (85, 90, 95), 'high': (125, 190, 846, 846)}
SkinThickness: {'low': (7, 7, 15, 22), 'medium_low': (15, 22, 29), 'medium': (22, 29, 36), 'medium_high': (29, 36, 43), 'high': (36, 43, 99, 99)}
Insulin: {'low': (14, 14, 76, 125), 'medium': (76, 125, 190), 'high': (125, 190, 846, 846)}
BMI: {'underweight': (0, 0, 16, 22), 'healthy_weight': (16, 22, 28), 'overweight': (22, 28, 34), 'obese': (28, 34, 67.1, 67.1)}
DiabetesPedigreeFunction: {'low': (0, 0, 0.25, 0.5), 'medium': (0.25, 0.5, 0.75), 'high': (0.5, 0.75, 2.42, 2.42)}
Age: {'low': (21, 21, 34, 45), 'medium': (34, 45, 56), 'high': (45, 56, 81, 81)}
Outcome: {'low': (0, 0, 1), 'high': (0, 1, 1)}


In [404]:
# Dodawanie reguł
# TODO: add rules and testing
fuzzy.add_rule({'DiabetesPedigreeFunction': 'low | medium',
                'Outcome': 'low'})
fuzzy.add_rule({'DiabetesPedigreeFunction': 'high',
                'Outcome': 'high'})
fuzzy.add_rule({'Age': 'low',
                'Outcome': 'low'})
fuzzy.add_rule({'Age': 'high',
                'Outcome': 'high'})
fuzzy.add_rule({'Insulin': 'low',
                'Outcome': 'low'})
fuzzy.add_rule({'Insulin': 'high',
                'Outcome': 'high'})

In [405]:
# przewidywanie etykiety klasy próbki
value = validation_data.iloc[0]
print(fuzzy.compute(value))
print(value['Outcome'])

{'Pregnancies': {'low': 0.6, 'medium': 0.4, 'high': 0}, 'Glucose': {'low': 0, 'medium_low': 0, 'medium': 0.23076923076923078, 'medium_high': 0.7692307692307693, 'high': 0}, 'BloodPressure': {'low': 0.8, 'medium_low': 0.2, 'medium': 0, 'medium_high': 0, 'high': 0}, 'SkinThickness': {'low': 0, 'medium_low': 0.2857142857142857, 'medium': 0.7142857142857143, 'medium_high': 0, 'high': 0}, 'Insulin': {'low': 0, 'medium': 0, 'high': 1}, 'BMI': {'underweight': 0, 'healthy_weight': 0, 'overweight': 0, 'obese': 1}, 'DiabetesPedigreeFunction': {'low': 0.06800000000000006, 'medium': 0.9319999999999999, 'high': 0}, 'Age': {'low': 1, 'medium': 0, 'high': 0}}
fulfillments=[('low', 0.9319999999999999), ('high', 0), ('low', 1), ('high', 0), ('low', 0), ('high', 1)]
defaultdict(<function Fuzzy.compute.<locals>.<lambda> at 0x000001954CD20A60>, {'low': 1, 'high': 1})
['low', 'high']
counter=[0.16666666666666666, 0.3333333333333333]
denominator=[0.5, 0.5]
('high', 0.5)
0.0
