In [46]:
import pandas as pd
import numpy as np
import seaborn as sns

In [47]:
class ProcessingData:
    """Stateless helpers for preparing a labelled DataFrame.

    Every helper returns new DataFrame objects and leaves its input
    untouched, so re-running a notebook cell yields the same kind of result.
    """

    @staticmethod
    def splitSet(x: pd.DataFrame, k: float) -> "tuple[pd.DataFrame, pd.DataFrame]":
        """Split ``x`` by position into a leading and a trailing part.

        Parameters
        ----------
        x : pd.DataFrame
            Frame to split; rows are taken in their current order.
        k : float
            Fraction of rows that goes into the first part (e.g. ``0.7``).

        Returns
        -------
        tuple[pd.DataFrame, pd.DataFrame]
            ``(xTrain, xVal)`` where ``xTrain`` holds the first
            ``int(len(x) * k)`` rows and ``xVal`` holds the rest.
        """
        n = int(len(x) * k)
        return x.iloc[:n], x.iloc[n:]

    @staticmethod
    def shuffle(x: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``x`` whose rows are in uniformly random order.

        The row order is drawn with ``np.random.permutation`` so every
        ordering is equally likely; seed NumPy's global generator
        (``np.random.seed``) for reproducible runs.  Index labels stay in
        their original positions -- only the row contents move -- and column
        dtypes are preserved.
        """
        order = np.random.permutation(len(x))
        shuffled = x.iloc[order]
        # Keep the caller's index labels in place; only the values are permuted.
        shuffled.index = x.index
        return shuffled

    @staticmethod
    def normalize(x: pd.DataFrame, target: str = 'Outcome') -> pd.DataFrame:
        """Min-max scale every column except ``target`` into ``[0, 1]``.

        Parameters
        ----------
        x : pd.DataFrame
            Frame whose non-target columns are numeric.
        target : str, default 'Outcome'
            Name of the label column, which is left unchanged.

        Returns
        -------
        pd.DataFrame
            A new frame with the scaled feature columns stored as floats.
            A column whose values are all equal is mapped to ``0.0`` instead
            of dividing by zero.  Works with any index, not only the default
            ``0..n-1`` one.
        """
        scaled = x.copy()
        for column in scaled.columns:
            if column == target:
                continue
            values = scaled[column].astype(float)
            low = values.min()
            span = values.max() - low
            if span == 0:
                # Constant column: every value equals the minimum.
                scaled[column] = values - low
            else:
                scaled[column] = (values - low) / span
        return scaled

In [48]:
class soft_set:
    """Builds and evaluates a binary soft set over the feature columns.

    The 'Outcome' column holds the class label (0 or 1); every other column
    is treated as a feature.
    """

    @staticmethod
    def build_soft_set(x: pd.DataFrame) -> dict:
        """Derive one 0/1 flag per feature for each of the classes 0 and 1.

        For a class and a feature, the flag is 0 when more of that class's
        rows lie strictly below the feature's mean (taken over all rows of
        ``x``) than lie at or above it; otherwise -- including ties and a
        class with no rows -- the flag is 1.

        Returns a dict ``{0: {feature: flag, ...}, 1: {feature: flag, ...}}``.
        """
        features = x.drop(['Outcome'], axis=1)
        overall_mean = {name: features[name].mean() for name in features.columns}

        model = {}
        for label in (0, 1):
            members = x.loc[x['Outcome'] == label].drop(['Outcome'], axis=1)
            flags = {}
            for name in features.columns:
                threshold = overall_mean[name]
                is_below = [bool(value < threshold) for value in members[name]]
                below = sum(is_below)
                at_or_above = len(is_below) - below
                flags[name] = 0 if below > at_or_above else 1
            model[label] = flags
        return model

    @staticmethod
    def get_membership(soft_set: dict, vector: pd.Series) -> float:
        """Score ``vector`` against each class and return the highest score.

        A class score starts at 1 and adds ``flag * vector[feature]`` for
        every feature stored for that class in ``soft_set``.
        """
        values = vector.to_dict()
        scores = {}
        for label, flags in soft_set.items():
            total = 1
            for name, flag in flags.items():
                total += flag * values[name]
            scores[label] = total
        return float(max(scores.values()))

In [49]:
class soft_KNN:
    """k-nearest-neighbour voting on a one-dimensional soft-set membership score."""

    @staticmethod
    def calc_euclidian_distance(val1: float, val2: float) -> float:
        """Return the distance between two scalar scores, i.e. ``|val1 - val2|``."""
        return np.abs(val1 - val2)

    @staticmethod
    def clustering(x: pd.DataFrame, sample: pd.Series, k: int) -> int:
        """Predict the class of ``sample`` by majority vote of its neighbours in ``x``.

        A soft set is built from ``x``; every row of ``x`` and the sample are
        reduced to their membership score, and the ``k`` rows whose score is
        closest to the sample's vote with their 'Outcome' label.

        Parameters
        ----------
        x : pd.DataFrame
            Labelled reference rows; must contain an 'Outcome' column.
        sample : pd.Series
            Row to classify, holding the same feature names as ``x``.
        k : int
            Number of neighbours that vote.  Values larger than ``len(x)``
            are clamped to ``len(x)``; values below zero are treated as zero.

        Returns
        -------
        The label with the most votes.  Classes 0 and 1 are always candidates
        and a tie between them (including the no-vote case) resolves to 0.
        """
        soft_x = soft_set.build_soft_set(x)
        # The sample's score does not depend on the row being compared,
        # so it is computed once instead of once per row.
        sample_score = soft_set.get_membership(soft_x, sample)
        distances = [
            soft_KNN.calc_euclidian_distance(
                soft_set.get_membership(soft_x, x.iloc[i]), sample_score
            )
            for i in range(len(x))
        ]
        ranked = x.assign(distance=distances).sort_values(by='distance')

        # Never ask for more neighbours than there are rows.
        k = max(0, min(k, len(ranked)))
        votes = {0: 0, 1: 0}
        for label in ranked['Outcome'].iloc[:k]:
            # .get() lets labels other than 0/1 take part in the vote too.
            votes[label] = votes.get(label, 0) + 1
        return max(votes, key=votes.get)

In [53]:
# Result of building the soft set for the data in diabetes.csv.
# A forward slash keeps the relative path valid on Windows, Linux and macOS;
# the previous raw string contained two literal backslashes.
example = pd.read_csv('diabetes/diabetes.csv')
example = ProcessingData.shuffle(example)
example = ProcessingData.normalize(example)
exampleT, exampleV = ProcessingData.splitSet(example, 0.7)
soft_set_T = soft_set.build_soft_set(exampleT)
soft_set_V = soft_set.build_soft_set(exampleV)
print(soft_set_T)
# Uncomment to compare with the soft set built from the validation part:
#print(soft_set_V)

{0: {'Pregnancies': 0, 'Glucose': 0, 'BloodPressure': 1, 'SkinThickness': 1, 'Insulin': 0, 'BMI': 0, 'DiabetesPedigreeFunction': 0, 'Age': 0}, 1: {'Pregnancies': 1, 'Glucose': 1, 'BloodPressure': 1, 'SkinThickness': 1, 'Insulin': 0, 'BMI': 1, 'DiabetesPedigreeFunction': 0, 'Age': 1}}


In [51]:
# Repeat the shuffle / normalize / split / classify experiment and record
# the validation accuracy (in percent) of every run.
N_RUNS = 100

# The file is read once; each run works on its own copy so that the raw
# data stays intact regardless of what the processing helpers do to it.
# A forward slash keeps the relative path portable across operating systems.
diabetes_raw = pd.read_csv('diabetes/diabetes.csv')

arr_of_results = []
for f in range(N_RUNS):
    diabetes = ProcessingData.shuffle(diabetes_raw.copy())
    diabetes = ProcessingData.normalize(diabetes)
    diabetesT, diabetesV = ProcessingData.splitSet(diabetes, 0.7)

    # Number of voting neighbours: square root of the validation-set size.
    n_neighbours = int(np.sqrt(len(diabetesV)))

    acc = 0
    for i in range(0, len(diabetesV)):
        row = diabetesV.iloc[i]
        if row.Outcome == soft_KNN.clustering(diabetesT, row, n_neighbours):
            acc += 1

    accuracy = acc/len(diabetesV)*100
    print(f"Accuracy {f}:", accuracy)
    arr_of_results.append(accuracy)

Accuracy 0: 64.93506493506493
Accuracy 1: 64.5021645021645
Accuracy 2: 65.80086580086581
Accuracy 3: 70.12987012987013
Accuracy 4: 67.53246753246754
Accuracy 5: 67.53246753246754
Accuracy 6: 67.53246753246754
Accuracy 7: 67.96536796536796
Accuracy 8: 72.72727272727273
Accuracy 9: 70.56277056277057
Accuracy 10: 65.36796536796537
Accuracy 11: 67.96536796536796
Accuracy 12: 63.63636363636363
Accuracy 13: 68.83116883116884
Accuracy 14: 73.16017316017316
Accuracy 15: 67.53246753246754
Accuracy 16: 66.66666666666666
Accuracy 17: 67.09956709956711
Accuracy 18: 73.16017316017316
Accuracy 19: 70.56277056277057
Accuracy 20: 66.66666666666666
Accuracy 21: 70.995670995671
Accuracy 22: 70.995670995671
Accuracy 23: 70.995670995671
Accuracy 24: 68.83116883116884
Accuracy 25: 68.83116883116884
Accuracy 26: 65.36796536796537
Accuracy 27: 65.36796536796537
Accuracy 28: 66.66666666666666
Accuracy 29: 70.995670995671
Accuracy 30: 63.63636363636363
Accuracy 31: 68.83116883116884
Accuracy 32: 64.06926406926

In [52]:
print(arr_of_results)
# TODO: ask when the project can be handed in
# TODO: visualise the results
# TODO: analyse the data

[64.93506493506493, 64.5021645021645, 65.80086580086581, 70.12987012987013, 67.53246753246754, 67.53246753246754, 67.53246753246754, 67.96536796536796, 72.72727272727273, 70.56277056277057, 65.36796536796537, 67.96536796536796, 63.63636363636363, 68.83116883116884, 73.16017316017316, 67.53246753246754, 66.66666666666666, 67.09956709956711, 73.16017316017316, 70.56277056277057, 66.66666666666666, 70.995670995671, 70.995670995671, 70.995670995671, 68.83116883116884, 68.83116883116884, 65.36796536796537, 65.36796536796537, 66.66666666666666, 70.995670995671, 63.63636363636363, 68.83116883116884, 64.06926406926407, 61.471861471861466, 72.72727272727273, 66.23376623376623, 63.20346320346321, 71.42857142857143, 63.63636363636363, 74.45887445887446, 69.6969696969697, 68.3982683982684, 70.12987012987013, 65.80086580086581, 66.66666666666666, 67.96536796536796, 69.6969696969697, 66.66666666666666, 67.53246753246754, 63.63636363636363, 66.23376623376623, 65.80086580086581, 65.36796536796537, 77.