<a href="https://colab.research.google.com/github/AmpF5/AmpF5/blob/main/eksploracja_danych_klasyfikacja.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [78]:
import numpy as np
from scipy.spatial.distance import cdist
from collections import Counter

# Dane
data = np.array([
    [1, 5.3, 2],
    [2.8, 7.6, 1],
    [4.2, 9.3, 2],
    [1.5, 3.1, 1],
    [9.8, 7.5, 2],
    [6.1, 0.5, 2],
    [4.7, 8.9, 2],
    [1.2, 8, 1],
    [8.2, 3.3, 1],
    [6.4, 5.5, 1],
])

# Punkt do klasyfikacji
target_point = np.array([5, 5])

# Podział danych na cechy (X) i etykiety (y)
X = data[:, :2]
y = data[:, 2]

# Funkcja klasyfikująca kNN
def knn_classify(X, y, target_point, k, metric):
    distances = cdist(X, [target_point], metric=metric).flatten()
    nearest_indices = distances.argsort()[:k]
    nearest_labels = y[nearest_indices]
    most_common_label = Counter(nearest_labels).most_common(1)[0][0]
    return most_common_label

# Miary odległości
metrics = ["euclidean", "cityblock", "chebyshev", "mahalanobis"]

# Macierz kowariancji dla Mahalanobisa
cov_matrix = np.cov(X, rowvar=False)
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Klasyfikacja dla różnych wartości k i miar odległości
k_values = [3, 5, 7]
results = {}

for metric in metrics:
    results[metric] = {}
    for k in k_values:
      label = knn_classify(X, y, target_point, k, metric=metric)
      results[metric][k] = label
results


{'euclidean': {3: 1.0, 5: 1.0, 7: 1.0},
 'cityblock': {3: 2.0, 5: 1.0, 7: 1.0},
 'chebyshev': {3: 1.0, 5: 1.0, 7: 1.0},
 'mahalanobis': {3: 1.0, 5: 1.0, 7: 1.0}}

In [100]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict

def calculate_metrics(y_true, y_pred, all_labels):
  # Macierz pomyłek
  cm = confusion_matrix(y_true, y_pred, labels=all_labels)
  TP = cm[1, 1] if cm.shape[0] > 1 and cm.shape[1] > 1 else 0
  TN = cm[0, 0] if cm.shape[0] > 0 and cm.shape[1] > 0 else 0
  FP = cm[0, 1] if cm.shape[0] > 0 and cm.shape[1] > 1 else 0
  FN = cm[1, 0] if cm.shape[0] > 1 and cm.shape[1] > 0 else 0

  average = 'weighted'

  accuracy = accuracy_score(y_true, y_pred)
  sensitivity = recall_score(y_true, y_pred, average='binary', labels=all_labels, pos_label=2)
  specificity = TN / (TN + FP)
  auc = roc_auc_score(y_true, y_pred, average=average, labels=all_labels)
  precision = precision_score(y_true, y_pred, average=average, labels=all_labels)
  f1 = f1_score(y_true, y_pred, average=average, labels=all_labels)

  return cm, accuracy, sensitivity, specificity, auc, f1, precision

# Dane
data = np.array([
    [1, 5.3, 2],
    [2.8, 7.6, 1],
    [4.2, 9.3, 2],
    [1.5, 3.1, 1],
    [9.8, 7.5, 2],
    [6.1, 0.5, 2],
    [4.7, 8.9, 2],
    [1.2, 8, 1],
    [8.2, 3.3, 1],
    [6.4, 5.5, 1],
])
X = data[:, :2]
y = data[:, 2]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

ks = [1, 3, 5, 7]
methods = ['Resubstitution', 'Train/Test Split', 'Cross-Validation', 'Leave-One-Out']
results = []

all_labels = np.unique(y)

for method in methods:
  for k in ks:
    model = KNeighborsClassifier(n_neighbors=k)
    if method == 'Resubstitution':
      model.fit(X, y)
      y_pred = model.predict(X)
      cm, accuracy, sensitivity, specificity, auc, precision, f1 = calculate_metrics(y, y_pred, all_labels)
      results.append([k, method, cm, accuracy, sensitivity, specificity, auc, f1, precision])
    elif method == 'Train/Test Split':
      # Podzielenie danych na testowe i uczące
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
      cm, accuracy, sensitivity, specificity, auc, precision, f1 = calculate_metrics(y_test, y_pred, all_labels)
      results.append([k, method, cm, accuracy, sensitivity, specificity, auc, f1, precision])
    elif method == 'Cross-Validation':
      # Ustawienie splita dla cross-validation
      cv = KFold(n_splits=6, shuffle=True, random_state=42)
      scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
      y_pred = cross_val_predict(model, X, y, cv=cv)
      cm, accuracy, sensitivity, specificity, auc, precision, f1 = calculate_metrics(y, y_pred, all_labels)
      results.append([k, method, cm, accuracy, sensitivity, specificity, auc, f1, precision])
    elif method == 'Leave-One-Out':
      y_pred = cross_val_predict(model, X, y, cv=LeaveOneOut())
      cm, accuracy, sensitivity, specificity, auc, precision, f1 = calculate_metrics(y, y_pred, all_labels)
      results.append([k, method, cm, accuracy, sensitivity, specificity, auc, f1, precision])


# Wyświetlanie wyników
import pandas as pd
df = pd.DataFrame(results, columns=['k', 'Method', 'cm', 'accuracy', 'sensitivity', 'specificity', 'auc', 'f1', 'precision'])
df


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,k,Method,cm,accuracy,sensitivity,specificity,auc,f1,precision
0,1,Resubstitution,"[[5, 0], [0, 5]]",1.0,1.0,1.0,1.0,1.0,1.0
1,3,Resubstitution,"[[5, 0], [3, 2]]",0.7,0.4,1.0,0.7,0.8125,0.67033
2,5,Resubstitution,"[[2, 3], [4, 1]]",0.3,0.2,0.4,0.3,0.291667,0.292929
3,7,Resubstitution,"[[4, 1], [2, 3]]",0.7,0.6,0.8,0.7,0.708333,0.69697
4,1,Train/Test Split,"[[2, 0], [0, 1]]",1.0,1.0,1.0,1.0,1.0,1.0
5,3,Train/Test Split,"[[0, 2], [1, 0]]",0.0,0.0,0.0,0.0,0.0,0.0
6,5,Train/Test Split,"[[1, 1], [0, 1]]",0.666667,1.0,0.5,0.75,0.833333,0.666667
7,7,Train/Test Split,"[[0, 2], [0, 1]]",0.333333,1.0,0.0,0.5,0.111111,0.166667
8,1,Cross-Validation,"[[4, 1], [3, 2]]",0.6,0.4,0.8,0.6,0.619048,0.583333
9,3,Cross-Validation,"[[2, 3], [4, 1]]",0.3,0.2,0.4,0.3,0.291667,0.292929
