In [118]:
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler

class CancerClassifier:
    def __init__(self):
        self.breast_labels = None
        self.breast_train = None
        self.breast_validate = None
        self.scaler = StandardScaler()
        self.feature_names = []
        self.quadratic_feature_indices = []
        
    def read_data(self):
        # Wczytanie danych
        self.breast_labels = pd.read_csv("dataset/breast-cancer.labels", header=None)[0].tolist()
        self.breast_train = pd.read_csv("dataset/breast-cancer-train.dat", header=None)
        self.breast_validate = pd.read_csv("dataset/breast-cancer-validate.dat", header=None)
        self.breast_train.columns = self.breast_labels
        self.breast_validate.columns = self.breast_labels
        
        self.feature_names = [col for col in self.breast_train.columns 
                             if col not in ['patient ID', 'Malignant/Benign']]
        
        quadratic_features = ['radius (mean)', 'perimeter (mean)', 'area (mean)', 'symmetry (mean)']
        self.quadratic_feature_indices = [self.feature_names.index(f) for f in quadratic_features]
    
    def _prepare_data(self, df, scaler=None):
        """Przygotowanie danych: usunięcie ID, mapowanie targetu, normalizacja, dodanie biasu"""
        # Kopia danych
        data = df.copy()
        
        if 'patient ID' in data.columns:
            data = data.drop(columns=['patient ID'])
        
        data['Malignant/Benign'] = data['Malignant/Benign'].map({'M': 1, 'B': 0})
        
        y = data['Malignant/Benign'].values
        X = data.drop(columns=['Malignant/Benign']).values
        
        if scaler is None:
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)
        else:
            X_scaled = scaler.transform(X)
        
        X_final = np.c_[np.ones(X_scaled.shape[0]), X_scaled]
        
        return X_final, y, scaler
    
    # Metody z poprzedniego laboratorium
    def create_linear_matrix(self, X):
        """Tworzy macierz cech z dodanym biasem"""
        return np.c_[np.ones(X.shape[0]), X]
    
    def create_quadratic_matrix(self, X):
        """Tworzy macierz cech dla reprezentacji kwadratowej"""
        # Wybór odpowiednich cech
        X_sub = X[:, self.quadratic_feature_indices]
        
        # Terminy liniowe i kwadratowe
        X_squared = X_sub ** 2
        
        # Terminy interakcyjne
        interactions = []
        n_features = X_sub.shape[1]
        for i in range(n_features):
            for j in range(i+1, n_features):
                interactions.append(X_sub[:, i] * X_sub[:, j])
        interactions = np.column_stack(interactions) if interactions else np.zeros((X_sub.shape[0], 0))
        
        # Połączenie wszystkich terminów
        return np.hstack((X_sub, X_squared, interactions))
    
    def create_target_vector(self, df):
        """Tworzy wektor celu (1 dla M, -1 dla B)"""
        return np.where(df['Malignant/Benign'] == 'M', 1, -1)
    
    def compute_weights(self, A, b):
        """Oblicza wagi metodą równania normalnego"""
        ATA = A.T @ A
        ATb = A.T @ b
        return np.linalg.solve(ATA, ATb)
    
    def compute_weights_SVD(self, A, lambda_coeff, b):
        """Oblicza wagi z regularyzacją SVD"""
        ATA = A.T @ A 
        ATA += lambda_coeff * np.identity(A.shape[1])
        ATb = A.T @ b
        return np.linalg.lstsq(ATA, ATb, rcond=None)[0]
    
    # Metody dla GD
    def _compute_learning_rate(self, A):
        """Obliczenie stałej uczenia na podstawie ekstremalnych wartości własnych macierzy A^T A."""
        ATA = A.T @ A
        eigenvalues = np.linalg.eigvals(ATA)
        lambda_min = np.min(eigenvalues)
        lambda_max = np.max(eigenvalues)
        # krok wg wzoru 2 / (λ_max + λ_min)
        return 2.0 / (lambda_max + lambda_min)

    
    def gradient_descent(self, X, y, alpha, max_iters=10000, tol=1e-6):
        """Implementacja gradient_descent"""
        n_samples, n_features = X.shape
        beta = np.zeros(n_features)
        start_ns = time.perf_counter_ns()
        
        for i in range(max_iters):
            # Obliczenie predykcji i błędu
            y_pred = X @ beta
            error = y_pred - y
            
            # Obliczenie gradientu
            gradient = (X.T @ error) / n_samples
            
            # Aktualizacja wag
            beta_new = beta - alpha * gradient
            
            # Warunek stopu
            if np.linalg.norm(beta_new - beta) < tol:
                beta = beta_new
                break
                
            beta = beta_new
        
        time_elapsed_ns = time.perf_counter_ns() - start_ns
        return beta, time_elapsed_ns
    
    def predict_01(self, X, beta, threshold=0.5):
        """Predykcja dla etykiet 0/1"""
        y_continuous = X @ beta
        return (y_continuous >= threshold).astype(int)
    
    def predict_1m(self, X, beta):
        """Predykcja dla etykiet 1/-1"""
        y_continuous = X @ beta
        return np.sign(y_continuous)
    
    def accuracy(self, y_true, y_pred):
        """Obliczenie dokładności"""
        return np.mean(y_true == y_pred)
    
    def run(self):
        # Przygotowanie danych treningowych
        self.read_data()
        X_train, y_train, scaler = self._prepare_data(self.breast_train)
        
        # Przygotowanie danych do walidacji
        X_val, y_val, _ = self._prepare_data(self.breast_validate, scaler)
        
        # Dane bez kolumny biasu dla metod z poprzedniego laboratorium
        X_train_no_bias = X_train[:, 1:]
        X_val_no_bias = X_val[:, 1:]
        
        # Obliczenie stałej uczącej dla GD
        alpha = self._compute_learning_rate(X_train)
        print(f"Obliczona stała ucząca: {alpha:.6f}")
        
        # Gradient Descent (reprezentacja liniowa)
        beta_gd, time_gd_ns = self.gradient_descent(X_train, y_train, alpha)
        y_pred_gd = self.predict_01(X_val, beta_gd)
        acc_gd = self.accuracy(y_val, y_pred_gd)
        
        # Least Squares - Równanie normalne (liniowa)
        A_lin_train = self.create_linear_matrix(X_train_no_bias)
        A_lin_val = self.create_linear_matrix(X_val_no_bias)
        b_train_1m = self.create_target_vector(self.breast_train)
        
        start_ns = time.perf_counter_ns()
        weights_lin_normal = self.compute_weights(A_lin_train, b_train_1m)
        time_lin_normal_ns = time.perf_counter_ns() - start_ns
        y_pred_lin_normal = self.predict_1m(A_lin_val, weights_lin_normal)
        y_pred_lin_normal_01 = (y_pred_lin_normal > 0).astype(int)
        acc_lin_normal = self.accuracy(y_val, y_pred_lin_normal_01)
        
        # Least Squares - Równanie normalne (kwadratowa)
        A_quad_train = self.create_quadratic_matrix(X_train_no_bias)
        A_quad_val = self.create_quadratic_matrix(X_val_no_bias)
        
        start_ns = time.perf_counter_ns()
        weights_quad_normal = self.compute_weights(A_quad_train, b_train_1m)
        time_quad_normal_ns = time.perf_counter_ns() - start_ns
        y_pred_quad_normal = self.predict_1m(A_quad_val, weights_quad_normal)
        y_pred_quad_normal_01 = (y_pred_quad_normal > 0).astype(int)
        acc_quad_normal = self.accuracy(y_val, y_pred_quad_normal_01)
        
        # Least Squares - SVD bez regularyzacji
        start_ns = time.perf_counter_ns()
        weights_lin_svd = self.compute_weights_SVD(A_lin_train, 0.0, b_train_1m)
        time_lin_svd_ns = time.perf_counter_ns() - start_ns
        y_pred_lin_svd = self.predict_1m(A_lin_val, weights_lin_svd)
        y_pred_lin_svd_01 = (y_pred_lin_svd > 0).astype(int)
        acc_lin_svd = self.accuracy(y_val, y_pred_lin_svd_01)
        
        # Least Squares - SVD z regularyzacją
        start_ns = time.perf_counter_ns()
        weights_lin_reg = self.compute_weights_SVD(A_lin_train, 0.001, b_train_1m)
        time_lin_reg_ns = time.perf_counter_ns() - start_ns
        y_pred_lin_reg = self.predict_1m(A_lin_val, weights_lin_reg)
        y_pred_lin_reg_01 = (y_pred_lin_reg > 0).astype(int)
        acc_lin_reg = self.accuracy(y_val, y_pred_lin_reg_01)
        
        # Wyniki
        results = [
            ("Gradient Descent", acc_gd, time_gd_ns, "O(k·n·p)"),
            ("LS (Normal Eq - Linear)", acc_lin_normal, time_lin_normal_ns, "O(n·p² + p³)"),
            ("LS (Normal Eq - Quadratic)", acc_quad_normal, time_quad_normal_ns, "O(n·q² + q³)"),
            ("LS (SVD)", acc_lin_svd, time_lin_svd_ns, "O(min(n²·p, n·p²))"),
            ("LS (Regularized SVD)", acc_lin_reg, time_lin_reg_ns, "O(min(n²·p, n·p²))")
        ]
        
        # Printowanie wyników
        print(f"{'Metoda':<30} | {'Dokładność':<10} | {'Czas [s]':<12} | {'Złożoność teoretyczna'}")
        print("-"*100)
        for method, acc, t_ns, comp in results:
            print(f"{method:<30} | {acc:.6f}   | {t_ns / 10e9 :<12f} | {comp}")
        print("gdzie: k to liczba iteracji - n liczba próbek - p liczba cech. q - liczba cech w macierzy kwadratowej")
        print("="*100)
        
        return results

# Uruchomienie klasyfikatora
if __name__ == "__main__":
    classifier = CancerClassifier()
    classifier.run()


Obliczona stała ucząca: 0.000512
Metoda                         | Dokładność | Czas [s]     | Złożoność teoretyczna
----------------------------------------------------------------------------------------------------
Gradient Descent               | 0.976923   | 0.014721     | O(k·n·p)
LS (Normal Eq - Linear)        | 0.973077   | 0.000009     | O(n·p² + p³)
LS (Normal Eq - Quadratic)     | 0.926923   | 0.000004     | O(n·q² + q³)
LS (SVD)                       | 0.973077   | 0.000020     | O(min(n²·p, n·p²))
LS (Regularized SVD)           | 0.973077   | 0.000019     | O(min(n²·p, n·p²))
gdzie: k to liczba iteracji - n liczba próbek - p liczba cech. q - liczba cech w macierzy kwadratowej
