In [17]:
from ucimlrepo import fetch_ucirepo 
import numpy as np
from itertools import combinations
  
# Fetch dataset
W = fetch_ucirepo(id = 17)

# Data (features and targets as pandas DataFrames)
X = W.data.features
y = W.data.targets

In [18]:
print("Feature space dimensions:", X.shape)

# Optional: explicitly print the number of samples and features
n_samples, n_features = X.shape
print("Number of samples:", n_samples)
print("Number of features:", n_features)

Feature space dimensions: (569, 30)
Number of samples: 569
Number of features: 30


In [19]:
from sklearn.decomposition import PCA


In [20]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

  y = column_or_1d(y, warn=True)


In [21]:
from sklearn.preprocessing import StandardScaler


# Fetch dataset
# wine = fetch_ucirepo(id=109)
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Data (features and targets as pandas DataFrames)
# X = wine.data.features
# y = wine.data.targets
X = breast_cancer_wisconsin_diagnostic.data.features
y = breast_cancer_wisconsin_diagnostic.data.targets

In [22]:
from metric_learn import LMNN, NCA, ITML_Supervised
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.3, random_state=42)
y_train = np.ravel(y_train)  # Convert to shape (n_samples,)
y_test = np.ravel(y_test)

In [23]:
import warnings

# Suppress the specific FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning)

In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from metric_learn import LMNN
import numpy as np
import matplotlib.pyplot as plt


In [None]:
def evaluate_lmnn(X_train, y_train, number_components):   

    best_acurracy = 0
    best_lambda = None

    # Setting 5-Fold cross validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    lambdas = np.linspace(0, 1, 20)

    results = []

    # Performing cross-validation on the training data
    for reg in lambdas:

        fold_accuracies = []  
        
        for train_idx, val_idx in cv.split(X_train, y_train):
            
            # Spliting data into current training and validation set for this fold
            X_train_cv, X_val_cv = X_train[train_idx], X_train[val_idx]
            y_train_cv, y_val_cv = y_train[train_idx], y_train[val_idx]
            lmnn = LMNN(k=7, n_components=number_components, max_iter=500,learn_rate=1e-6, regularization=reg)
            lmnn.fit(X_train_cv, y_train_cv)

            # Transforming data
            X_train_lmnn = lmnn.transform(X_train_cv)
            X_val_lmnn = lmnn.transform(X_val_cv)

            # Training kNN on the transformed data
            knn = KNeighborsClassifier(n_neighbors=7)
            knn.fit(X_train_lmnn, y_train_cv)

            # Prediction on the validation set
            y_pred = knn.predict(X_val_lmnn)
            acc = accuracy_score(y_val_cv, y_pred)
            fold_accuracies.append(acc)  # Store fold accuracy

        # Calculating average accuracy across the folds for this lambda
        mean_acc = np.mean(fold_accuracies)
        results.append((reg, mean_acc))  

        if(mean_acc > best_acurracy):
            best_acurracy = mean_acc
            best_lambda = reg

    # Plot of the results
    lambdas, accuracies = zip(*results)
    plt.figure(figsize=(8, 6))
    plt.plot(lambdas, accuracies, marker='o', linestyle='-', color='b')
    plt.title(f'Cross-Validation Results with {number_components} Principal Components', fontsize=14)
    plt.xlabel('Lambda', fontsize=12)
    plt.ylabel('Mean Accuracy', fontsize=12)
    plt.grid(True)
    plt.show()

    return best_lambda,best_acurracy


In [11]:
def evaluate_itml(X_train, y_train):   
    from numpy.linalg import LinAlgError
    best_acurracy = 0
    best_gamma = None

    # Setting 5-Fold cross validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    gammas = np.logspace(-3, 3, 20)

    results = []

    # Performing cross-validation on the training data
    for gam in gammas:
        
        fold_accuracies = []  

        for train_idx, val_idx in cv.split(X_train, y_train):
            
            # Spliting data into current training and validation set for this fold
            X_train_cv, X_val_cv = X_train[train_idx], X_train[val_idx]
            y_train_cv, y_val_cv = y_train[train_idx], y_train[val_idx]
            
            try:
                itml = ITML_Supervised(gamma=gam,random_state=42) 
                itml.fit(X_train_cv, y_train_cv)
                X_train_itml = itml.transform(X_train_cv)
                X_val_itml = itml.transform(X_val_cv)
            except LinAlgError:
                continue

            # Training kNN on the transformed data
            knn = KNeighborsClassifier(n_neighbors=3)
            knn.fit(X_train_itml, y_train_cv)

            # Prediction on the validation set
            y_pred = knn.predict(X_val_itml)
            acc = accuracy_score(y_val_cv, y_pred)
            fold_accuracies.append(acc)  # Store fold accuracy

        # Calculating average accuracy across the folds for this gamma
        mean_acc = np.mean(fold_accuracies)
        results.append((gam, mean_acc))  

        if(mean_acc > best_acurracy):
            best_acurracy = mean_acc
            best_gamma = gam

    # Plot of the results
    gammas, accuracies = zip(*results)
    plt.figure(figsize=(8, 6))
    plt.plot(gammas, accuracies, marker='o', linestyle='-', color='b')
    plt.title(f'Cross-Validation Results', fontsize=14)
    plt.xlabel('Gamma', fontsize=12)
    plt.ylabel('Mean Accuracy', fontsize=12)
    plt.grid(True)
    plt.show()

    return best_gamma,best_acurracy


In [12]:
def evaluate_nca(X_train, y_train,number_components):   

    best_acurracy = 0
    best_k = None

    # Setting 5-Fold cross validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    Neighbours = [1,2,3,4,5,6,7,8,9,10]


    # Performing cross-validation on the training data
    for N_Neighbours in Neighbours:
        
        fold_accuracies = []  

        for train_idx, val_idx in cv.split(X_train, y_train):
            
            # Spliting data into current training and validation set for this fold
            X_train_cv, X_val_cv = X_train[train_idx], X_train[val_idx]
            y_train_cv, y_val_cv = y_train[train_idx], y_train[val_idx]
            
            #Transforming Data - inner and outer loops can be interchanged

            nca = NCA(n_components=number_components)
            nca.fit(X_train_cv,y_train_cv)
            X_train_nca = nca.transform(X_train_cv)
            X_val_nca = nca.transform(X_val_cv)
            

            # Training kNN on the transformed data
            knn = KNeighborsClassifier(n_neighbors=N_Neighbours)
            knn.fit(X_train_nca, y_train_cv)

            # Prediction on the validation set
            y_pred = knn.predict(X_val_nca)
            acc = accuracy_score(y_val_cv, y_pred)
            fold_accuracies.append(acc)  # Store fold accuracy

        # Calculating average accuracy across the folds for this gamma
        mean_acc = np.mean(fold_accuracies)
          
        if(mean_acc > best_acurracy):
            best_acurracy = mean_acc
            best_k = N_Neighbours

    return best_k


In [26]:
def plotwithpca(X, y,title):
    pca = PCA(n_components=2,random_state=42)
    X_pca = pca.fit_transform(X)

    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.7)
    plt.colorbar(scatter, label="Class Labels")
    plt.title(title)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.grid(True)
    plt.show()

In [30]:
# Fetch dataset
rtiot = fetch_ucirepo(id=942)

In [32]:
print(rtiot.keys())  # Should give something like: dict_keys(['data'])
print(rtiot['data'].keys())  # Should give: dict_keys(['ids', 'features'])


dict_keys(['data', 'metadata', 'variables'])
dict_keys(['ids', 'features', 'targets', 'original', 'headers'])


In [41]:
features_df = rtiot['data']['features']
print(features_df.head())  # Preview the first few rows


   id.orig_p  id.resp_p proto service  flow_duration  fwd_pkts_tot  \
0      38667       1883   tcp    mqtt      32.011598             9   
1      51143       1883   tcp    mqtt      31.883584             9   
2      44761       1883   tcp    mqtt      32.124053             9   
3      60893       1883   tcp    mqtt      31.961063             9   
4      51087       1883   tcp    mqtt      31.902362             9   

   bwd_pkts_tot  fwd_data_pkts_tot  bwd_data_pkts_tot  fwd_pkts_per_sec  ...  \
0             5                  3                  3          0.281148  ...   
1             5                  3                  3          0.282277  ...   
2             5                  3                  3          0.280164  ...   
3             5                  3                  3          0.281593  ...   
4             5                  3                  3          0.282111  ...   

    active.avg  active.std     idle.min     idle.max     idle.tot  \
0  2282414.913         0.0  2

In [33]:
# Data (features and targets as pandas DataFrames)
X = rtiot.data.features
y = rtiot.data.targets

In [34]:
# Split the dataset
n_splits = 1
# Scale features
scaler = StandardScaler()

accuracies_knn = []
accuracies_knn_scaled = []
accuracies_lmnn = []
accuracies_nca = []
accuracies_itml = []

for n in range(n_splits):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    y_train = np.ravel(y_train)  # Convert to shape (n_samples,)
    y_test = np.ravel(y_test)
    # Generate pairs and labels for the training data
    pairs_train, labels_train = create_pairs_iloc(X_train, y_train)

    # LMNN
    lmnn = LMNN(k=3, learn_rate=1e-6)
    lmnn.fit(X_train, y_train)
    X_train_lmnn = lmnn.transform(X_train)
    X_test_lmnn = lmnn.transform(X_test)

    # NCA
    nca = NCA(max_iter=1000)
    nca.fit(X_train, y_train)
    X_train_nca = nca.transform(X_train)
    X_test_nca = nca.transform(X_test)

    # ITML
    itml = ITML()
    itml.fit(pairs_train, labels_train)
    # Transform the training and testing data (not pairs anymore)
    X_train_itml = itml.transform(X_train)
    X_test_itml = itml.transform(X_test)

    # Train a k-NN classifier on the transformed data
    knn = KNeighborsClassifier(n_neighbors=1)

    # Evaluate k-NN on original data
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies_knn.append(acc)

    knn.fit(X_train_scaled, y_train)
    y_pred_scaled = knn.predict(X_test_scaled)
    acc_scaled = accuracy_score(y_test, y_pred_scaled)
    accuracies_knn_scaled.append(acc_scaled)

    # Evaluate LMNN
    knn.fit(X_train_lmnn, y_train)
    y_pred_lmnn = knn.predict(X_test_lmnn)
    lmnn_acc = accuracy_score(y_test, y_pred_lmnn)
    accuracies_lmnn.append(lmnn_acc)

    # Evaluate NCA
    knn.fit(X_train_nca, y_train)
    y_pred_nca = knn.predict(X_test_nca)
    nca_acc = accuracy_score(y_test, y_pred_nca)
    accuracies_nca.append(nca_acc)

    # Evaluate ITML
    knn.fit(X_train_itml, y_train)
    y_pred_itml = knn.predict(X_test_itml)
    itml_acc = accuracy_score(y_test, y_pred_itml)
    accuracies_itml.append(itml_acc)

ValueError: could not convert string to float: 'tcp'