In [6]:
import pickle
import numpy as np
import os

In [7]:
with open('../data/features/img_names.pkl', 'rb') as f:
    img_names = pickle.load(f)
with open('../data/features/X.pkl', 'rb') as f:
    X = np.array(pickle.load(f))
with open('../data/features/y.pkl', 'rb') as f:
    y = np.array(pickle.load(f))
with open('../data/training/training_splits.pkl', 'rb') as f:
    training_splits = pickle.load(f)
with open('../data/training/validation_splits.pkl', 'rb') as f:
    validation_splits = pickle.load(f)
with open('../data/training/test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)

In [8]:
def img_name_to_features(imgs: list[list[str]]):
    indices = [img_names.index(img) for img in imgs]
    return [X[indices], y[indices]]

In [9]:
training_feat_splits = [img_name_to_features(split) for split in training_splits]
validation_feat_splits = [img_name_to_features(split) for split in validation_splits]
test_feat = img_name_to_features(test_data)

In [14]:
# Normalise training data
for i in range(len(training_feat_splits)):
    # Normalise training data
    train_means = np.mean(training_feat_splits[i][0], axis=0)
    train_stds = np.std(training_feat_splits[i][0], axis=0)
    training_feat_splits[i][0] = (training_feat_splits[i][0] - train_means) / train_stds
    # Normalise validation data
    val_means = np.mean(validation_feat_splits[i][0], axis=0)
    val_stds = np.std(validation_feat_splits[i][0], axis=0)
    validation_feat_splits[i][0] = (validation_feat_splits[i][0] - val_means) / val_stds

# Normalise test data
mean = np.mean(test_feat[0], axis=0)
std = np.std(test_feat[0], axis=0)
test_feat[0] = (test_feat[0] - mean) / std

In [15]:
# Extract only the first 8 features
training_feat_splits_less_features = [(X[:, :8], y) for X, y in training_feat_splits]
validation_feat_splits_less_features = [(X[:, :8], y) for X, y in validation_feat_splits]
test_feat_less_features = (test_feat[0][:, :8], test_feat[1])

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Remove features with low variance
selector = VarianceThreshold(threshold=0.1)
training_feat_splits_less_features_pruned = [(selector.fit_transform(X), y) for X, y in training_feat_splits_less_features]
validation_feat_splits_less_features_pruned = [(selector.transform(X), y) for X, y in validation_feat_splits_less_features]
test_feat_less_features_pruned = (selector.transform(test_feat_less_features[0]), test_feat_less_features[1])

# Print difference
print(f"Original shape: {training_feat_splits_less_features[0][0].shape}")
print(f"Pruned shape: {training_feat_splits_less_features_pruned[0][0].shape}")

In [16]:
from sklearn.neighbors import KNeighborsClassifier

def f1(y_true, y_pred):
    # Calculate F1 score
    tp = np.sum(y_true * y_pred)
    fp = np.sum((1 - y_true) * y_pred)
    fn = np.sum(y_true * (1 - y_pred))
    return 2 * tp / (2 * tp + fp + fn)

def train_and_evaluate_model(X_train, y_train, X_val, y_val, k):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    # Get F1 score on validation set
    y_pred = model.predict(X_val)
    return f1(y_val, y_pred)

In [17]:
# Train and evaluate model on validation set
k = 15
confusion_matrices = []
for i in range(len(training_feat_splits)):
    X_train, y_train = training_feat_splits[i]
    X_val, y_val = validation_feat_splits[i]
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    TP = np.sum(y_val * y_pred)
    FP = np.sum((1 - y_val) * y_pred)
    FN = np.sum(y_val * (1 - y_pred))
    TN = np.sum((1 - y_val) * (1 - y_pred))
    confusion_matrices.append(np.array([[TP, FP], [FN, TN]]))

# Average confusion matrix
confusion_matrix = np.mean(confusion_matrices, axis=0)
TP, FP, FN, TN = confusion_matrix.flatten()
# print confusion matrix
print(f'Confusion matrix:\n{confusion_matrix}')

Confusion matrix:
[[181.   47.4]
 [ 33.4 136.6]]


In [18]:
# Train and evaluate model on validation set
k = 15
confusion_matrices = []
for i in range(len(training_feat_splits_less_features)):
    X_train, y_train = training_feat_splits_less_features[i]
    X_val, y_val = validation_feat_splits_less_features[i]
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    TP = np.sum(y_val * y_pred)
    FP = np.sum((1 - y_val) * y_pred)
    FN = np.sum(y_val * (1 - y_pred))
    TN = np.sum((1 - y_val) * (1 - y_pred))
    confusion_matrices.append(np.array([[TP, FP], [FN, TN]]))

# Average confusion matrix
confusion_matrix = np.mean(confusion_matrices, axis=0)
TP, FP, FN, TN = confusion_matrix.flatten()
print(f'Confusion matrix:\n{confusion_matrix}')

Confusion matrix:
[[173.6  46.6]
 [ 40.8 137.4]]
