In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import maxabs_scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from sklearn.base import clone
from matplotlib import colors
import copy

In [2]:
path = "../features/"
names = ["betweenness", "community_count", "community_size", "ks_level_1", "ks_level_2"]
used_names = ["ks_level_1", "ks_level_2"]
np.random.seed(42)
samples_per_class = 35
model = LogisticRegression(random_state=42)

In [3]:
# Load features from .npz files
original_labels = np.load(path + names[0] + ".npz")['y']
original_features = np.load(path + names[0] + ".npz")['X']
columns = []
for filename in names:
    file_path = path + filename + ".npz"
    data = np.load(file_path)
    feature_matrix = maxabs_scale(data['X'], axis=1)
    columns.append(feature_matrix.shape[1])
    original_features = np.concatenate((original_features, feature_matrix), axis=1)

# Over-sample
original_samples_per_class = {label: np.sum(original_labels == label) for label in np.unique(original_labels)}
sampling_strategy = {label: max(samples_per_class, original_samples) for label, original_samples in original_samples_per_class.items()}
ros = RandomOverSampler(random_state=42, sampling_strategy=sampling_strategy)
oversampled_features, oversampled_labels = ros.fit_resample(original_features, original_labels)
# Under-sample
updated_samples_per_class = {label: np.sum(oversampled_labels == label) for label in np.unique(original_labels)}
sampling_strategy = {label: min(samples_per_class, original_samples) for label, original_samples in updated_samples_per_class.items()}
rus = RandomUnderSampler(random_state=42, sampling_strategy=sampling_strategy)
undersampled_features, labels = rus.fit_resample(oversampled_features, oversampled_labels)

# Build features list
all_features = []
current = 0
for c in columns:
    feature = undersampled_features[:, current:current + c]
    all_features.append(feature)
    current += c

In [4]:
# # set thresholds as hyperparameters
# L1_threshold = thr_data[:, 0].mean()*0.5
# L2_threshold = 8

In [5]:
# # Feature selection based on a specific criterion

# # Compute the threshold
# threshold_feature = "ks_level_1"

# thr_data = features[names.index(threshold_feature)]

# percentiles = np.percentile(np.mean(thr_data, axis=0), np.linspace(50, 0, 2))

# accuracies_filt = []
# percentages = []
# for perc in percentiles:
#     filtered_coefficients = np.where(np.mean(thr_data, axis=0) > perc)[0]
#     if filtered_coefficients.size == 0:
#         continue
#     X = np.hstack([f[:, filtered_coefficients] if f.shape[1] >= filtered_coefficients.max() + 1 else f for f in features])
#     data_percentage = filtered_coefficients.shape[0]/thr_data.shape[1]

#     # Perform logistic regression and compute accuracy
#     X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42, stratify=labels)
#     model_ = clone(model)
#     model_.fit(X_train, y_train)
#     y_pred = model_.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)

#     print(f"Accuracy = {accuracy*100:4.2f}; retained features = {data_percentage*100:4.2f}%")
#     accuracies_filt.append(accuracy)
#     percentages.append(data_percentage)

In [6]:
def evaluate(X):
    # Perform logistic regression and compute accuracy
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42, stratify=labels)
    model_ = clone(model)
    model_.fit(X_train, y_train)
    y_pred = model_.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [7]:
ks1 = all_features[names.index("ks_level_1")]
ks2 = all_features[names.index("ks_level_2")]
features = [all_features[names.index(f)] for f in used_names]

l1_perc = np.percentile(np.mean(ks1, axis=0), 50)
l2_perc = np.percentile(np.mean(ks2, axis=0), 50)


In [8]:

# class 1: ks[:,0] < L1_threshold and ks[:,1] < L2_threshold
filtered_coefficients = np.where((np.mean(ks1, axis=0) < l1_perc) & (np.mean(ks2, axis=0) < l2_perc))[0]
if filtered_coefficients.size > 0:
    X = np.hstack([f[:, filtered_coefficients] if f.shape[1] >= filtered_coefficients.max() + 1 else f for f in features])
    accuracy = evaluate(X)
    data_percentage = (filtered_coefficients.shape[0]/ks1.shape[1])*(filtered_coefficients.shape[0]/ks2.shape[1])
    print(f"Class 1 accuracy = {accuracy*100:4.2f}%; retained features = {data_percentage*100:4.2f}%")

# class 2: ks[:,0] < L1_threshold and ks[:,1] > L2_threshold
filtered_coefficients = np.where((np.mean(ks1, axis=0) < l1_perc) & (np.mean(ks2, axis=0) > l2_perc))[0]
if filtered_coefficients.size > 0:
    X = np.hstack([f[:, filtered_coefficients] if f.shape[1] >= filtered_coefficients.max() + 1 else f for f in features])
    accuracy = evaluate(X)
    data_percentage = (filtered_coefficients.shape[0]/ks1.shape[1])*(filtered_coefficients.shape[0]/ks2.shape[1])
    print(f"Class 2 accuracy = {accuracy*100:4.2f}%; retained features = {data_percentage*100:4.2f}%")

# class 3: ks[:,0] > L1_threshold and ks[:,1] < L2_threshold
filtered_coefficients = np.where((np.mean(ks1, axis=0) > l1_perc) & (np.mean(ks2, axis=0) < l2_perc))[0]
if filtered_coefficients.size > 0:
    X = np.hstack([f[:, filtered_coefficients] if f.shape[1] >= filtered_coefficients.max() + 1 else f for f in features])
    accuracy = evaluate(X)
    data_percentage = (filtered_coefficients.shape[0]/ks1.shape[1])*(filtered_coefficients.shape[0]/ks2.shape[1])
    print(f"Class 3 accuracy = {accuracy*100:4.2f}%; retained features = {data_percentage*100:4.2f}%")

# class 4: ks[:,0] > L1_threshold and ks[:,1] > L2_threshold
filtered_coefficients = np.where((np.mean(ks1, axis=0) > l1_perc) & (np.mean(ks2, axis=0) > l2_perc))[0]
if filtered_coefficients.size > 0:
    X = np.hstack([f[:, filtered_coefficients] if f.shape[1] >= filtered_coefficients.max() + 1 else f for f in features])
    accuracy = evaluate(X)
    data_percentage = (filtered_coefficients.shape[0]/ks1.shape[1])*(filtered_coefficients.shape[0]/ks2.shape[1])
    print(f"Class 4 accuracy = {accuracy*100:4.2f}%; retained features = {data_percentage*100:4.2f}%")

Class 1 accuracy = 17.76%; retained features = 21.87%
Class 2 accuracy = 1.57%; retained features = 0.08%
Class 3 accuracy = 1.68%; retained features = 0.08%
Class 4 accuracy = 80.35%; retained features = 21.87%
