In [10]:
#Import packages

import numpy as np
from numpy.typing import NDArray
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

#Auxiliary functions

def onehot2cat(y:NDArray) -> NDArray:
    '''Collapse one-hot encoded labels into categorical class indices.

    For every row of ``y`` the position of the largest entry along axis 1
    is returned, so the result has one integer per row.
    '''
    categorical = np.argmax(y, axis=1)
    return categorical


def loadDataset(fn:str, toCat:bool=False) -> tuple[NDArray, NDArray]:
    '''Load a pickled dataset and return its features and labels.

    The pickle file is expected to hold a mapping with the keys ``'X'``
    (features) and ``'Y'`` (labels).

    Parameters:
    - fn: Path of the pickle file to read.
    - toCat: When True, ``data['Y']`` is passed through ``onehot2cat`` so the
      labels come back as categorical indices; otherwise ``data['Y']`` is
      returned unchanged.

    Returns:
    - ``(X, y)`` tuple with the features and the (optionally converted) labels.

    Raises:
    - OSError if the file cannot be opened.
    - KeyError if the pickled object lacks the ``'X'`` or ``'Y'`` entry.
    '''
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only use this on dataset files from a trusted source.
    with open(fn, 'rb') as f:
        data = pickle.load(f)

    X = data['X']
    y = onehot2cat(data['Y']) if toCat else data['Y']

    return X, y


def saveSKLModel(fn:str, model) -> None:
    '''Serialize ``model`` with pickle and write it to the file ``fn``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    '''
    with open(fn, 'wb') as sink:
        pickle.dump(model, sink)
        
# Load the dataset, converting the one-hot labels to categorical indices,
# then show the shapes of the feature matrix and the label vector.

fnt = 'wtdt-part.pickle'
X, y = loadDataset(fnt, toCat=True)
print(X.shape, y.shape, sep='\n')

(14000, 787)
(14000,)


In [17]:
# Fit the random forest on the full dataset and persist it.
# NOTE(review): the hyperparameter values presumably come from an earlier
# search that is not part of this notebook - confirm their provenance.
# random_state is pinned because the forest is built with randomness;
# without a seed every run would save a different model.
model = RandomForestClassifier(
    n_estimators=278,
    max_depth=9,
    max_features='log2',
    min_samples_split=6,
    min_samples_leaf=1,
    random_state=42,
)
model.fit(X, y)
saveSKLModel("T1-randomForest.pickle", model)

In [22]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

def objective_function(model, X, y, weight_accuracy=0.7, weight_stability=0.2, weight_class_penalty=0.1, target_class=None, n_splits=10, random_state=42):
    """
    Objective function that balances accuracy with feature importance stability and class-specific penalties.

    The model is evaluated with shuffled stratified K-fold cross-validation.
    For every fold the validation accuracy and the fitted feature importances
    are recorded; when ``target_class`` is given, the fraction of validation
    samples of that class that the model predicts correctly is recorded too.

    NOTE: ``model`` is refit in place on every fold, so after this call it is
    left fitted on the training part of the last fold only. Refit it on the
    full data before using or saving it.

    Parameters:
    - model: Estimator exposing ``fit``, ``score``, ``predict`` and
      ``feature_importances_`` (e.g., RandomForestClassifier).
    - X: The feature matrix (must support indexing with an index array).
    - y: The target variable (must support indexing with an index array).
    - weight_accuracy: Weight for accuracy in the final combined score.
    - weight_stability: Weight for feature importance stability in the final score.
    - weight_class_penalty: Weight for penalizing the performance on a specific class.
    - target_class: The specific class you want to penalize (if any). Any label,
      including ``0``, is accepted; ``None`` disables the penalty.
    - n_splits: Number of cross-validation folds.
    - random_state: Seed for the fold shuffling.

    Returns:
    - Combined score based on accuracy, stability, and class penalty.

    Raises:
    - ValueError: if ``target_class`` is given but never occurs in ``y``.
    """
    # Fail early: a penalty for a class that is absent from y is undefined.
    if target_class is not None and not np.any(np.asarray(y) == target_class):
        raise ValueError(f"target_class {target_class!r} does not occur in y")

    accuracies = []
    importances_list = []
    class_performance = []

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train_idx, val_idx in skf.split(X, y):
        X_val, y_val = X[val_idx], y[val_idx]

        # Train and validate the model
        model.fit(X[train_idx], y[train_idx])
        accuracies.append(model.score(X_val, y_val))

        # Store feature importances
        importances_list.append(model.feature_importances_)

        # Accuracy restricted to the target class: of the validation samples
        # whose true label is target_class, how many are predicted correctly.
        if target_class is not None:
            target_mask = y_val == target_class
            if np.any(target_mask):  # skip folds that hold no sample of the class
                predictions = np.asarray(model.predict(X_val))
                class_performance.append(np.mean(predictions[target_mask] == target_class))

    # Variance of each feature's importance across folds, averaged over features
    importances_array = np.array(importances_list)
    variance_importances = np.var(importances_array, axis=0)
    mean_variance = np.mean(variance_importances)

    # Calculate the mean accuracy
    mean_accuracy = np.mean(accuracies)

    # Penalize lower performance on the specific class
    if target_class is not None:
        mean_class_accuracy = np.mean(class_performance)
        class_penalty = 1 - mean_class_accuracy
    else:
        mean_class_accuracy = None
        class_penalty = 0

    # Normalize metrics to balance their scales
    normalized_accuracy = mean_accuracy  # Accuracy is already on a normalized scale (0 to 1)
    normalized_stability = 1 / (1 + mean_variance)  # Lower variance means better stability

    # Combined score
    combined_score = (weight_accuracy * normalized_accuracy) + (weight_stability * normalized_stability) - (weight_class_penalty * class_penalty)

    # Print evaluation metrics for debugging
    class_report = mean_class_accuracy if target_class is not None else 'N/A'
    print(f"Mean Accuracy: {mean_accuracy:.4f}")
    print(f"Mean Feature Importance Variance: {mean_variance:.4f}")
    print(f"Normalized Stability: {normalized_stability:.4f}")
    print(f"Class Performance (for target class): {class_report}")
    print(f"Class Penalty: {class_penalty:.4f}")
    print(f"Combined Score: {combined_score:.4f}")

    return combined_score

# Example usage: score the model while penalizing one class.
# The three weights are left at the function's defaults (0.7 / 0.2 / 0.1).
target_class = 3  # If you know which class is problematic
combined_score = objective_function(model, X, y, target_class=target_class)

# Score again without any class-specific penalty and show the result.
baseline_score = objective_function(model, X, y)
print(baseline_score)

Mean Accuracy: 0.9996
Mean Feature Importance Variance: 0.0000
Normalized Stability: 1.0000
Feature Importance Balance (Entropy): 6.0871
Combined Score: 0.8280
0.827962947166482


In [13]:
# objective_function() refits `model` on cross-validation folds, so refit it
# on the complete dataset before persisting. fit() returns the estimator
# itself, which lets the refit feed straight into the save call.
saveSKLModel("T1-randomForest.pickle", model.fit(X, y))