In [None]:
!pip install ucimlrepo



In [None]:
from collections import Counter

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix

from ucimlrepo import fetch_ucirepo

# fetch dataset
def fetch_data(id):
    """Fetch a dataset through ``fetch_ucirepo`` and split it into features and targets.

    Args:
        id: Dataset identifier, forwarded unchanged as ``fetch_ucirepo(id=id)``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the ``data.features`` attribute and
        ``y`` is the ``data.targets`` attribute of the fetched dataset object.
    """
    payload = fetch_ucirepo(id=id).data
    return payload.features, payload.targets

def preprocess_y(y):
    """Encode the target as a binary label vector.

    The target is flattened with ``y.values.ravel()`` and label-encoded.

    * If exactly two classes are present, the encoded labels (0/1, as produced
      by ``LabelEncoder``) are returned as-is.
    * Otherwise the problem is reduced to "most common class vs. the rest" by
      ``preprocess_y_binary_most_common``, which yields +1 for the most common
      class and -1 for every other class.

    Note that the two branches therefore use different label values
    (0/1 versus -1/+1).

    Args:
        y: Object exposing ``.values`` (e.g. a pandas DataFrame or Series)
            holding the raw target labels.

    Returns:
        tuple: ``(y_binary, classes)`` where ``y_binary`` is a numpy array of
        encoded labels and ``classes`` is ``le.classes_``, i.e. the original
        class labels found in ``y``. Both branches return the original labels;
        the internal integer codes are never exposed as class names.
    """
    le = LabelEncoder()
    y_encoded = le.fit_transform(y.values.ravel())
    if le.classes_.size == 2:
        return y_encoded, le.classes_

    # One-vs-rest reduction. The helper's second return value only lists the
    # integer codes it saw, so report the original labels instead.
    y_binary, _ = preprocess_y_binary_most_common(y_encoded)
    return y_binary, le.classes_


def preprocess_y_binary_most_common(y):
    """
    Convert the most common class in y to 1 and all other classes to -1.

    Ties for "most common" are resolved by ``Counter.most_common``.

    Args:
    y (array-like): One-dimensional collection of hashable labels.

    Returns:
    tuple: (preprocessed_y, classes)
        preprocessed_y: numpy array with 1 for most common class, -1 for others
            (an empty integer array when y is empty)
        classes: keys view of the distinct labels counted in y
    """
    # Count occurrences of every label
    class_counts = Counter(y)

    # Nothing to encode: avoid indexing into an empty most_common() result
    if not class_counts:
        return np.array([], dtype=int), class_counts.keys()

    most_common_class = class_counts.most_common(1)[0][0]

    # Vectorized mapping: 1 where the label equals the most common class, else -1
    y_array = np.asarray(y)
    preprocessed_y = np.where(y_array == most_common_class, 1, -1)
    return preprocessed_y, class_counts.keys()

def preprocess(X):
    """Impute, scale and one-hot encode a feature table.

    * Columns of dtype int64/float64 are median-imputed and standardized.
    * Columns of dtype object are filled with the constant ``'missing'`` and
      one-hot encoded (categories unseen at fit time are ignored).
    * Columns of any other dtype are not passed to either transformer and are
      therefore absent from the result; a warning lists them.

    The transformers are fitted on the ``X`` given here. If the result is split
    into train/test afterwards, the imputation and scaling statistics were
    computed from all rows, including the test rows.

    Args:
        X (pandas.DataFrame): Raw feature table.

    Returns:
        pandas.DataFrame: Dense table with the numeric columns first, followed
        by the one-hot columns. It is built without an explicit index, so it
        carries a fresh default index rather than ``X.index``.
    """
    import warnings
    from scipy.sparse import issparse

    # Identify numeric and categorical columns
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Make the implicit column drop visible instead of silently losing features
    handled = set(numeric_features) | set(categorical_features)
    dropped = [col for col in X.columns if col not in handled]
    if dropped:
        warnings.warn(
            f"preprocess() ignores columns with unsupported dtypes: {dropped}",
            stacklevel=2,
        )

    # Create preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    X_preprocessed = preprocessor.fit_transform(X)

    # Output column names: numeric names unchanged, then the encoder's names
    numeric_feature_names = numeric_features.tolist()
    if categorical_features.size > 0:
        categorical_feature_names = preprocessor.named_transformers_['cat']\
            .named_steps['onehot'].get_feature_names_out(categorical_features).tolist()
    else:
        categorical_feature_names = []

    feature_names = numeric_feature_names + categorical_feature_names

    # Densify any sparse result, not only the csr_matrix class
    if issparse(X_preprocessed):
        X_preprocessed = X_preprocessed.toarray()
    return pd.DataFrame(X_preprocessed, columns=feature_names)

# Dataset selection: keep exactly one fetch_data(...) line uncommented.
# X, y = fetch_data(222) # bank
X, y = fetch_data(143) # australian
# X, y = fetch_data(19) # car
# X, y = fetch_data(30) # contraceptive
# X, y = fetch_data(38) # echocardiogram
# X, y = fetch_data(144) # German
# X, y = fetch_data(46) # Hepatitis
# X, y = fetch_data(225) # ILPD
# X, y = fetch_data(236) # Seeds, not available for import
# X, y = fetch_data(95) # SPECT Heart
# X, y = fetch_data(100) # TAE, not available for import
# X, y = fetch_data(107) # Waveform v1
# X, y = fetch_data(108) # Waveform v2, not available for import
# X, y = fetch_data(292) # Wholesale

# Preprocess the data
# NOTE(review): preprocess() is fitted on the full table before the split
# below, so test rows contribute to the imputation/scaling statistics.
X_preprocessed_df = preprocess(X)
y, y_classes = preprocess_y(y)

# Split the data into training and testing sets (70/30, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed_df, y, test_size=0.3, random_state=0)

print("Preprocessed data shape:", X_preprocessed_df.shape)
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 98)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn.datasets import make_classification

# Function to calculate Precision-Recall Break-Even Point (PRBEP)
def precision_recall_break_even_point(y_true, y_pred_proba):
    """Approximate the precision-recall break-even point (PRBEP).

    Picks the point of the precision-recall curve where precision and recall
    are closest to each other and returns the mean of the two values there.

    Args:
        y_true: True labels, passed to ``precision_recall_curve``.
        y_pred_proba: Scores for the positive class, passed to
            ``precision_recall_curve``.

    Returns:
        The average of precision and recall at the curve point with the
        smallest absolute gap between them (first such point on ties).
    """
    prec, rec, _ = precision_recall_curve(y_true, y_pred_proba)
    closest = np.argmin(np.abs(prec - rec))
    return (prec[closest] + rec[closest]) / 2

# Custom scorer function for GridSearchCV
def prbep_scorer(estimator, X, y):
    """Scorer callable with the ``(estimator, X, y)`` signature used by GridSearchCV.

    Scores the estimator by the PRBEP computed from the second column of
    ``estimator.predict_proba(X)``.
    """
    positive_scores = estimator.predict_proba(X)[:, 1]
    return precision_recall_break_even_point(y, positive_scores)

# Search space: 20 log-spaced values of C, crossed with both penalties.
param_grid = dict(
    C=np.logspace(-4, 4, 20),
    penalty=['l1', 'l2'],
    solver=['liblinear'],  # liblinear supports both L1 and L2
)

# Base estimator whose hyper-parameters are tuned
logreg = LogisticRegression(random_state=0)

# 5-fold grid search scored by PRBEP, using all available cores
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring=prbep_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(f"Best parameters: {best_params}")
print(f"Best PRBEP score: {best_score}")

# Score the refitted best estimator on the held-out test split
best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
test_prbep = precision_recall_break_even_point(y_test, y_pred_proba)
print(f"PRBEP on test set: {test_prbep}")

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best parameters: {'C': 0.23357214690901212, 'penalty': 'l1', 'solver': 'liblinear'}
Best PRBEP score: 0.5662423769027947
PRBEP on test set: 0.55375


In [None]:
from scipy import stats
from sklearn.base import clone

# Repeat the evaluation 10 times, each on a different random 70/30 split.
n_iterations = 10
prbep_scores = []

for i in range(n_iterations):
    # Draw a fresh split. Iteration-local names keep the X_train/X_test/
    # y_train/y_test created by the earlier cell intact.
    X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
        X_preprocessed_df, y, test_size=0.3, random_state=i
    )

    # Refit a copy of the tuned model on this split's training rows. Scoring
    # the already-fitted grid_search.best_estimator_ here would evaluate it on
    # rows that were part of its own training data (the random_state=0 split).
    model_i = clone(grid_search.best_estimator_).fit(X_train_i, y_train_i)

    # Evaluate on this split's test rows
    proba_i = model_i.predict_proba(X_test_i)[:, 1]
    prbep_i = precision_recall_break_even_point(y_test_i, proba_i)
    prbep_scores.append(prbep_i)

    print(f"Iteration {i+1}: PRBEP on test set: {prbep_i}")

# Perform one-sample t-test
# NOTE(review): the splits are resampled from the same rows, so the scores are
# not independent and the p-value is likely optimistic.
t_statistic, p_value = stats.ttest_1samp(prbep_scores, 0.5)  # Test against PRBEP of 0.5

# Create a P-value table (sample std, ddof=1, to match the t-test)
p_value_table = pd.DataFrame({
    'Metric': ['PRBEP'],
    'Mean': [np.mean(prbep_scores)],
    'Std Dev': [np.std(prbep_scores, ddof=1)],
    't-statistic': [t_statistic],
    'p-value': [p_value]
})

print("\nP-value table:")
print(p_value_table)

Iteration 1: PRBEP on test set: 0.5442006269592476
Iteration 2: PRBEP on test set: 0.5622179239200515
Iteration 3: PRBEP on test set: 0.5565495207667731
Iteration 4: PRBEP on test set: 0.5755485893416928
Iteration 5: PRBEP on test set: 0.5484508899143046
Iteration 6: PRBEP on test set: 0.5709818636647905
Iteration 7: PRBEP on test set: 0.5655384615384615
Iteration 8: PRBEP on test set: 0.5697522816166884
Iteration 9: PRBEP on test set: 0.5688585607940446
Iteration 10: PRBEP on test set: 0.55375

P-value table:
  Metric      Mean   Std Dev  t-statistic       p-value
0  PRBEP  0.561585  0.009901     18.65935  1.672831e-08
