In [1]:
import import_ipynb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss
from scipy.stats import multivariate_normal
from scipy.integrate import nquad
import pandas as pd
np.random.seed(42)

from simulation_utils import (
    setup_simulation,
    generate_data,
    compute_bayes_predictions,
    compute_lda_predictions,
    compute_qda_predictions
)
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from simulation_utils import setup_simulation, generate_data


Define the simulation parameters (the data themselves are simulated inside the evaluation loop)

In [2]:
# Number of classes (passed as K to the simulation helpers) and the number of
# test points drawn per class by generate_data.
k, n_per_class = 3, 100

# Grid of eta values swept by the evaluation loop: 0, 0.25, 0.5, 0.75, 1.
# NOTE(review): eta's meaning is defined in simulation_utils — confirm there.
etas = np.linspace(start=0, stop=1, num=5)

# Training-set sizes handed to setup_simulation as n_train.
n_values = [100, 200, 500, 1000, 5000]

# Accumulator for one result tuple per (eta, n_train) combination.
results = list()

Scoring functions

In [3]:
def brier_score(p, y_true):
    """Mean multi-class Brier score of predicted probabilities (lower is better).

    For each sample, the squared Euclidean distance between the predicted
    probability vector and the one-hot encoding of the true class is
    computed; the returned value is the average over all samples.

    Parameters
    ----------
    p : array-like of shape (n_samples, n_classes)
        Predicted class probabilities; column ``j`` is used for class ``j``.
    y_true : sequence of int, length n_samples
        True class labels, used directly as column indices into ``p``.

    Returns
    -------
    float
        Average per-sample squared error.

    Raises
    ------
    ValueError
        If ``p`` is not two-dimensional or the number of labels differs
        from the number of rows of ``p``.
    """
    p = np.asarray(p)
    # Without this check a shorter y_true would leave all-zero target rows for
    # the unmatched samples and silently distort the score.
    if p.ndim != 2 or len(y_true) != p.shape[0]:
        raise ValueError(
            "p must have shape (n_samples, n_classes) with one label per row"
        )

    y_onehot = np.zeros_like(p)
    # Set a single 1 per row, in the column of the true class.
    y_onehot[np.arange(len(y_true)), y_true] = 1
    return np.mean(np.sum((y_onehot - p) ** 2, axis=1))

def spherical_score(p, y_true):
    """Mean spherical score of predicted probabilities (higher is better).

    For each sample the probability assigned to the true class is divided by
    the Euclidean norm of that sample's full probability vector; the returned
    value is the average over all samples.

    Parameters
    ----------
    p : array-like of shape (n_samples, n_classes)
        Predicted class probabilities; column ``j`` is used for class ``j``.
    y_true : sequence of int, length n_samples
        True class labels, used directly as column indices into ``p``.

    Returns
    -------
    float
        Average of ``p[i, y_true[i]] / ||p[i]||_2``.

    Raises
    ------
    ValueError
        If ``p`` is not two-dimensional or the number of labels differs
        from the number of rows of ``p``.
    """
    p = np.asarray(p)
    # A length mismatch would otherwise score only a prefix of the samples.
    if p.ndim != 2 or len(y_true) != p.shape[0]:
        raise ValueError(
            "p must have shape (n_samples, n_classes) with one label per row"
        )

    norms = np.linalg.norm(p, axis=1)
    # Pick the true-class probability of every row in one indexing operation
    # instead of a per-sample Python loop.
    p_true = p[np.arange(len(y_true)), y_true]
    return np.mean(p_true / norms)

def log_score(p, y_true, eps=1e-15):
    """Mean negative log-probability of the true class (lower is better).

    Parameters
    ----------
    p : array-like of shape (n_samples, n_classes)
        Predicted class probabilities; column ``j`` is used for class ``j``.
    y_true : sequence of int, length n_samples
        True class labels, used directly as column indices into ``p``.
    eps : float, default 1e-15
        Small constant added to the true-class probability before taking the
        logarithm, so a predicted probability of exactly 0 gives a large but
        finite penalty instead of ``inf``.

    Returns
    -------
    float
        ``-mean(log(p[i, y_true[i]] + eps))``.

    Raises
    ------
    ValueError
        If ``p`` is not two-dimensional or the number of labels differs
        from the number of rows of ``p``.
    """
    p = np.asarray(p)
    # A length mismatch would otherwise score only a prefix of the samples.
    if p.ndim != 2 or len(y_true) != p.shape[0]:
        raise ValueError(
            "p must have shape (n_samples, n_classes) with one label per row"
        )

    # Pick the true-class probability of every row in one indexing operation
    # instead of a per-sample Python loop.
    p_true = p[np.arange(len(y_true)), y_true]
    return -np.mean(np.log(p_true + eps))


Evaluation loop

In [4]:
# Start from an empty list every time this cell runs. Without the reset,
# re-running the cell appends a second copy of every row to `results`.
results = []

for eta in etas:
    for n_train in n_values:
        # Draw a training set plus the distribution parameters (class means and
        # covariances) that are reused below to draw a matching test set.
        X_train, y_train, means, shared_cov, class_covs = setup_simulation(
            K=k, eta=eta, n_train=n_train
        )

        # Test set generated from the same means/covariances as the training data.
        X_test, y_test = generate_data(
            n_per_class=n_per_class,
            eta=eta,
            K=k,
            means=means,
            shared_cov=shared_cov,
            class_covs=class_covs,
        )

        # --- Fit models ---
        lda = LinearDiscriminantAnalysis().fit(X_train, y_train)
        qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

        # --- Predict ---
        # Hard labels feed the accuracy; class probabilities feed the
        # Brier, log and spherical scores.
        y_pred_lda = lda.predict(X_test)
        y_pred_qda = qda.predict(X_test)
        p_lda = lda.predict_proba(X_test)
        p_qda = qda.predict_proba(X_test)

        # --- Evaluate ---
        # One tuple per (eta, n_train); metrics are stored as LDA/QDA pairs in
        # the order accuracy, Brier, log, spherical.
        results.append((
            eta, n_train,
            accuracy_score(y_test, y_pred_lda),
            accuracy_score(y_test, y_pred_qda),
            brier_score(p_lda, y_test),
            brier_score(p_qda, y_test),
            log_score(p_lda, y_test),
            log_score(p_qda, y_test),
            spherical_score(p_lda, y_test),
            spherical_score(p_qda, y_test)
        ))

Results as DataFrame

In [5]:
# Column names follow the layout of the tuples collected in `results`:
# (eta, n_train) followed by LDA/QDA pairs for accuracy, Brier, log and
# spherical score.
columns = [
    'eta', 'n_train',
    'acc_lda', 'acc_qda',
    'brier_lda', 'brier_qda',
    'log_lda', 'log_qda',
    'spherical_lda', 'spherical_qda'
]
df_results = pd.DataFrame(results, columns=columns)

# Leave the frame as the cell's last expression so the notebook renders it as
# a table; print() wraps the wide frame into several text chunks.
df_results

     eta  n_train   acc_lda   acc_qda  brier_lda  brier_qda   log_lda  \
0   0.00      100  0.746667  0.750000   0.346556   0.342884  0.617606   
1   0.00      200  0.760000  0.763333   0.312393   0.308919  0.538777   
2   0.00      500  0.773333  0.770000   0.306569   0.312645  0.521100   
3   0.00     1000  0.800000  0.800000   0.301439   0.301685  0.522597   
4   0.00     5000  0.736667  0.736667   0.339394   0.339579  0.568442   
5   0.25      100  0.736667  0.733333   0.361053   0.360791  0.612321   
6   0.25      200  0.746667  0.743333   0.347299   0.343333  0.570481   
7   0.25      500  0.763333  0.760000   0.326510   0.332523  0.571251   
8   0.25     1000  0.760000  0.750000   0.325555   0.323338  0.547088   
9   0.25     5000  0.756667  0.760000   0.323970   0.325655  0.533626   
10  0.50      100  0.730000  0.726667   0.360088   0.357737  0.610963   
11  0.50      200  0.746667  0.750000   0.351088   0.343901  0.578315   
12  0.50      500  0.753333  0.746667   0.328416   

Single simulation run for one specific configuration (K = 3, eta = 0, n_train = 100)

In [6]:
# Settings for one hand-picked configuration.
# NOTE: this rebinds the notebook-level names k, eta, n_train and n_per_class
# (n_per_class was 100 in the parameter cell), so cells run after this one
# see these values.
k, eta = 3, 0
n_train, n_per_class = 100, 50

# Training data plus the distribution parameters used to draw the test set.
X_train, y_train, means, shared_cov, class_covs = setup_simulation(
    K=k,
    eta=eta,
    n_train=n_train,
)
X_test, y_test = generate_data(
    n_per_class=n_per_class,
    eta=eta,
    K=k,
    means=means,
    shared_cov=shared_cov,
    class_covs=class_covs,
)

# Fit both discriminant models on the same training sample.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)

# Class probabilities feed the scoring rules, hard labels feed the accuracy.
p_lda = lda.predict_proba(X_test)
p_qda = qda.predict_proba(X_test)
y_pred_lda = lda.predict(X_test)
y_pred_qda = qda.predict(X_test)

# Report every metric as an LDA line followed by a QDA line.
print("Accuracy - LDA:", accuracy_score(y_test, y_pred_lda))
print("Accuracy - QDA:", accuracy_score(y_test, y_pred_qda))

print("Brier - LDA:", brier_score(p_lda, y_test))
print("Brier - QDA:", brier_score(p_qda, y_test))

print("Log - LDA:", log_score(p_lda, y_test))
print("Log - QDA:", log_score(p_qda, y_test))

print("Spherical - LDA:", spherical_score(p_lda, y_test))
print("Spherical - QDA:", spherical_score(p_qda, y_test))

Accuracy - LDA: 0.7733333333333333
Accuracy - QDA: 0.7466666666666667
Brier - LDA: 0.35652327150755614
Brier - QDA: 0.3630422154753553
Log - LDA: 0.6358834075817208
Log - QDA: 0.6230297681619879
Spherical - LDA: 0.8022184961687332
Spherical - QDA: 0.796417880601983


In [7]:
# Repeat the K = 3, eta = 0, n_train = 100 experiment under 100 different
# global NumPy seeds and summarise the LDA and QDA test accuracies.
lda_scores, qda_scores = [], []

for seed in range(100):
    # Reseed NumPy's global RNG for this replication.
    # NOTE(review): this only changes the simulated data if setup_simulation /
    # generate_data draw from the global RNG without reseeding it themselves —
    # confirm in simulation_utils. The averages recorded for this cell equal
    # the single-run accuracies, which suggests every replication was identical.
    np.random.seed(seed)
    X_train, y_train, means, shared_cov, class_covs = setup_simulation(K=3, eta=0.0, n_train=100)
    X_test, y_test = generate_data(n_per_class=50, eta=0.0, K=3, means=means, shared_cov=shared_cov, class_covs=class_covs)

    lda = LinearDiscriminantAnalysis().fit(X_train, y_train)
    qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

    acc_lda = accuracy_score(y_test, lda.predict(X_test))
    acc_qda = accuracy_score(y_test, qda.predict(X_test))

    lda_scores.append(acc_lda)
    qda_scores.append(acc_qda)

print("Avg LDA:", np.mean(lda_scores))
print("Avg QDA:", np.mean(qda_scores))

# Report the spread as well: an average over replications is only meaningful
# if the replications actually differ.
print("Std LDA:", np.std(lda_scores))
print("Std QDA:", np.std(qda_scores))

# If every replication gives exactly the same accuracy, the seed is not
# reaching the data generation and the "average" is one run repeated.
if len(set(lda_scores)) == 1 and len(set(qda_scores)) == 1:
    print(
        "WARNING: all replications produced identical accuracies; "
        "the per-iteration seed does not appear to affect the simulated data."
    )


Avg LDA: 0.773333333333333
Avg QDA: 0.7466666666666668
