In [6]:
import json
import os

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Embedding vector length (presumably the output dimension of the embedding
# model that produced the JSON files -- TODO confirm). No active code visible
# in this cell reads this constant.
EMBEDDING_SIZE = 1536

def process_json(file):
    """Load a JSON file of embeddings and return its values as a NumPy array.

    The file's top-level JSON value must be an object (it is read into a
    ``dict``). Its keys are discarded; only the values are kept, in the order
    ``json.load`` yields them.

    Args:
        file: Path to the JSON file to read.

    Returns:
        np.ndarray: Array built from the list of the object's values, one
        entry per key. Presumably of shape ``(n_entries, embedding_dim)`` when
        every value is an equal-length list of numbers -- confirm against the
        actual files.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; being explicit avoids depending on
    # the platform's locale-specific default encoding.
    with open(file, "r", encoding="utf-8") as f:
        embeddings_dict = json.load(f)

    return np.array(list(embeddings_dict.values()))

    
def construct_dataset(sources, verbose=False):
    """Build a labelled feature matrix from a list of embedding JSON files.

    Every file in ``sources`` is loaded with ``process_json``; each row it
    yields is labelled with the file's position in ``sources`` (0 for the
    first file, 1 for the second, and so on).

    Args:
        sources: Iterable of paths to embedding JSON files, one per class.
            Paths may point anywhere on disk, not only to the current
            working directory.
        verbose: When True, print each source together with the number of
            embeddings loaded from it.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the per-file arrays concatenated
        along axis 0 and ``y`` is a flat array of integer labels, one per
        row of ``X``.

    Raises:
        ValueError: If ``sources`` is empty.
        FileNotFoundError: If a source is not an existing file.
    """
    sources = list(sources)
    if not sources:
        raise ValueError("construct_dataset needs at least one source file")

    X_arrays = []
    y_arrays = []
    for i, source in enumerate(sources):
        # An explicit check (instead of an assert against os.listdir()) keeps
        # working under `python -O` and accepts paths outside the cwd.
        if not os.path.isfile(source):
            raise FileNotFoundError(f"Embedding file not found: {source!r}")

        embeddings = process_json(source)
        if verbose:
            print(f"[{i}] {source}: {embeddings.shape[0]} embeddings")

        X_arrays.append(embeddings)
        # One label per embedding row, equal to the source's index.
        y_arrays.append(np.full((embeddings.shape[0],), i))

    X = np.concatenate(X_arrays, axis=0)
    y = np.concatenate(y_arrays, axis=None)

    return X, y

def multiclass_model(X, y, test_size=0.2, random_state=42, max_iter=1000, tol=1e-4):
    """Train and evaluate a class-balanced logistic regression on ``X``/``y``.

    The data is split into stratified train/test partitions, features are
    standardized with statistics from the training partition only, and a
    ``LogisticRegression`` with ``class_weight='balanced'`` is fit on the
    standardized training data and scored on the held-out partition.

    Args:
        X: Feature matrix, one row per sample.
        y: Class labels, one per row of ``X``; also used for stratification.
        test_size: Fraction (or count) of samples held out for evaluation,
            passed to ``train_test_split``.
        random_state: Seed for the train/test split.
        max_iter: Maximum solver iterations for the logistic regression
            (raised above scikit-learn's default of 100).
        tol: Solver stopping tolerance.

    Returns:
        tuple: ``(matrix, f1, recall, precision, model)`` where ``matrix`` is
        the confusion matrix on the test partition, ``f1``/``recall``/
        ``precision`` are support-weighted averages over classes, and
        ``model`` is the fitted ``LogisticRegression``.

    Note:
        ``model`` was fit on standardized features and the scaler is not
        returned, so inputs must be standardized the same way before calling
        ``model.predict`` on new data.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Fit the scaler on the training partition only so that no test-set
    # statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(
        class_weight='balanced',
        max_iter=max_iter,
        tol=tol,
    ).fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    matrix = confusion_matrix(y_test, y_pred)

    # Support-weighted averages over classes.
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')

    return matrix, f1, recall, precision, model

In [None]:
# One embedding file per class: construct_dataset labels every embedding from
# a file with that file's index in this list (0 for the first, 1 for the second).
sources = [
    "acpas-unique-trimmed-s30-t30-embeddings.json",
    "CMR-trimmed-s30-t30-embeddings.json"
]

X, y = construct_dataset(sources)
print(y.shape)

# Train and evaluate the classifier. The fitted model (last returned element)
# is discarded here; f1, recall and precision are bound but not displayed in
# this cell -- only the confusion matrix is printed.
cm, f1, recall, precision, _ = multiclass_model(X, y)
print(cm)