In [None]:
import pandas as pd
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix
import numpy as np
from pprint import pprint

In [None]:
# Load the complete dataset from disk.
df = pd.read_csv('FFfull.csv', low_memory=False)

# Split the rows by the value of the `study` column; each subset is an independent copy.
web_df = df.loc[df['study'].eq('web')].copy()
field_df = df.loc[df['study'].eq('field')].copy()

# One row per song, holding the 'first' aggregate of that song's `songfunction`.
song_functions = df.groupby("song").agg({'songfunction': 'first'})

# Web-study table restricted to the label, the song id and the four rating columns.
web_ratings = web_df.loc[:, ["songfunction", "song", "danc", "heal", "baby", "love"]].copy()

In [None]:
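# Earlier inline version of the random tie-breaking, kept for reference only.
# It is superseded by policy_argmax_per_row(..., policy='random') defined below.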
# arr = web_ratings[["danc","heal","baby","love"]].to_numpy()
# max_values = np.max(arr, axis=1, keepdims=True)
# mask = (arr == max_values)
# random_masked = np.random.random(arr.shape) * mask
# web_ratings["y_pred"] = np.argmax(random_masked, axis=1)


def policy_argmax_per_row(arr, true_labels, policy='random', rng=None):
    """
    Row-wise argmax with an explicit policy for breaking ties between maxima.

    Policies:
    - 'random': pick at random among the columns tied for the row maximum
    - 'generous': pick the true label whenever it is among the maxima (best case);
      rows where it is not fall back to random tie-breaking
    - 'strict': pick a maximum other than the true label whenever one exists
      (worst case); the true label is returned only when it is the sole maximum

    Args:
        arr: array-like of shape (n_samples, n_classes)
        true_labels: array-like of integer column indices, shape (n_samples,).
            Not used by the 'random' policy.
        policy: tie-breaking policy ('random', 'generous', or 'strict')
        rng: optional ``np.random.Generator`` used for the random draws. When
            None (default) the global NumPy random state is used, so
            ``np.random.seed`` keeps controlling the result.

    Returns:
        Integer array of shape (n_samples,) with the selected column per row.

    Raises:
        ValueError: if ``policy`` is not one of the supported names.

    Note:
        A row containing NaN has no detectable maximum (``np.max`` propagates
        NaN and NaN never compares equal), so index 0 is returned for it.
    """
    valid_policies = ('random', 'generous', 'strict')
    if policy not in valid_policies:
        # Previously an unknown policy fell through and silently returned None.
        raise ValueError(
            f"Unknown policy {policy!r}; expected one of {valid_policies}"
        )

    arr = np.asarray(arr)
    draw = np.random.random if rng is None else rng.random

    # Boolean mask of every position that equals its row maximum
    max_vals = np.max(arr, axis=1, keepdims=True)
    mask = (arr == max_vals)

    def _random_among(candidates):
        # Non-candidates score -1, strictly below any draw from [0, 1), so a
        # candidate always wins even if its draw happens to be exactly 0.0.
        scores = np.where(candidates, draw(candidates.shape), -1.0)
        return np.argmax(scores, axis=1)

    if policy == 'random':
        return _random_among(mask)

    # Accept lists / pandas Series as well as ndarrays
    true_labels = np.asarray(true_labels)
    row_indices = np.arange(arr.shape[0])

    if policy == 'generous':
        # Best case: take the true label wherever it is among the maxima
        true_label_is_max = mask[row_indices, true_labels]

        result = np.zeros(arr.shape[0], dtype=int)
        result[true_label_is_max] = true_labels[true_label_is_max]

        # Remaining rows cannot be correct anyway; break their ties randomly
        non_match_rows = ~true_label_is_max
        if np.any(non_match_rows):
            result[non_match_rows] = _random_among(mask[non_match_rows])

        return result

    # policy == 'strict'
    # Worst case: exclude the true label from the candidates ...
    is_true_label = np.zeros(arr.shape, dtype=bool)
    is_true_label[row_indices, true_labels] = True
    wrong_maxima = mask & ~is_true_label

    # ... unless it is the only maximum, in which case it must be chosen.
    has_wrong_maximum = wrong_maxima.any(axis=1, keepdims=True)
    candidates = np.where(has_wrong_maximum, wrong_maxima, mask)

    return _random_among(candidates)

In [None]:
# Encode the song function as an integer class id whose order matches the
# rating columns: danc=0, heal=1, baby=2, love=3.
ordered_cats = pd.CategoricalDtype(categories=['danc', 'heal', 'baby', 'love'], ordered=True)
series_cat = web_ratings.songfunction.astype(ordered_cats)

# Any value outside the four categories (including missing values) becomes NaN
# here and would be encoded as -1 by `.cat.codes`. Used as a column index, -1
# silently selects the last column ('love') and it also shows up as an extra
# class in the metrics, so fail loudly instead of producing wrong numbers.
unmapped_labels = series_cat.isna()
if unmapped_labels.any():
    raise ValueError(
        f"{int(unmapped_labels.sum())} web rows have a songfunction outside "
        f"{list(ordered_cats.categories)}; drop or fix them before encoding."
    )

# Now the codes match the column order above
web_ratings["y_true"] = series_cat.cat.codes

In [None]:
# policy_argmax_per_row draws from NumPy's global random state, so seed it here:
# the tie-breaks (and therefore the metrics and any saved predictions) are then
# identical on every run of this cell.
TIE_BREAK_SEED = 0
np.random.seed(TIE_BREAK_SEED)

# Rating matrix with columns in the same order as the y_true class ids
arr = web_ratings[["danc", "heal", "baby", "love"]].to_numpy()

# Predicted class per row under each tie-breaking policy
web_ratings["y_pred_random"] = policy_argmax_per_row(arr, web_ratings.y_true, policy='random')
web_ratings["y_pred_generous"] = policy_argmax_per_row(arr, web_ratings.y_true, policy='generous')
web_ratings["y_pred_strict"] = policy_argmax_per_row(arr, web_ratings.y_true, policy='strict')

In [None]:
def compute_metrics(y_test, y_pred):
    """Return macro-averaged and per-class F1, recall and precision.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        dict with keys "f1", "recall", "precision" (macro averages) and
        "per_class", a dict mapping "precision", "recall", "f1" to a list
        holding one score per class.
    """
    scorers = {
        "f1": f1_score,
        "recall": recall_score,
        "precision": precision_score,
    }

    # Macro averages, one entry per metric
    metrics = {
        name: scorer(y_test, y_pred, average='macro')
        for name, scorer in scorers.items()
    }

    # average=None returns one score per class; convert to plain lists
    metrics["per_class"] = {
        name: scorers[name](y_test, y_pred, average=None).tolist()
        for name in ("precision", "recall", "f1")
    }

    return metrics

In [None]:
# Pull labels and the three prediction columns out as plain NumPy arrays
y_true = web_ratings["y_true"].to_numpy()
y_pred_random = web_ratings["y_pred_random"].to_numpy()
y_pred_generous = web_ratings["y_pred_generous"].to_numpy()
y_pred_strict = web_ratings["y_pred_strict"].to_numpy()

# Same report for every tie-breaking policy: heading, metrics, confusion matrix
reports = (
    ("Random:", y_pred_random),
    ("Generous:", y_pred_generous),
    ("Strict:", y_pred_strict),
)
for heading, predictions in reports:
    print(heading)
    pprint(compute_metrics(y_true, predictions))
    pprint(confusion_matrix(y_true, predictions))


In [None]:
# Uncomment to save results to numpy files

# np.save("web_survey_ratings_random", y_pred_random)
# np.save("web_survey_ratings_generous", y_pred_generous)
# np.save("web_survey_ratings_strict", y_pred_strict)