# In [1]:
import glob
from pathlib import Path
import numpy as np
import pandas as pd

def _log1mexp(log_p):
    """Return ``log(1 - exp(log_p))`` element-wise, computed in log space."""
    log_p = np.asarray(log_p)
    if not np.issubdtype(log_p.dtype, np.floating):
        log_p = log_p.astype(np.float64)
    result = np.empty_like(log_p)
    # Two formulations, each accurate on its own side of log(1/2):
    # expm1 avoids cancellation when exp(log_p) is close to 1, log1p avoids
    # it when exp(log_p) is close to 0.
    near_zero = log_p > -np.log(2.0)
    result[near_zero] = np.log(-np.expm1(log_p[near_zero]))
    result[~near_zero] = np.log1p(-np.exp(log_p[~near_zero]))
    return result


def retrieve_attrbute_log_probs(dimension, return_stats=False):
    """Average per-feature log odds ratios across all seed files of a dimension.

    Every file matching the dimension's glob pattern is loaded and stacked
    along a new leading axis. Each file is indexed as ``[class, feature]``
    with class 0 and class 1 holding log-probabilities (the stacked array is
    therefore ``(num_seeds, 2, num_selected_features)``).

    For each seed the log odds ratio ``logit(p_1) - logit(p_0)`` is computed
    directly from the log-probabilities, so very negative log-probabilities
    (whose ``exp`` underflows to 0) still give finite results.

    Parameters
    ----------
    dimension : str
        One of ``"year"``, ``"gender"`` or ``"partisan"``. ``"partisan"`` is
        looked up under the ``demo_rep`` tag in the file names.
    return_stats : bool, optional
        When false (default) only the mean log odds are returned.

    Returns
    -------
    numpy.ndarray
        Mean log odds ratio per feature (when ``return_stats`` is false).
    tuple
        ``(mean_log_odds, std_log_odds, top_features)`` when ``return_stats``
        is true. ``std_log_odds`` is the half-range ``(max - min) / 2`` across
        seeds, not a standard deviation. ``top_features`` is a list of entries
        of the module-level ``subreddits`` sequence ordered by decreasing
        ``|mean_log_odds|``; ``subreddits`` must be defined before calling
        with ``return_stats=True``.

    Raises
    ------
    ValueError
        If no file matches the pattern for ``dimension``.
    """
    dimension = 'demo_rep' if dimension == "partisan" else dimension
    pattern = f"../data/results/log_probs/dimension{dimension}-use_declaration_in_trainTrue-seed*log_prob.npy"

    # glob returns matches in arbitrary order; sort so the stacking order
    # (and hence floating-point summation order) is reproducible.
    files = sorted(glob.glob(pattern))
    if not files:
        raise ValueError(f"No log-probability files match {pattern!r}")

    # Shape: (num_seeds, 2, num_selected_features)
    data = np.stack([np.load(file) for file in files], axis=0)

    log_p_class0 = data[:, 0, :]
    log_p_class1 = data[:, 1, :]

    # logit(p) = log p - log(1 - p); the log odds ratio is the difference of
    # the two logits. Staying in log space avoids exp() underflow.
    log_odds = (log_p_class1 - _log1mexp(log_p_class1)) - (
        log_p_class0 - _log1mexp(log_p_class0)
    )

    mean_log_odds = np.mean(log_odds, axis=0)

    if not return_stats:
        return mean_log_odds

    # Spread across seeds expressed as half of the min-max range.
    std_log_odds = (np.max(log_odds, axis=0) - np.min(log_odds, axis=0)) / 2

    # Feature indices ordered from largest to smallest absolute mean log odds.
    top_indices = np.argsort(np.abs(mean_log_odds))[::-1]

    # `subreddits` is a module-level sequence defined outside this function.
    top_features = [subreddits[idx] for idx in top_indices]

    return mean_log_odds, std_log_odds, top_features


# Module-level sequence used as the "subreddit" column below; it is also read
# by retrieve_attrbute_log_probs when called with return_stats=True.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so
# this file should only ever come from a trusted source.
subreddits = np.load("../data/input/list_subreddits.npy", allow_pickle=True)

# One column per dimension holding the mean log odds returned by
# retrieve_attrbute_log_probs, evaluated in the order year, gender, partisan.
data = {
    "subreddit": subreddits,
    **{
        dim: retrieve_attrbute_log_probs(dim)
        for dim in ("year", "gender", "partisan")
    },
}

# Write the assembled table to CSV without the DataFrame index.
df_scores = pd.DataFrame(data)
df_scores.to_csv(Path("../models/subreddit-logprobs.csv"), index=False)
