In [1]:
import pandas as pd
import csv
import os
import json

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

from datasets import load_from_disk
from IPython.display import display
from datasets import Dataset, ClassLabel
from datetime import datetime

from llm_mri import ActivationAreas, Evaluation
from llm_mri.dimensionality_reduction import PCA, UMAP, SVD

  from .autonotebook import tqdm as notebook_tqdm


## Utils

In [2]:
# --- Experiment configuration -------------------------------------------
# Grid resolutions handed to the UMAP reduction as `gridsize`: 10, 20, ..., 50.
GRID_SIZES = list(range(10, 51, 10))
# Number of passes of the outer graph-generation loop.
SAMPLE_SIZE = 1_000

# --- Filesystem layout --------------------------------------------------
# Root folder of the fake-news data, relative to this notebook.
fakenews_data_path = "../data/fakenews"
# Folder under which the generated GEXF graphs are stored (one sub-folder per grid size).
processed_graphs_path = fakenews_data_path + "/processed/graphs"

In [3]:
# Hugging Face checkpoint whose activations are analysed.
model_ckpt = "distilbert/distilbert-base-multilingual-cased"
# The model can also be a decoder, such as 'openai-community/gpt2'

# Location of the encoded dataset on disk (note: folder is `datasets`, not `dataset`).
# Set the FAKENEWS_DATASET_PATH environment variable to point at another copy;
# when it is unset, the original dev-container location is used.
dataset_path = os.environ.get(
    "FAKENEWS_DATASET_PATH",
    "/workspaces/master_degree_lab/datasets/dataset_encoded.hf",
)

# Load the dataset that was previously saved to disk (`load_from_disk`).
dataset = load_from_disk(dataset_path)
# Drop stale cache files for this dataset; the cell output is the call's return value.
dataset.cleanup_cache_files()

0

## Get Graphs

In [None]:
# Build the "true" and "fake" activation graphs for every (sample, grid size)
# pair and save each one as a GEXF file in {processed_graphs_path}/{grid_size}/.
#
# NOTE(review): the dataset and random_state=42 are the same for every
# `sample`, so all samples may end up producing identical graphs — confirm
# whether the seed (or the data) is meant to vary with `sample`.

# nx.write_gexf does not create missing folders, so make sure every
# per-grid-size output directory exists before the long loop starts.
for grid_size in GRID_SIZES:
    os.makedirs(f"{processed_graphs_path}/{grid_size}", exist_ok=True)

for sample in range(SAMPLE_SIZE):

    for grid_size in GRID_SIZES:
        # Reduce activations to 2 components on a grid_size-resolution grid.
        umap = UMAP(n_components=2, random_state=42, gridsize=grid_size)
        llm_mri = ActivationAreas(
            model=model_ckpt, device="cpu", dataset=dataset, reduction_method=umap
        )
        llm_mri.process_activation_areas()

        # One graph per label.
        g_true = llm_mri.get_graph("true")
        g_fake = llm_mri.get_graph("fake")

        # Graph-level attribute dicts, printed for a quick visual check.
        print(g_true.graph)
        print(g_fake.graph)

        out_dir = f"{processed_graphs_path}/{grid_size}"
        nx.write_gexf(g_true, f"{out_dir}/g_true_{sample}.gexf")
        nx.write_gexf(g_fake, f"{out_dir}/g_fake_{sample}.gexf")