In [25]:
import json
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Union
import numpy as np
import pandas as pd

# `Literal` lives in `typing` on newer interpreters; fall back to the
# `typing_extensions` backport only when that import is genuinely unavailable.
# Catching ImportError (rather than everything) keeps KeyboardInterrupt,
# SystemExit and unrelated errors from being silently swallowed.
try:
    from typing import Literal  # type: ignore
except ImportError:
    from typing_extensions import Literal  # type: ignore

from transformers.utils import logging

# Module-level logger named "probing", obtained through the transformers logging
# utilities; BasicPlot.aggregation uses it to warn when no log files are found.
logger = logging.get_logger("probing")


class BasicPlot:
    """Collects probing-experiment JSON logs into a long-format DataFrame.

    Each log file is expected to be a JSON object with a ``"params"`` section
    (experiment metadata) and a ``"results"`` section (per-layer scores); see
    :meth:`aggregation` for the exact keys that are read.
    """

    # Top-level sections of a log file.
    PARAMS_FIELD = "params"
    RESULTS_FIELD = "results"

    # Keys read from the "params" section; reused as output column names.
    LANGUAGE_FIELD = "task_language"
    TASK_FIELD = "task_category"
    MODEL_NAME_FIELD = "hf_model_name"
    CLASSIFIER_FIELD = "classifier_name"
    # Output-only column names (not read from the log file).
    METRIC_FIELD = "metric_names"
    FAMILY_FIELD = "family"
    FULL_LANGUAGE_FIELD = "full_language"

    def __init__(
        self,
        x_field: str = "layer",
        y_field: str = "task_category",
        value_field: str = "metric_scores",
    ):
        """Store the column names to use for the x axis, y axis and cell values.

        The three names are only stored on the instance here; no method in this
        class reads them (presumably plotting code elsewhere does -- confirm).
        """
        self.x_field = x_field
        self.y_field = y_field
        self.value_field = value_field

    @staticmethod
    def get_logs(paths: List[Path], filename: str = "log.json") -> List[Path]:
        """Return every ``filename`` found at least one directory below ``paths``.

        Args:
            paths: Directories to search.
            filename: Name of the log file to look for.

        Returns:
            Unique log paths. Results are grouped by input directory (in the
            order given) and sorted within each directory, so the outcome does
            not depend on the filesystem's directory-scan order.
        """
        # A dict is used as an insertion-ordered set: overlapping input
        # directories may yield the same file more than once.
        unique_logs: Dict[Path, None] = {}
        for path in paths:
            for log_path in sorted(path.glob(f"**/*/{filename}")):
                unique_logs.setdefault(log_path, None)
        return list(unique_logs)

    # Deliberately NOT wrapped in functools.lru_cache: `res_paths` may be a list
    # (unhashable, so the cached call raised TypeError), and a cache would hand
    # the same mutable DataFrame to every caller and go stale when logs change.
    def aggregation(
        self,
        res_paths: Union[Path, List[Path]],
        metric_name: Literal["f1", "accuracy"] = "f1",
        stage: Literal["val", "test"] = "test",
        languages_file: Union[str, Path] = "all_languages.csv",
    ) -> pd.DataFrame:
        """Aggregate per-layer scores from all logs under ``res_paths``.

        For every log, ``data["results"][f"{stage}_score"][metric_name]`` is read
        as a mapping ``layer index -> list of scores``; each list is averaged
        into one row. When several logs share the same language, task category,
        model and classifier, only the first one encountered is used.

        Args:
            res_paths: One directory or a list/tuple of directories to search.
            metric_name: Metric key to read from the stage scores.
            stage: Which score set to read (``"<stage>_score"`` in the log).
            languages_file: Semicolon-delimited CSV with ``Codes``, ``Language``
                and ``Family`` columns, used to expand language codes.

        Returns:
            A DataFrame with one row per (log, layer) and the columns
            task_language, task_category, hf_model_name, classifier_name,
            metric_names, family, full_language, layer (1-based),
            metric_scores (mean of the layer's scores) and log_path.

        Raises:
            IndexError: A log's language code is not present in ``languages_file``.
            KeyError: A log lacks one of the expected keys.
            NotImplementedError: A layer's scores are not stored as a list.
        """
        columns = [
            BasicPlot.LANGUAGE_FIELD,
            BasicPlot.TASK_FIELD,
            BasicPlot.MODEL_NAME_FIELD,
            BasicPlot.CLASSIFIER_FIELD,
            BasicPlot.METRIC_FIELD,
            BasicPlot.FAMILY_FIELD,
            BasicPlot.FULL_LANGUAGE_FIELD,
            "layer",
            "metric_scores",
            "log_path",
        ]
        lang_file = pd.read_csv(languages_file, delimiter=";")

        if not isinstance(res_paths, (list, tuple)):
            res_paths = [res_paths]
        resolved_paths = [Path(path).resolve() for path in res_paths]

        log_paths = BasicPlot.get_logs(resolved_paths)
        if len(log_paths) == 0:
            logger.warning("None logs were found for the given paths.")

        records: List[Dict[str, Any]] = []
        # (language, task, model, classifier) combinations already aggregated.
        seen_experiments = set()
        for log_path in log_paths:
            # JSON is UTF-8 by specification; do not rely on the platform default.
            with open(log_path, encoding="utf-8") as f:
                data = json.load(f)

            params = data[BasicPlot.PARAMS_FIELD]
            all_results = data[BasicPlot.RESULTS_FIELD]

            lang = params[BasicPlot.LANGUAGE_FIELD]
            task_category = params[BasicPlot.TASK_FIELD]
            model_name = params[BasicPlot.MODEL_NAME_FIELD]
            classifier_name = params[BasicPlot.CLASSIFIER_FIELD]
            stage_scores = all_results[f"{stage}_score"][metric_name]

            # Look the language up once; the first matching row wins.
            lang_rows = lang_file.loc[lang_file["Codes"] == lang]
            if lang_rows.empty:
                raise IndexError(
                    f"Language code {lang!r} (from {log_path}) "
                    f"is not listed in {languages_file}"
                )
            lang_full = lang_rows["Language"].values[0]
            family = lang_rows["Family"].values[0]

            experiment_key = (lang, task_category, model_name, classifier_name)
            if experiment_key in seen_experiments:
                continue
            seen_experiments.add(experiment_key)

            for layer_num, stage_res in stage_scores.items():
                if not isinstance(stage_res, list):
                    raise NotImplementedError(
                        f"Expected a list of scores for layer {layer_num!r}, "
                        f"got {type(stage_res).__name__}"
                    )
                records.append(
                    {
                        BasicPlot.LANGUAGE_FIELD: lang,
                        BasicPlot.TASK_FIELD: task_category,
                        BasicPlot.MODEL_NAME_FIELD: model_name,
                        BasicPlot.CLASSIFIER_FIELD: classifier_name,
                        BasicPlot.METRIC_FIELD: metric_name,
                        BasicPlot.FAMILY_FIELD: family,
                        BasicPlot.FULL_LANGUAGE_FIELD: lang_full,
                        # Logs index layers from 0; report them 1-based.
                        "layer": int(layer_num) + 1,
                        "metric_scores": np.mean(stage_res),
                        "log_path": str(log_path),
                    }
                )

        # Explicit columns keep the frame's schema stable even with no records.
        return pd.DataFrame(records, columns=columns)
    
# Aggregate every probing log found below the local "Probing_framework/"
# directory (default metric "f1", default stage "test").
pivot_table = BasicPlot().aggregation(
    res_paths=Path("Probing_framework/"),
)

# reset_index() turns the row index into a regular "index" column; to_csv()
# then writes the new default index as well, so data.csv carries both.
df_data = pivot_table.reset_index()
df_data.to_csv("data.csv")

In [1]:
from glob import glob
import pandas as pd
import json

# Build a nested summary of class ratios per model, language and task category:
#   datasets[model_name][full_language][task_category] ->
#       {"training": ..., "validation": ..., "test": ...}
# and write it to datasets.json.
lang_file = pd.read_csv('all_languages.csv', delimiter=';')

# Sorted so that, when several logs describe the same model/language/category,
# the one that ends up in the summary does not depend on directory-scan order.
hits = sorted(glob("Probing_framework/results/*/*/*.json"))

datasets = {}
for file_name in hits:
    # Context manager closes each log promptly instead of leaking one handle
    # per file; JSON is UTF-8, so do not rely on the platform default encoding.
    with open(file_name, encoding='utf-8') as file:
        data_file = json.load(file)

    params = data_file['params']
    lang = params['task_language']
    model_name = params['hf_model_name']
    cat = params['task_category']

    # Expand the language code to its full name; the first matching row wins
    # (raises IndexError when the code is missing from all_languages.csv).
    lang_rows = lang_file.loc[lang_file['Codes'].isin([lang])]
    lang_full = lang_rows.iloc[0]['Language']

    ratios = params['original_classes_ratio']
    # A later log for the same model/language/category replaces the earlier one.
    datasets.setdefault(model_name, {}).setdefault(lang_full, {})[cat] = {
        'training': ratios['tr'],
        'validation': ratios['va'],
        'test': ratios['te'],
    }

with open('datasets.json', 'w', encoding='utf-8') as f:
    json.dump(datasets, f, ensure_ascii=False, indent=4)