In [46]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import accuracy_score, f1_score


def short_model_name(model_path: str) -> str:
    """Derive a short model label from a ``model_name_or_path`` value.

    Takes the second-to-last "/"-separated component of ``model_path``,
    splits it on "_", and joins the first two pieces with "/".
    Presumably that component looks like "<org>_<model>_<suffix>", so the
    result reads "<org>/<model>" -- TODO confirm against the training script.

    Raises:
        IndexError: if ``model_path`` has fewer than two "/"-separated parts.
    """
    return "/".join(model_path.split("/")[-2].split("_")[0:2])


# Collect every file matching "0*.jsonl" below the current working directory.
# Sorting makes the load order (and hence the row order of `df`) reproducible;
# the unsorted rglob order depends on the filesystem.
p = Path(".")
files = sorted(p.resolve().rglob("0*.jsonl"))
print(files)

if not files:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No files matching '0*.jsonl' found under {p.resolve()}")

# Each file is read as JSON Lines (one record per line). ignore_index=True
# gives the combined frame a unique 0..N-1 index; without it every file
# contributes its own 0..n labels and the index contains duplicates.
df = pd.concat(
    [pd.read_json(f, orient="records", lines=True) for f in files],
    ignore_index=True,
)

# Per-row metrics: each row's y_true / y_pred values are passed to sklearn
# as a pair of label sequences.
df["accuracy"] = df.apply(
    lambda row: accuracy_score(row["y_true"], row["y_pred"]), axis=1
)
df["macroF1"] = df.apply(
    lambda row: f1_score(row["y_true"], row["y_pred"], average="macro"), axis=1
)
df["model"] = df.model_name_or_path.apply(short_model_name)

df.shape


[PosixPath('/home/peterr/macocu/task11/010_results.jsonl'), PosixPath('/home/peterr/macocu/task11/011_results.jsonl'), PosixPath('/home/peterr/macocu/task11/008_results.jsonl'), PosixPath('/home/peterr/macocu/task11/012_results_nonslavic.jsonl'), PosixPath('/home/peterr/macocu/task11/012_results.jsonl'), PosixPath('/home/peterr/macocu/task11/013_results_slavic_asr.jsonl')]


(87, 10)

Let us keep only the rows that were evaluated on the test split:

In [51]:
# Boolean mask: True for rows whose eval_file value matches the pattern "test".
is_test_split = df["eval_file"].str.contains("test")
df = df[is_test_split]
df.shape

(41, 10)

In [52]:
# Show the column labels currently present in df.
df.columns

Index(['output_column', 'model_name_or_path', 'eval_file', 'clip_seconds',
       'y_true', 'y_pred', 'train_config', 'accuracy', 'macroF1', 'model'],
      dtype='object')

In [58]:
# Columns that together identify one evaluation setting.
group_keys = ["output_column", "model", "eval_file", "clip_seconds"]

# Average both metrics over all rows sharing the same setting, then turn the
# group keys back into ordinary columns.
gb = (
    df.groupby(group_keys)
    .agg({"macroF1": "mean", "accuracy": "mean"})
    .reset_index()
)

# Print the table as Markdown text; the bare `gb` below is the cell's
# displayed result.
print(gb.to_markdown())

gb

|    | output_column     | model                                        | eval_file                            |   clip_seconds |   macroF1 |   accuracy |
|---:|:------------------|:---------------------------------------------|:-------------------------------------|---------------:|----------:|-----------:|
|  0 | Party_status      | facebook/wav2vec2-large-slavic-voxpopuli-v2  | 012_test.csv                         |             -1 | 0.587285  |     0.59   |
|  1 | Speaker_age_group | classla/wav2vec2-large-slavic-parlaspeech-hr | 006_age_test.csv                     |             -1 | 0.721715  |     0.722  |
|  2 | Speaker_age_group | facebook/wav2vec2-large-960h-lv60-self       | 006_age_test.csv                     |             -1 | 0.672112  |     0.678  |
|  3 | Speaker_age_group | facebook/wav2vec2-large-slavic-voxpopuli-v2  | 006_age_test.csv                     |             -1 | 0.689971  |     0.694  |
|  4 | Speaker_gender    | classla/wav2vec2-large-slavic-parlaspeech-h

Unnamed: 0,output_column,model,eval_file,clip_seconds,macroF1,accuracy
0,Party_status,facebook/wav2vec2-large-slavic-voxpopuli-v2,012_test.csv,-1,0.587285,0.59
1,Speaker_age_group,classla/wav2vec2-large-slavic-parlaspeech-hr,006_age_test.csv,-1,0.721715,0.722
2,Speaker_age_group,facebook/wav2vec2-large-960h-lv60-self,006_age_test.csv,-1,0.672112,0.678
3,Speaker_age_group,facebook/wav2vec2-large-slavic-voxpopuli-v2,006_age_test.csv,-1,0.689971,0.694
4,Speaker_gender,classla/wav2vec2-large-slavic-parlaspeech-hr,001_gender_test.csv,-1,0.984997,0.985
5,Speaker_gender,classla/wav2vec2-large-slavic-parlaspeech-hr,001_gender_test.csv,2,0.984997,0.985
6,Speaker_gender,facebook/wav2vec2-large-960h-lv60-self,001_gender_test.csv,-1,0.999,0.999
7,Speaker_gender,facebook/wav2vec2-large-960h-lv60-self,001_gender_test.csv,2,0.9935,0.9935
8,Speaker_gender,facebook/wav2vec2-large-slavic-voxpopuli-v2,001_gender_test.csv,-1,0.997,0.997
9,Speaker_gender,facebook/wav2vec2-large-slavic-voxpopuli-v2,001_gender_test.csv,2,0.989499,0.9895
