In [1]:
import gspread
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
from tqdm import tqdm as tqdm
from pathlib import Path

In [2]:
gc = gspread.oauth(http_client=gspread.BackOffHTTPClient)

In [5]:
FOLDER_ID = "16vFggD_RFkWa-sKZipSi1Ugh_br8zvNX"

In [34]:
files = gc.list_spreadsheet_files(folder_id=FOLDER_ID)

In [40]:
def load_worksheet(sheet):
    sample_id = int(sheet.get("A1")[0][0])
    data = pd.DataFrame(sheet.get_all_records(head=2))
    data = data.iloc[:, 1:]
    data["A ne pas évaluer"] = data["A ne pas évaluer"].replace({'TRUE': True, 'FALSE': False}).astype(bool)
    data = data[~data["A ne pas évaluer"]]
    data.index = data.loc[:, "Code critère"]
    data = data.loc[:, "Respecté ?"]
    data["sample_id"] = sample_id
    
    return data

def load_eval_spreadsheet(file_id, annotator_id):
    file = gc.open_by_key(file_id)
    sheets = file.worksheets()
    df = pd.DataFrame([load_worksheet(s) for s in tqdm(sheets)])
    df = df.set_index("sample_id")
    df.loc[:, "annotator_id"] = annotator_id
    df = df.set_index("annotator_id", append=True)
    return df

def load_all(files, local_dir, save=True, load=True):
    dfs = []
    for f in tqdm(files):
        annot = int(f["name"].split(".")[-1])
        f_id = f["id"]
        path = local_dir / f"{annot}.csv"
        if load and path.exists():
            df = pd.read_csv(path)
        else:
            df = load_eval_spreadsheet(f_id, annot)
        if save:
            df.to_csv(path)
        dfs.append(df)
            
    return pd.concat(dfs)

In [41]:
# eval_df = load_all(
#     files,
#     Path("./data/annotated_df/").resolve()
# )

  0%|                                                                                                                                          | 0/10 [00:00<?, ?it/s]
  0%|                                                                                                                                          | 0/24 [00:00<?, ?it/s][A
  4%|█████▍                                                                                                                            | 1/24 [00:01<00:31,  1.37s/it][A
  8%|██████████▊                                                                                                                       | 2/24 [00:02<00:23,  1.06s/it][A
 12%|████████████████▎                                                                                                                 | 3/24 [00:03<00:27,  1.33s/it][A
 17%|█████████████████████▋                                                                                                            | 4/24 [00:04<00:2

In [42]:
eval_df

Unnamed: 0_level_0,Code critère,CI3,CI4,CI5,CI6,CI8,CPM1,CPM2,CPM3,CPM4,CPM5,...,I1,I2,CA1,CA2,CA3,CA4,CA5,CA6,CA7,CA8
sample_id,annotator_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
609,1,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
1555,1,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
1135,1,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
923,1,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
915,1,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,2,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
1588,2,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
1142,2,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
620,2,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,


In [55]:
gen_df = pd.read_csv("./data/texts.csv", index_col=list(range(4)))
gen_df.index = gen_df.index.set_names(['sample_id'] + gen_df.index.names[1:])

# TODO: select in gen_df les sample_id in eval_df

In [73]:
gen_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,inputs,value
sample_id,model,test_set,input_id,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"('llamaVIIIinst', 'rag', 'etrfr+orangesum+wikilarge')",test_etr_fr_politic,0,"Travailleuses, travailleurs, En avril 2020, p...",Macron a dit en avril 2020 que le pays tenait ...
1,"('llamaVIIIinst', 'rag', 'etrfr+orangesum+wikilarge')",test_etr_fr_politic,1,PROFITS EN HAUSSE ET POUVOIR D’ACHAT EN BAISSE...,Les grandes entreprises sont de plus en plus r...
2,"('llamaVIIIinst', 'rag', 'etrfr+orangesum+wikilarge')",test_etr_fr_politic,2,AUGMENTATION DES SALAIRES ET DES PENSIONS! Les...,Les travailleurs et les retraités veulent des ...
3,"('llamaVIIIinst', 'rag', 'etrfr+orangesum+wikilarge')",test_etr_fr_politic,3,CONTRÔLE DES TRAVAILLEURS SUR LES COMPTES DES ...,Nous devons contrôler les comptes des grandes ...
4,"('llamaVIIIinst', 'rag', 'etrfr+orangesum+wikilarge')",test_etr_fr_politic,4,"HÔPITAUX, EHPAD: À BAS LA LOI DU PROFIT! 100 0...",La gestion comptable de la santé publique est ...
...,...,...,...,...,...
1801,"('mistVIIinst', 'rag', 'etrfr+orangesum+wikilarge')",test_etr_fr,48,– Tu me permets de rester dormir ici cette nui...,Bou demande à Jules s'il peut dormir chez lui ...
1802,"('mistVIIinst', 'rag', 'etrfr+orangesum+wikilarge')",test_etr_fr,49,Un intrus dans la maison – Maow… cela faisait ...,Un intrus a pénétré dans la maison. Bou se rév...
1803,"('mistVIIinst', 'rag', 'etrfr+orangesum+wikilarge')",test_etr_fr,50,"– J'ai fini la bouteille qui était au frais, i...",Le père de Jules a fini sa bouteille et demand...
1804,"('mistVIIinst', 'rag', 'etrfr+orangesum+wikilarge')",test_etr_fr,51,"Et puis, d'un coup, tout se déroule très vite....","Jules a un nouveau compagnon : Bou, un chat ab..."


In [77]:
df = (gen_df
    .reset_index(["model", "test_set", "input_id"])
    .merge(
        eval_df.reset_index(["annotator_id"]), 
        left_on="sample_id", right_on='sample_id'
    )
)
df

Unnamed: 0_level_0,model,test_set,input_id,inputs,value,annotator_id,CI3,CI4,CI5,CI6,...,I1,I2,CA1,CA2,CA3,CA4,CA5,CA6,CA7,CA8
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
602,"('llamaVIII', 'lora', 'etrfr')",test_etr_fr_politic,0,"Travailleuses, travailleurs, En avril 2020, p...","Les travailleurs des hôpitaux et des Ehpad, le...",7,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
603,"('llamaVIII', 'lora', 'etrfr')",test_etr_fr_politic,1,PROFITS EN HAUSSE ET POUVOIR D’ACHAT EN BAISSE...,Les profits des grandes entreprises sont en ha...,3,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
605,"('llamaVIII', 'lora', 'etrfr')",test_etr_fr_politic,3,CONTRÔLE DES TRAVAILLEURS SUR LES COMPTES DES ...,Les travailleurs doivent contrôler les comptes...,9,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
606,"('llamaVIII', 'lora', 'etrfr')",test_etr_fr_politic,4,"HÔPITAUX, EHPAD: À BAS LA LOI DU PROFIT! 100 0...",100 000 lits d’hôpitaux ont été supprimés en 2...,6,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
607,"('llamaVIII', 'lora', 'etrfr')",test_etr_fr_politic,5,TRAVAILLEURS FRANÇAIS ET IMMIGRÉS: UN MÊME CAM...,Les travailleurs étrangers sont très utiles. S...,7,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1627,"('llamaVIII', 'mtllora', 'etrfr+orangesum+wiki...",test_etr_fr,46,Victoire! Jules n'arrive pas à s'endormir. Il ...,Jules n'arrive pas à s'endormir. Jules a peur ...,6,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
1628,"('llamaVIII', 'mtllora', 'etrfr+orangesum+wiki...",test_etr_fr,47,"Non, on n'abandonne pas ses amis! Jules souffl...","« Non, on n'abandonne pas ses amis! » Jules so...",8,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
1630,"('llamaVIII', 'mtllora', 'etrfr+orangesum+wiki...",test_etr_fr,49,Un intrus dans la maison – Maow… cela faisait ...,Un intrus dans la maison – Bou et Jules se rév...,7,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,
1631,"('llamaVIII', 'mtllora', 'etrfr+orangesum+wiki...",test_etr_fr,50,"– J'ai fini la bouteille qui était au frais, i...",Le père de Jules va chercher une autre bouteil...,4,Non Applicable,Non Applicable,Non Applicable,Non Applicable,...,Non Applicable,Non Applicable,,,,,,,,


In [88]:
criteria = ['CI3', 'CI4', 'CI5', 'CI6', 'CI8', 'CPM1',
       'CPM2', 'CPM3', 'CPM4', 'CPM5', 'CPM6', 'CPM7', 'CPM8', 'CPM9', 'CPM10',
       'CPM11', 'CPM12', 'CPM13', 'CPM14', 'CPM15', 'CPM16', 'CPM17', 'CPM18',
       'CPM19', 'CPM20', 'CPM21', 'I1', 'I2', 'CA1', 'CA2', 'CA3', 'CA4',
       'CA5', 'CA6', 'CA7', 'CA8']

# Melting the DataFrame
df_melted = df.reset_index().melt(
    id_vars=df.columns[:6].to_list(),
    value_vars=criteria,
    var_name='criteria',
    # value_name='value'
)

ValueError: value_name (value) cannot match an element in the DataFrame columns.

In [14]:
themes_crit = {
    "Choix de l'information": ['CI3', 'CI4', 'CI5', 'CI6', 'CI8'],
    "Choix des mots": ['CPM2', 'CPM3', 'CPM4',
       'CPM5', 'CPM6', 'CPM8', 'CPM9',  'CPM16', 'CPM17', 'CPM18', 'CPM19',
       'CPM20', 'CPM21'],
    "Construction des phrases": [
        'CPM1', "CPM7", "CPM10", "CPM11", "CPM12", 'CPM13', 'CPM14','CPM15',
    ],
    "Illustrations": ['I1', 'I2'],
    "Qualité globales": ['CA1', 'CA2', 'CA3', 'CA4',
       'CA5', 'CA6', 'CA7', 'CA8'],
}
crit_themes = {
    crit: k for k, v in themes_crit.items() for crit in v
}
type_crit = {
    "ETR": ['CI3', 'CI4', 'CI5', 'CI6', 'CI8', 'CPM1',
       'CPM2', 'CPM3', 'CPM4', 'CPM5', 'CPM6', 'CPM7', 'CPM8', 'CPM9', 'CPM10',
       'CPM11', 'CPM12', 'CPM13', 'CPM14', 'CPM15', 'CPM16', 'CPM17', 'CPM18',
       'CPM19', 'CPM20', 'CPM21', 'I1', 'I2'],
    "global quality": ['CA1', 'CA2', 'CA3', 'CA4',
       'CA5', 'CA6', 'CA7', 'CA8']
}
crit_type = {
    crit: k for k, v in type_crit.items() for crit in v
}
df_melted["criteria_theme"] = df_melted["criteria"].replace(crit_themes)
df_melted["criteria_type"] = df_melted["criteria"].replace(crit_type)
df_melted["annotator"] = df_melted["annotator"].replace({"elisa": "coder_1", "melanie": "coder_2", "stanislas":"coder_3"})

In [15]:
# df_melted = df_melted.pivot(index=['example_id', 'model', 'source', 'target', 'prediction', 
#                                    'dataset', 'sample_id', 'criteria', 'criteria_theme', 'criteria_type'],
#                            columns='annotator',
#                            values='value').reset_index()

In [16]:
crit_to_score = {
    'CI3': 2,
    'CI4': 6,
    'CI5': 2,
    'CI6': 2,
    'CI8': 2,
    'CPM1': 6,
    'CPM2': 6,
    'CPM3': 6,
    'CPM4': 2,
    'CPM5': 6,
    'CPM6': 4,
    'CPM7': 2,
    'CPM8': 2,
    'CPM9': 2,
    'CPM10': 4,
    'CPM11': 2,
    'CPM12': 4,
    'CPM13': 4,
    'CPM14': 4,
    'CPM15': 2,
    'CPM16': 2,
    'CPM17': 2,
    'CPM18': 2,
    'CPM19': 2,
    'CPM20': 2,
    'CPM21': 2,
    'I1': 4,
    'I2': 4
}
df_melted["criteria_score"] = df_melted["criteria"].replace(crit_to_score)

In [17]:
df_melted.to_csv(Path("./annot_tables/")/"results_melted.csv")