In [8]:
import plotly.express as px
import pandas as pd
import json
import os
os.environ['USER'] = "rmoine"
from pathlib import Path
import numpy as np
from typing import *
from llama.main import compute_metrics_from_files, DatasetName, compute_metrics_from_list
from sklearn.metrics import f1_score
from PIL import Image
from pandasql import sqldf
from io import BytesIO
import base64
from plotly.graph_objects import Figure
import win32clipboard
import textwrap
import colorsys
import optuna
import tqdm
def img_to_clipboard(fig: Figure):
    img_bytes = fig.to_image(format="png")
    image = Image.open(BytesIO(img_bytes))
    
    output = BytesIO()
    image.convert('RGB').save(output, 'BMP')
    data = output.getvalue()[14:]
    output.close()
    win32clipboard.OpenClipboard()
    win32clipboard.EmptyClipboard()
    win32clipboard.SetClipboardData(win32clipboard.CF_DIB, data)
    win32clipboard.CloseClipboard()

In [9]:
l = []
l1 = []
root = Path(f"../../data/finetuning/")
i_tot = 0
pathes = list(root.rglob("qlora_finetune_*"))[:-1]
for path_data in tqdm.tqdm(pathes):
    with open(path_data / "parameters.json") as fp:
        parameters = json.load(fp)
    for dataset_type in ["train","val"]:
        files_data = list(path_data.rglob(f"data_epoch_*_{dataset_type}*.json"))
        if len(files_data) == 0:
            continue
        i_tot += 1
        i = i_tot//2
        for f in files_data:
            with open(f) as fp:
                d = json.load(fp)
            confusion_matrix, f1, _ = compute_metrics_from_list(d, pred_field="prediction")
            accuracy = np.sum(np.diag(confusion_matrix)) / np.sum(confusion_matrix)
            epoch = int(f.stem.split("_")[-2].split("-")[0])
            loss = sum(e['loss'] for e in d)/len(d)
            l.append({"epoch":epoch,"dataset_type":dataset_type,"confusion_matrix":confusion_matrix,"f1":f1,"accuracy":accuracy*100, "path_data_folder": path_data.stem, "loss": loss, "i":i, **parameters, "hyperparameters": parameters})
df = pd.DataFrame(l)
display(df.head())
    


100%|██████████| 56/56 [02:42<00:00,  2.90s/it]


Unnamed: 0,epoch,dataset_type,confusion_matrix,f1,accuracy,path_data_folder,loss,i,dataset_choice,folder_out,...,learning_rate,limit_tokens,mapping_dict,lim_size,id,use_cpu,tr_weighted_sampling,early_stopping_patience,early_stopping_threshold,hyperparameters
0,0,train,"[[6134, 6530], [3050, 34820]]","[0.561515928231417, 0.8790709416813934]",81.042466,qlora_finetune_19459371,0.511686,0,eclipse_72k,/project/6023391/rmoine/data,...,0.0001,500,,-1,_19459371,False,False,3,0.001,"{'dataset_choice': 'eclipse_72k', 'folder_out'..."
1,1,train,"[[7892, 4772], [2313, 35557]]","[0.6901919629192357, 0.9093978183864244]",85.979736,qlora_finetune_19459371,0.410209,0,eclipse_72k,/project/6023391/rmoine/data,...,0.0001,500,,-1,_19459371,False,False,3,0.001,"{'dataset_choice': 'eclipse_72k', 'folder_out'..."
2,2,train,"[[9425, 3239], [1482, 36388]]","[0.7997115099062407, 0.9390815128327549]",90.657775,qlora_finetune_19459371,0.32256,0,eclipse_72k,/project/6023391/rmoine/data,...,0.0001,500,,-1,_19459371,False,False,3,0.001,"{'dataset_choice': 'eclipse_72k', 'folder_out'..."
3,3,train,"[[10771, 1893], [679, 37191]]","[0.893339968483039, 0.9665774358707799]",94.910357,qlora_finetune_19459371,0.208422,0,eclipse_72k,/project/6023391/rmoine/data,...,0.0001,500,,-1,_19459371,False,False,3,0.001,"{'dataset_choice': 'eclipse_72k', 'folder_out'..."
4,1,val,"[[1883, 1735], [524, 10296]]","[0.6250622406639004, 0.9011421819614021]",84.353789,qlora_finetune_19459371,0.440984,1,eclipse_72k,/project/6023391/rmoine/data,...,0.0001,500,,-1,_19459371,False,False,3,0.001,"{'dataset_choice': 'eclipse_72k', 'folder_out'..."


In [14]:
def dump_to_optuna(df: pd.DataFrame, metric: Literal["accuracy","f1_avg"], interesting_parameters: Optional[List[str]] = None):
    df1 = df[["epoch","name",metric,"dataset_type"]]
    query = f"""SELECT * FROM df1 WHERE name IN (SELECT name FROM df1 WHERE dataset_type == 'train' GROUP BY name HAVING {metric} == MAX({metric})) AND dataset_type == 'val'"""
    df_best = sqldf(query, locals())
    df_best = df.loc[df_best.index,:]
    df_best = df_best.query("dataset_type == 'val'")
    study_name = "finetuning_llama2"
    storage = f"sqlite:///study-finetuning_llama2.db"
    path_db = Path(storage.split("/")[-1])
    if path_db.exists():
        path_db.unlink()
    study = optuna.create_study(direction="maximize",study_name=study_name, storage=storage)
    distributions = {}
    if interesting_parameters is None:
        interesting_parameters = df_best.columns
    Lremoved = []
    for i,k in enumerate(interesting_parameters):
        poss = sorted(df_best[k].unique().tolist())
        print(k,poss)
        if len(poss) <= 1:
            print(f"skipping {k} as there are no choices")
            Lremoved.append(i)
            continue
        distributions[k] = optuna.distributions.CategoricalDistribution(poss)
    interesting_parameters = [e for i,e in enumerate(interesting_parameters) if i not in Lremoved]
    print(distributions)
    for data_best in df_best.to_dict(orient="records"):
        params = {k:v for k,v in data_best["hyperparameters"].items() if k in interesting_parameters}
        trial = optuna.trial.create_trial(
            params=params,
            distributions=distributions,
            value=data_best[metric]
        )
        study.add_trial(trial)
    def mock_function(trial):
        for k,v in distributions.items():
            x = trial.suggest_categorical(k,v)
        return 0
    study.optimize(mock_function, n_trials=0)
    del study
# print(df.columns)
if not Path("tmp.json").exists():
    df.sort_values(["dataset_choice","dataset_type","epoch"],inplace=True)
    df["weighted"] = df["tr_weighted_sampling"].apply(lambda x:"weighted" if x else "not_weighted")
    df["name"] = df['i'].astype(str) + " " + df["dataset_choice"] + ", " + df["weighted"] + ", r=" + df["lora_r"].astype(str)
    df["id"] = df["name"] + ", " + df["dataset_type"]
    df["f1_avg"] = df["f1"].apply(lambda x:np.array(x).mean())
    df.sort_values(by=["i","dataset_type","epoch"],inplace=True)
    df.to_json("./tmp.json")
else:
    df = pd.read_json("./tmp.json")
interesting_parameters = [
    "dataset_choice",
    "lora_alpha",
    "lora_dropout",
    "lora_r",
    "model_name",
    "tr_bs",
    "learning_rate",
    "limit_tokens",
    "tr_weighted_sampling",
    "early_stopping_patience",
    "early_stopping_threshold"
]
dump_to_optuna(df, metric="accuracy",interesting_parameters=interesting_parameters)
# display(df)


[I 2023-12-01 11:20:45,728] A new study created in RDB with name: finetuning_llama2


dataset_choice ['eclipse_72k']
skipping dataset_choice as there are no choices
lora_alpha [4, 10]
lora_dropout [0.0, 0.1, 0.2]
lora_r [5, 10, 64]
model_name ['meta-llama/Llama-2-7b-chat-hf']
skipping model_name as there are no choices
tr_bs [4]
skipping tr_bs as there are no choices
learning_rate [1e-05, 0.0001]
limit_tokens [500]
skipping limit_tokens as there are no choices
tr_weighted_sampling [False]
skipping tr_weighted_sampling as there are no choices
early_stopping_patience [3]
skipping early_stopping_patience as there are no choices
early_stopping_threshold [0.001]
skipping early_stopping_threshold as there are no choices
{'lora_alpha': CategoricalDistribution(choices=(4, 10)), 'lora_dropout': CategoricalDistribution(choices=(0.0, 0.1, 0.2)), 'lora_r': CategoricalDistribution(choices=(5, 10, 64)), 'learning_rate': CategoricalDistribution(choices=(1e-05, 0.0001))}


In [11]:
l = df.iloc[0]
width, height = 800, 600
title_attrs = {
    "bs":l.tr_bs,
    "lora_alpha":l.lora_alpha,
    "lora_dropout":l.lora_dropout,
    "model_name":l.model_name.split('/')[1],
    "learning_rate": l.learning_rate,
    
}
unique_datasets = list(df["name"].unique())[::2]
print(unique_datasets)
title = "<br>".join(textwrap.wrap(" ; ".join(f"{k}={v}" for k,v in title_attrs.items()), width=75))
color_map = {}
for i, dataset in enumerate(sorted(unique_datasets)):
    base_color = [int(i*360/len(unique_datasets)),1,1]
    i = int(dataset.split(" ")[0])
    dataset = " ".join(dataset.split(" ")[1:])
    color_map[f"{i} {dataset}, train"] = f'hsv({base_color[0]}, 1, 1)'
    color_map[f"{i+1} {dataset}, val"] = f'hsv({base_color[0]}, 0.5, 0.9)'
print("-----------------")
for k,v in sorted(color_map.items(),key=lambda x:int(x[0].split(" ")[0])):
    print(k,v)

['0 eclipse_72k, not_weighted, r=64', '1 eclipse_72k, not_weighted, r=64', '2 eclipse_72k, not_weighted, r=5', '4 eclipse_72k, not_weighted, r=64', '6 eclipse_72k, not_weighted, r=5', '7 eclipse_72k, not_weighted, r=10', '8 eclipse_72k, not_weighted, r=5', '9 eclipse_72k, not_weighted, r=5', '11 eclipse_72k, not_weighted, r=10', '12 eclipse_72k, not_weighted, r=64', '13 eclipse_72k, not_weighted, r=10', '14 eclipse_72k, not_weighted, r=10', '15 eclipse_72k, not_weighted, r=10', '16 eclipse_72k, not_weighted, r=64', '18 eclipse_72k, not_weighted, r=64', '19 eclipse_72k, not_weighted, r=10', '20 eclipse_72k, not_weighted, r=10', '22 eclipse_72k, not_weighted, r=10', '23 eclipse_72k, not_weighted, r=10', '24 eclipse_72k, not_weighted, r=5', '25 eclipse_72k, not_weighted, r=10', '26 mozilla_200k, not_weighted, r=64', '27 mozilla_200k, not_weighted, r=5', '29 mozilla_200k, not_weighted, r=64', '31 mozilla_200k, not_weighted, r=5', '32 mozilla_200k, not_weighted, r=10', '33 mozilla_200k, not

In [12]:
print(color_map)
img = px.line(df,x="epoch",y="accuracy",color="id",color_discrete_map=color_map, width=width, height=height, title=title)
img.update_layout(
    legend=dict(traceorder="normal")
)
img.show()
img_to_clipboard(img)

{'0 eclipse_72k, not_weighted, r=64, train': 'hsv(0, 1, 1)', '1 eclipse_72k, not_weighted, r=64, val': 'hsv(0, 0.5, 0.9)', '1 eclipse_72k, not_weighted, r=64, train': 'hsv(9, 1, 1)', '2 eclipse_72k, not_weighted, r=64, val': 'hsv(9, 0.5, 0.9)', '11 eclipse_72k, not_weighted, r=10, train': 'hsv(18, 1, 1)', '12 eclipse_72k, not_weighted, r=10, val': 'hsv(18, 0.5, 0.9)', '12 eclipse_72k, not_weighted, r=64, train': 'hsv(27, 1, 1)', '13 eclipse_72k, not_weighted, r=64, val': 'hsv(27, 0.5, 0.9)', '13 eclipse_72k, not_weighted, r=10, train': 'hsv(36, 1, 1)', '14 eclipse_72k, not_weighted, r=10, val': 'hsv(36, 0.5, 0.9)', '14 eclipse_72k, not_weighted, r=10, train': 'hsv(46, 1, 1)', '15 eclipse_72k, not_weighted, r=10, val': 'hsv(46, 0.5, 0.9)', '15 eclipse_72k, not_weighted, r=10, train': 'hsv(55, 1, 1)', '16 eclipse_72k, not_weighted, r=10, val': 'hsv(55, 0.5, 0.9)', '16 eclipse_72k, not_weighted, r=64, train': 'hsv(64, 1, 1)', '17 eclipse_72k, not_weighted, r=64, val': 'hsv(64, 0.5, 0.9)',

In [13]:
img = px.line(df,x="epoch",y="loss",color="id",color_discrete_map=color_map, width=width, height=height, title=title)
img.show()
img_to_clipboard(img)