In [1]:
import os
import sys
# Put the notebook's parent directory on the import path (once), so packages
# that live next to the notebook folder can be imported by the cells below.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import numpy as np
import os
import argparse
from time import time
from collections import defaultdict
import matplotlib.pyplot as plt

import torch
from torch.utils.data import DataLoader

from kmembert.dataset import EHRDataset
from kmembert.utils import pretty_time, printc, create_session, save_json, get_label_threshold, get_error, plot_epoch_loss, get_error_IT, plot_epoch_error_IT, my_custom_loss
from kmembert.models.health_bert import CamembertRegressor
from kmembert.testing import test

def _str2bool(value):
    """Convert a command-line token into a real boolean.

    `type=bool` must not be used with argparse: `bool("False")` is `True`
    because any non-empty string is truthy, so `--freeze False` would silently
    enable freezing. This converter maps the usual spellings explicitly.

    Args:
        value: the raw token from the command line (or an actual bool).

    Returns:
        True or False.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean
            spelling; argparse turns this into a normal usage error.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string is kept as False, matching what `bool("")` used to give.
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


# Hyper-parameters and paths of the run, declared exactly as a command-line
# interface would declare them so that `args` has the usual Namespace shape.
parser = argparse.ArgumentParser()

# --- data / task -----------------------------------------------------------
parser.add_argument("-d", "--data_folder", type=str, default="ehr",
    help="data folder name")
parser.add_argument("-m", "--mode", type=str, default="regression", choices=['regression', 'density', 'classif'],
    help="name of the task")
parser.add_argument("-nr", "--nrows", type=int, default=None,
    help="maximum number of samples for training and validation")
parser.add_argument("-dt", "--days_threshold", type=int, default=365,
    help="days threshold to convert into classification task")
parser.add_argument("-nl", "--num_label", type=int, default=3,
    help="number of label to predict")
parser.add_argument("-v", "--voc_file", type=str, default=None,
    help="voc file containing camembert added vocabulary")

# --- optimisation ----------------------------------------------------------
parser.add_argument("-b", "--batch_size", type=int, default=8,
    help="dataset batch size")
parser.add_argument("-e", "--epochs", type=int, default=5,
    help="number of epochs")
parser.add_argument("-drop", "--drop_rate", type=float, default=0.1,
    help="dropout ratio. By default, None uses p=0.1")
parser.add_argument("-lr", "--learning_rate", type=float, default=1e-4,
    help="model learning rate")
parser.add_argument("-r_lr", "--ratio_lr_embeddings", type=float, default=1,
    help="the ratio applied to lr for embeddings layer")
parser.add_argument("-wg", "--weight_decay", type=float, default=0,
    help="the weight decay for L2 regularization")
parser.add_argument("-p", "--patience", type=int, default=4,
    help="number of decreasing accuracy epochs to stop the training")
# Bare `-f` yields const=True; `-f <value>` goes through _str2bool; omitting
# the flag keeps the default False.
parser.add_argument("-f", "--freeze", type=_str2bool, default=False, const=True, nargs="?",
    help="whether or not to freeze the Bert part")

# --- logging / checkpoints -------------------------------------------------
parser.add_argument("-k", "--print_every_k_batch", type=int, default=1,
    help="prints training loss every k batch")
parser.add_argument("-r", "--resume", type=str, default=None,
    help="result folder in which the saved checkpoint will be reused")

# Parse an explicit empty argument list so every option takes its default and
# argparse does not fall back to reading the kernel's own sys.argv.
args = parser.parse_args([])

In [3]:
# Open a session from the parsed arguments. create_session returns four values;
# the dataset path, the device and the config object are kept, the second value
# is discarded.
path_dataset, _, device, config = create_session(args)

# Compute the label threshold and store it on the config before the datasets
# are built.
# NOTE(review): presumably EHRDataset reads config.label_threshold — confirm.
config.label_threshold = get_label_threshold(config, path_dataset)

# Build the training and validation datasets from the same dataset path/config.
train_dataset, validation_dataset = EHRDataset.get_train_validation(path_dataset, config)

# Training batches use the configured batch size and are shuffled; validation
# is fed one sample at a time.
# NOTE(review): the validation loader is shuffled as well (shuffle=True) —
# confirm this is intended.
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=1, shuffle=True)

# Instantiate the model from the device and the config.
model = CamembertRegressor(device, config)

# NOTE(review): this import sits in the middle of the cell rather than in the
# imports cell — consider moving it there.
from kmembert.training import train_and_validate
# Run training and validation; config.path_result is passed as the last argument.
train_and_validate(model, train_loader, validation_loader, device, config, config.path_result)

[1m> DEVICE:  cpu[0m
[1m> ROOT:    c:\Users\DIPIAZZA\Documents\CLB Projet\Projet1\kmembert_pytorch_help\KmemBERT[0m
[1m> SESSION: c:\Users\DIPIAZZA\Documents\CLB Projet\Projet1\kmembert_pytorch_help\KmemBERT\results\ipykernel_launcher_22-04-07_10h21m09s[0m
[1m
Loading camembert and its tokenizer...[0m
[92mSuccessfully loaded
[0m
[94m
----- STARTING TRAINING -----[0m
> EPOCH 0
tensor([[ 1.,  0.,  0.],
        [ 1.,  1., -1.],
        [ 1.,  1., -1.],
        [ 1.,  1., -1.],
        [ 1.,  1., -1.],
        [ 1.,  0.,  0.],
        [ 1.,  0.,  0.],
        [ 1.,  0.,  0.]], dtype=torch.float64)
shape de labels: torch.Size([8, 3])
    [0-1]  -  Average loss: 0.699226  -  Time elapsed: 0m2s
tensor([[ 1.,  1., -1.],
        [ 1.,  0.,  0.],
        [ 1.,  0.,  0.],
        [ 1.,  1., -1.]], dtype=torch.float64)
shape de labels: torch.Size([4, 3])
    [1-2]  -  Average loss: 0.696524  -  Time elapsed: 0m2s
[95m    Training   | Error: 0.7 - Global average loss: 0.697875 - Time e

0.5363740881003013

In [4]:
import os
import pandas as pd

# Locate the training CSV relative to the notebook's parent directory instead
# of a hard-coded absolute path on one user's machine, so the cell runs from
# any checkout.
# NOTE(review): assumes the notebook's parent directory is the project root
# that contains data/ehr/train.csv — confirm.
TRAIN_CSV_PATH = os.path.join(os.path.abspath(".."), "data", "ehr", "train.csv")
df = pd.read_csv(TRAIN_CSV_PATH)

# Hand-written values for three extra columns, one value per CSV row. The
# lists hold 18 values, so pandas raises if the file does not have exactly 18
# rows. Every value is 1 except:
#   row 12 -> f2 = 0, f3 = 0
#   row 14 -> f3 = -1
# NOTE(review): these look like per-row multi-label targets — confirm.
df["f1"] = [1] * 18
df["f2"] = [1] * 12 + [0] + [1] * 5
df["f3"] = [1] * 12 + [0, 1, -1] + [1] * 3

# Show only rows 12-14, which include the two rows whose values differ from 1.
df[["Texte", "f1", "f2", "f3"]].iloc[[12, 13, 14]]

Unnamed: 0,Texte,f1,f2,f3
12,10 #$ DEPARTEMENT DE MEDECINE - ONCO-UROLOGIE....,1,0,0
13,11 #$ DEPARTEMENT DE MEDECINE - ONCO-UROLOGIE ...,1,1,1
14,ABC #$ DEPARTEMENT DE MEDECINE - ONCO-UROLOGIE...,1,1,-1
