In [1]:
import torch
from fast_bert.data_cls import BertDataBunch
from fast_bert.learner_cls import BertLearner
from fast_bert.data_lm import BertLMDataBunch
from fast_bert.learner_lm import BertLMLearner
from fast_bert.metrics import fbeta, roc_auc
from fast_bert.prediction import BertClassificationPredictor
from pathlib import Path
import pandas as pd
import logging

logger = logging.getLogger()
device_cuda = torch.device("cuda")

In [2]:
torch.cuda.get_arch_list()

['sm_37',
 'sm_50',
 'sm_60',
 'sm_61',
 'sm_70',
 'sm_75',
 'sm_80',
 'sm_86',
 'compute_37']

In [3]:
torch.cuda.reset_peak_memory_stats()

In [4]:
DATA_PATH = Path('C:/Users/agallais/VScodeProjects/lyrics/data/')
LOG_PATH = Path('C:/Users/agallais/VScodeProjects/lyrics/logs/')
MODEL_PATH = Path('C:/Users/agallais/VScodeProjects/lyrics/model/')
LABEL_PATH = Path('C:/Users/agallais/VScodeProjects/lyrics/labels/')

In [5]:
df_paroles_genres = pd.read_csv(f"{DATA_PATH}/data_lyrics.csv")
df_paroles_genres = df_paroles_genres.dropna(axis=0)
df_paroles_genres

Unnamed: 0.1,Unnamed: 0,artist,artist_id,artist_process,title,url_lyrics,lyrics,GENRES_MAX,MAX_CERTAINTY,detect
0,0,France Gall,4675,france-gall,Aime-la,https://www.paroles.net/france-gall/paroles-ai...,". La chance a tourné dans ta vie, tes amours. ...",Pop,True,fr
1,1,France Gall,4675,france-gall,Amor tambien,https://www.paroles.net/france-gall/paroles-am...,Y a des jours fragiles. Des soirées difficiles...,Pop,True,fr
2,2,France Gall,4675,france-gall,Babacar,https://www.paroles.net/france-gall/paroles-ba...,J'ai ton cœur qui tape qui cogne. Dans mon cor...,Pop,True,fr
3,3,France Gall,4675,france-gall,Bébé comme la vie,https://www.paroles.net/france-gall/paroles-be...,"Y a des couleurs qu'on oublie pas. Le cœur, qu...",Pop,True,fr
4,4,France Gall,4675,france-gall,C'est Bon Que Tu Sois Là,https://www.paroles.net/france-gall/paroles-c-...,Tous ces mots qui condamnent. Comme c'est dur ...,Pop,True,fr
...,...,...,...,...,...,...,...,...,...,...
21770,26562,Dub Silence,1585174,dub-silence,It's Time,https://www.paroles.net/dub-silence/paroles-it...,On ne le dirait pas. Mais tout le monde en a. ...,Pop,False,fr
21771,26563,Dub Silence,1585174,dub-silence,L'hymne des légumes,https://www.paroles.net/dub-silence/paroles-l-...,"Mon style est vrai, j’ai le flow, toujours fra...",Pop,False,fr
21772,26565,Dub Silence,1585174,dub-silence,MAJ,https://www.paroles.net/dub-silence/paroles-maj,Il y a un petit bout de temps que je vous ai r...,Pop,False,fr
21773,26566,Dub Silence,1585174,dub-silence,Matchgirl,https://www.paroles.net/dub-silence/paroles-ma...,"Demain c'est le 25 décembre, la ville s'est pa...",Pop,False,fr


In [6]:
df_paroles_genres['GENRES_MAX'].value_counts()

Pop                    8012
Rap                    3859
Chanson française      2292
Rock                   2035
Reggae                  955
Alternative             354
Jazz                    239
R&B                     232
Latino                  178
Soul                    121
Musique africaine       114
Electro                 108
Singer & Songwriter      17
Bandes originales        13
Name: GENRES_MAX, dtype: int64

In [7]:
df = df_paroles_genres[df_paroles_genres['GENRES_MAX'].isin(['Pop','Rap','Chanson française','Rock','Reggae'])]
df = pd.concat([df[["title", "lyrics"]], pd.get_dummies(df[["GENRES_MAX"]], prefix="Genre")], axis=1)
df

Unnamed: 0,title,lyrics,Genre_Chanson française,Genre_Pop,Genre_Rap,Genre_Reggae,Genre_Rock
0,Aime-la,". La chance a tourné dans ta vie, tes amours. ...",0,1,0,0,0
1,Amor tambien,Y a des jours fragiles. Des soirées difficiles...,0,1,0,0,0
2,Babacar,J'ai ton cœur qui tape qui cogne. Dans mon cor...,0,1,0,0,0
3,Bébé comme la vie,"Y a des couleurs qu'on oublie pas. Le cœur, qu...",0,1,0,0,0
4,C'est Bon Que Tu Sois Là,Tous ces mots qui condamnent. Comme c'est dur ...,0,1,0,0,0
...,...,...,...,...,...,...,...
21770,It's Time,On ne le dirait pas. Mais tout le monde en a. ...,0,1,0,0,0
21771,L'hymne des légumes,"Mon style est vrai, j’ai le flow, toujours fra...",0,1,0,0,0
21772,MAJ,Il y a un petit bout de temps que je vous ai r...,0,1,0,0,0
21773,Matchgirl,"Demain c'est le 25 décembre, la ville s'est pa...",0,1,0,0,0


In [8]:
val_set = df.sample(frac=0.2, replace=False, random_state=42)
train_set = df.drop(index = val_set.index)
print('Nombre de chansons dans le val_set:',len(val_set))
print('Nombre de chansons dans le train_set:', len(train_set))
val_set.to_csv(f"{DATA_PATH}/val_set.csv")
train_set.to_csv(f"{DATA_PATH}/train_set.csv")

Nombre de chansons dans le val_set: 3431
Nombre de chansons dans le train_set: 13722


In [9]:
labels = df.columns[2:].to_list()
with open(f"{LABEL_PATH}/labels.txt", 'w') as f:
    for i in labels:
        f.write(i + "\n")

In [10]:
all_texts = df['lyrics'].to_list()
print('Nombre de chansons:', len(all_texts))

Nombre de chansons: 17153


### Création de LMDataBunch

In [11]:
databunch_lm = BertLMDataBunch.from_raw_corpus(
                    data_dir=DATA_PATH,
                    text_list=all_texts,
                    tokenizer='camembert-base',
                    batch_size_per_gpu=5,
                    max_seq_length=512,
                    multi_gpu=False,
                    model_type='camembert-base',
                    logger=logger)

### Création de LMLearner

In [12]:
lm_learner = BertLMLearner.from_pretrained_model(
                            dataBunch=databunch_lm,
                            pretrained_path='camembert-base',
                            output_dir=MODEL_PATH,
                            metrics=[],
                            device=device_cuda,
                            logger=logger,
                            multi_gpu=False,
                            logging_steps=50,
                            fp16_opt_level="O2")

Some weights of CamembertForMaskedLM were not initialized from the model checkpoint at camembert-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
lm_learner.fit(epochs=30,
            lr=1e-4,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="adamw")

RuntimeError: CUDA out of memory. Tried to allocate 312.00 MiB (GPU 0; 4.00 GiB total capacity; 3.42 GiB already allocated; 0 bytes free; 3.46 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
lm_learner.validate()

In [None]:
learner.save_model()

### Création de databunch pour la classification

In [None]:
databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='camembert-base',
                          train_file='train_set.csv',
                          val_file='val_set.csv',
                          label_file='labels.txt',
                          text_col='review',
                          label_col=['cadre/atmosphère','probleme technique',"temps d'attente",'accueil/relation commerciale'],
                          batch_size_per_gpu=16,
                          max_seq_length=512,
                          multi_gpu=False,
                          multi_label=True,
                          model_type='camembert-base')

### Création de Learner

In [None]:
metrics = [{'name': 'fbeta', 'function': fbeta}, {'name': 'roc_auc', 'function': roc_auc}]
OUTPUT_DIR = Path('./finetuned_model')
WGTS_PATH = Path('model/model_out/pytorch_model.bin')

In [None]:
cl_learner = BertLearner.from_pretrained_model(
                        databunch,
                        pretrained_path='model/model_out',
                        metrics=metrics,
                        device=device_cuda,
                        logger=logger,
                        output_dir=OUTPUT_DIR,
                        finetuned_wgts_path=WGTS_PATH,
                        warmup_steps=300,
                        multi_gpu=False,
                        multi_label=True,
                        is_fp16=True,
                        logging_steps=50)

In [None]:
cl_learner.fit(epochs=30,
            lr=9e-5,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="adamw")

In [None]:
cl_learner.validate()

In [None]:
class_learner.save_model()

### Prédictions

In [None]:
predictor = BertClassificationPredictor(
                model_path='finetuned_model/model_out',
                label_path='labels/',
                multi_label=True,
                model_type='camembert-base',
                do_lower_case=False)

In [None]:
predictor.predict("Texte à classer")