# Projet 7 - Catégorisez automatiquement des questions

### Espace de bibliothèques

In [1]:
import pandas as pd
import torch
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
import datasets

In [2]:
torch.__version__

'1.12.1+cu113'

# BERT

In [3]:
X = pd.read_csv("/home/fayz/Documents/OpenClassrooms/Projet7/X.csv")
NombreDeTags = 200
labels = list(X.columns[1:])

In [4]:
X.head()

Unnamed: 0,sentence_title_bow,python,java,javascript,c#,ios,android,c++,.net,iphone,...,facebook,colors,apache-spark-sql,cookies,loops,core-data,mvvm,dom,architecture,ssh
0,print array without brackets commas porting ha...,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,allen holub wrote you never use get set functi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,how get directory listing how scan directory f...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,returning datatables wcf .net wcf service want...,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,element visible error able click element want ...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
X_train_val, X_test = train_test_split(X, test_size=0.15, random_state=42)
X_train, X_val = train_test_split(X_train_val, test_size=0.3, random_state=42)

In [5]:
MAX_LEN = 300
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 2
LEARNING_RATE = 1e-05

In [6]:
from transformers import BertTokenizer, BertModel

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
X.sentence_title_bow[3]

'returning datatables wcf .net wcf service want return datatable know often highly debated topic far whether returning datatables good practice let put aside moment. when create datatable scratch problems whatsoever the table created populated returned client well pre code datacontract public datatable gettbl datatable tbl new datatable testtbl int i=0 100 tbl.columns.add tbl.rows.add new string testvalue return tbl code pre however soon hit database create table get communicationexception the underlying connection closed the connection closed unexpectedly pre code datacontract public datatable gettbl datatable tbl new datatable testtbl populate table sql query return tbl code pre the table populated correctly server side significantly smaller test table looped returned query small fast issue timeouts large data transfer the exact functions datacontracts servicecontracts behaviorcontracts used. why would way table populated bearing table returning successfully'

In [9]:
a = tokenizer(X.sentence_title_bow[3])
len(a["input_ids"])

196

In [10]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['sentence_title_bow']
        self.targets = self.df[labels].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

In [11]:
train_size = 0.6
train_df = X
train_df = train_df.sample(frac=train_size, random_state=200).reset_index()
val_df = train_df.drop(['index'], axis = 1).reset_index(drop = True)

In [12]:
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

In [13]:
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=False
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False
)

In [14]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [15]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [16]:
import shutil

In [17]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', problem_type="multi_label_classification",return_dict=True)
        #self.bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', problem_type="multi_label_classification",return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, NombreDeTags)
    
    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [18]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [19]:
val_targets=[]
val_outputs=[]

In [20]:
def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path):
   
  # initialize tracker for minimum validation loss
  valid_loss_min = np.Inf

  for epoch in range(1, n_epochs+1):
    train_loss = 0
    valid_loss = 0
    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        
        print('epoch', batch_idx)
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if batch_idx%50==0:
            print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')
            epoch = epoch +1
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        #print('after loss data in training', loss.item(), train_loss)
    
    print('############# Epoch {}: Training End     #############'.format(epoch))
    
    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################    
    # validate the model #
    ######################
 
    model.eval()
   
    with torch.no_grad():
      for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            #val_targets.extend(targets.device().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

      print('############# Epoch {}: Validation End     #############'.format(epoch))
      # calculate average losses
      #print('before cal avg train loss', train_loss)
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)
      # print training/validation statistics 
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, 
            train_loss,
            valid_loss
            ))
      
      # create checkpoint variable and add important data
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }
        
        # save checkpoint
      save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        
      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
        # save checkpoint as best model
        save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss

    print('############# Epoch {}  Done   #############\n'.format(epoch))

  return model

In [21]:
ckpt_path = f'/home/fayz/Documents/OpenClassrooms/Projet7/curr_ckpt4'
best_model_path = f'/home/fayz/Documents/OpenClassrooms/Projet7/best_model4.pt'

In [22]:
%%time
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

############# Epoch 1: Training Start   #############
epoch 0
Epoch: 1, Training Loss:  0.7005751132965088
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9
epoch 10
epoch 11
epoch 12
epoch 13
epoch 14
epoch 15
epoch 16
epoch 17
epoch 18
epoch 19
epoch 20
epoch 21
epoch 22
epoch 23
epoch 24
epoch 25
epoch 26
epoch 27
epoch 28
epoch 29
epoch 30
epoch 31
epoch 32
epoch 33
epoch 34
epoch 35
epoch 36
epoch 37
epoch 38
epoch 39
epoch 40
epoch 41
epoch 42
epoch 43
epoch 44
epoch 45
epoch 46
epoch 47
epoch 48
epoch 49
epoch 50
Epoch: 2, Training Loss:  0.5396659970283508
epoch 51
epoch 52
epoch 53
epoch 54
epoch 55
epoch 56
epoch 57
epoch 58
epoch 59
epoch 60
epoch 61
epoch 62
epoch 63
epoch 64
epoch 65
epoch 66
epoch 67
epoch 68
epoch 69
epoch 70
epoch 71
epoch 72
epoch 73
epoch 74
epoch 75
epoch 76
epoch 77
epoch 78
epoch 79
epoch 80
epoch 81
epoch 82
epoch 83
epoch 84
epoch 85
epoch 86
epoch 87
epoch 88
epoch 89
epoch 90
epoch 91
epoch 92
epoch 93
epoch 94
epoch 95
ep

epoch 752
epoch 753
epoch 754
epoch 755
epoch 756
epoch 757
epoch 758
epoch 759
epoch 760
epoch 761
epoch 762
epoch 763
epoch 764
epoch 765
epoch 766
epoch 767
epoch 768
epoch 769
epoch 770
epoch 771
epoch 772
epoch 773
epoch 774
epoch 775
epoch 776
epoch 777
epoch 778
epoch 779
epoch 780
epoch 781
epoch 782
epoch 783
epoch 784
epoch 785
epoch 786
epoch 787
epoch 788
epoch 789
epoch 790
epoch 791
epoch 792
epoch 793
epoch 794
epoch 795
epoch 796
epoch 797
epoch 798
epoch 799
epoch 800
Epoch: 17, Training Loss:  0.08345451205968857
epoch 801
epoch 802
epoch 803
epoch 804
epoch 805
epoch 806
epoch 807
epoch 808
epoch 809
epoch 810
epoch 811
epoch 812
epoch 813
epoch 814
epoch 815
epoch 816
epoch 817
epoch 818
epoch 819
epoch 820
epoch 821
epoch 822
epoch 823
epoch 824
epoch 825
epoch 826
epoch 827
epoch 828
epoch 829
epoch 830
epoch 831
epoch 832
epoch 833
epoch 834
epoch 835
epoch 836
epoch 837
epoch 838
epoch 839
epoch 840
epoch 841
epoch 842
epoch 843
epoch 844
epoch 845
epoch 846
epo

epoch 1460
epoch 1461
epoch 1462
epoch 1463
epoch 1464
epoch 1465
epoch 1466
epoch 1467
epoch 1468
epoch 1469
epoch 1470
epoch 1471
epoch 1472
epoch 1473
epoch 1474
epoch 1475
epoch 1476
epoch 1477
epoch 1478
epoch 1479
epoch 1480
epoch 1481
epoch 1482
epoch 1483
epoch 1484
epoch 1485
epoch 1486
epoch 1487
epoch 1488
epoch 1489
epoch 1490
epoch 1491
epoch 1492
epoch 1493
epoch 1494
epoch 1495
epoch 1496
epoch 1497
epoch 1498
epoch 1499
epoch 1500
Epoch: 31, Training Loss:  0.05137484520673752
epoch 1501
epoch 1502
epoch 1503
epoch 1504
epoch 1505
epoch 1506
epoch 1507
epoch 1508
epoch 1509
epoch 1510
epoch 1511
epoch 1512
epoch 1513
epoch 1514
epoch 1515
epoch 1516
epoch 1517
epoch 1518
epoch 1519
epoch 1520
epoch 1521
epoch 1522
epoch 1523
epoch 1524
epoch 1525
epoch 1526
epoch 1527
epoch 1528
epoch 1529
epoch 1530
epoch 1531
epoch 1532
epoch 1533
epoch 1534
epoch 1535
epoch 1536
epoch 1537
epoch 1538
epoch 1539
epoch 1540
epoch 1541
epoch 1542
epoch 1543
epoch 1544
epoch 1545
epoch 1

epoch 2150
Epoch: 44, Training Loss:  0.05509922280907631
epoch 2151
epoch 2152
epoch 2153
epoch 2154
epoch 2155
epoch 2156
epoch 2157
epoch 2158
epoch 2159
epoch 2160
epoch 2161
epoch 2162
epoch 2163
epoch 2164
epoch 2165
epoch 2166
epoch 2167
epoch 2168
epoch 2169
epoch 2170
epoch 2171
epoch 2172
epoch 2173
epoch 2174
epoch 2175
epoch 2176
epoch 2177
epoch 2178
epoch 2179
epoch 2180
epoch 2181
epoch 2182
epoch 2183
epoch 2184
epoch 2185
epoch 2186
epoch 2187
epoch 2188
epoch 2189
epoch 2190
epoch 2191
epoch 2192
epoch 2193
epoch 2194
epoch 2195
epoch 2196
epoch 2197
epoch 2198
epoch 2199
epoch 2200
Epoch: 45, Training Loss:  0.06843023747205734
epoch 2201
epoch 2202
epoch 2203
epoch 2204
epoch 2205
epoch 2206
epoch 2207
epoch 2208
epoch 2209
epoch 2210
epoch 2211
epoch 2212
epoch 2213
epoch 2214
epoch 2215
epoch 2216
epoch 2217
epoch 2218
epoch 2219
epoch 2220
epoch 2221
epoch 2222
epoch 2223
epoch 2224
epoch 2225
epoch 2226
epoch 2227
epoch 2228
epoch 2229
epoch 2230
epoch 2231
epoc

epoch 2835
epoch 2836
epoch 2837
epoch 2838
epoch 2839
epoch 2840
epoch 2841
epoch 2842
epoch 2843
epoch 2844
epoch 2845
epoch 2846
epoch 2847
epoch 2848
epoch 2849
epoch 2850
Epoch: 58, Training Loss:  0.05055386573076248
epoch 2851
epoch 2852
epoch 2853
epoch 2854
epoch 2855
epoch 2856
epoch 2857
epoch 2858
epoch 2859
epoch 2860
epoch 2861
epoch 2862
epoch 2863
epoch 2864
epoch 2865
epoch 2866
epoch 2867
epoch 2868
epoch 2869
epoch 2870
epoch 2871
epoch 2872
epoch 2873
epoch 2874
epoch 2875
epoch 2876
epoch 2877
epoch 2878
epoch 2879
epoch 2880
epoch 2881
epoch 2882
epoch 2883
epoch 2884
epoch 2885
epoch 2886
epoch 2887
epoch 2888
epoch 2889
epoch 2890
epoch 2891
epoch 2892
epoch 2893
epoch 2894
epoch 2895
epoch 2896
epoch 2897
epoch 2898
epoch 2899
epoch 2900
Epoch: 59, Training Loss:  0.06373074650764465
epoch 2901
epoch 2902
epoch 2903
epoch 2904
epoch 2905
epoch 2906
epoch 2907
epoch 2908
epoch 2909
epoch 2910
epoch 2911
epoch 2912
epoch 2913
epoch 2914
epoch 2915
epoch 2916
epoc

epoch 3520
epoch 3521
epoch 3522
epoch 3523
epoch 3524
epoch 3525
epoch 3526
epoch 3527
epoch 3528
epoch 3529
epoch 3530
epoch 3531
epoch 3532
epoch 3533
epoch 3534
epoch 3535
epoch 3536
epoch 3537
epoch 3538
epoch 3539
epoch 3540
epoch 3541
epoch 3542
epoch 3543
epoch 3544
epoch 3545
epoch 3546
epoch 3547
epoch 3548
epoch 3549
epoch 3550
Epoch: 72, Training Loss:  0.059134408831596375
epoch 3551
epoch 3552
epoch 3553
epoch 3554
epoch 3555
epoch 3556
epoch 3557
epoch 3558
epoch 3559
epoch 3560
epoch 3561
epoch 3562
epoch 3563
epoch 3564
epoch 3565
epoch 3566
epoch 3567
epoch 3568
epoch 3569
epoch 3570
epoch 3571
epoch 3572
epoch 3573
epoch 3574
epoch 3575
epoch 3576
epoch 3577
epoch 3578
epoch 3579
epoch 3580
epoch 3581
epoch 3582
epoch 3583
epoch 3584
epoch 3585
epoch 3586
epoch 3587
epoch 3588
epoch 3589
epoch 3590
epoch 3591
epoch 3592
epoch 3593
epoch 3594
epoch 3595
epoch 3596
epoch 3597
epoch 3598
epoch 3599
epoch 3600
Epoch: 73, Training Loss:  0.04690486192703247
epoch 3601
epo

epoch 51
epoch 52
epoch 53
epoch 54
epoch 55
epoch 56
epoch 57
epoch 58
epoch 59
epoch 60
epoch 61
epoch 62
epoch 63
epoch 64
epoch 65
epoch 66
epoch 67
epoch 68
epoch 69
epoch 70
epoch 71
epoch 72
epoch 73
epoch 74
epoch 75
epoch 76
epoch 77
epoch 78
epoch 79
epoch 80
epoch 81
epoch 82
epoch 83
epoch 84
epoch 85
epoch 86
epoch 87
epoch 88
epoch 89
epoch 90
epoch 91
epoch 92
epoch 93
epoch 94
epoch 95
epoch 96
epoch 97
epoch 98
epoch 99
epoch 100
Epoch: 4, Training Loss:  0.05828841030597687
epoch 101
epoch 102
epoch 103
epoch 104
epoch 105
epoch 106
epoch 107
epoch 108
epoch 109
epoch 110
epoch 111
epoch 112
epoch 113
epoch 114
epoch 115
epoch 116
epoch 117
epoch 118
epoch 119
epoch 120
epoch 121
epoch 122
epoch 123
epoch 124
epoch 125
epoch 126
epoch 127
epoch 128
epoch 129
epoch 130
epoch 131
epoch 132
epoch 133
epoch 134
epoch 135
epoch 136
epoch 137
epoch 138
epoch 139
epoch 140
epoch 141
epoch 142
epoch 143
epoch 144
epoch 145
epoch 146
epoch 147
epoch 148
epoch 149
epoch 150
Epo

epoch 806
epoch 807
epoch 808
epoch 809
epoch 810
epoch 811
epoch 812
epoch 813
epoch 814
epoch 815
epoch 816
epoch 817
epoch 818
epoch 819
epoch 820
epoch 821
epoch 822
epoch 823
epoch 824
epoch 825
epoch 826
epoch 827
epoch 828
epoch 829
epoch 830
epoch 831
epoch 832
epoch 833
epoch 834
epoch 835
epoch 836
epoch 837
epoch 838
epoch 839
epoch 840
epoch 841
epoch 842
epoch 843
epoch 844
epoch 845
epoch 846
epoch 847
epoch 848
epoch 849
epoch 850
Epoch: 19, Training Loss:  0.047365620732307434
epoch 851
epoch 852
epoch 853
epoch 854
epoch 855
epoch 856
epoch 857
epoch 858
epoch 859
epoch 860
epoch 861
epoch 862
epoch 863
epoch 864
epoch 865
epoch 866
epoch 867
epoch 868
epoch 869
epoch 870
epoch 871
epoch 872
epoch 873
epoch 874
epoch 875
epoch 876
epoch 877
epoch 878
epoch 879
epoch 880
epoch 881
epoch 882
epoch 883
epoch 884
epoch 885
epoch 886
epoch 887
epoch 888
epoch 889
epoch 890
epoch 891
epoch 892
epoch 893
epoch 894
epoch 895
epoch 896
epoch 897
epoch 898
epoch 899
epoch 900
Ep

epoch 1509
epoch 1510
epoch 1511
epoch 1512
epoch 1513
epoch 1514
epoch 1515
epoch 1516
epoch 1517
epoch 1518
epoch 1519
epoch 1520
epoch 1521
epoch 1522
epoch 1523
epoch 1524
epoch 1525
epoch 1526
epoch 1527
epoch 1528
epoch 1529
epoch 1530
epoch 1531
epoch 1532
epoch 1533
epoch 1534
epoch 1535
epoch 1536
epoch 1537
epoch 1538
epoch 1539
epoch 1540
epoch 1541
epoch 1542
epoch 1543
epoch 1544
epoch 1545
epoch 1546
epoch 1547
epoch 1548
epoch 1549
epoch 1550
Epoch: 33, Training Loss:  0.05650481954216957
epoch 1551
epoch 1552
epoch 1553
epoch 1554
epoch 1555
epoch 1556
epoch 1557
epoch 1558
epoch 1559
epoch 1560
epoch 1561
epoch 1562
epoch 1563
epoch 1564
epoch 1565
epoch 1566
epoch 1567
epoch 1568
epoch 1569
epoch 1570
epoch 1571
epoch 1572
epoch 1573
epoch 1574
epoch 1575
epoch 1576
epoch 1577
epoch 1578
epoch 1579
epoch 1580
epoch 1581
epoch 1582
epoch 1583
epoch 1584
epoch 1585
epoch 1586
epoch 1587
epoch 1588
epoch 1589
epoch 1590
epoch 1591
epoch 1592
epoch 1593
epoch 1594
epoch 1

epoch 2198
epoch 2199
epoch 2200
Epoch: 46, Training Loss:  0.0625787004828453
epoch 2201
epoch 2202
epoch 2203
epoch 2204
epoch 2205
epoch 2206
epoch 2207
epoch 2208
epoch 2209
epoch 2210
epoch 2211
epoch 2212
epoch 2213
epoch 2214
epoch 2215
epoch 2216
epoch 2217
epoch 2218
epoch 2219
epoch 2220
epoch 2221
epoch 2222
epoch 2223
epoch 2224
epoch 2225
epoch 2226
epoch 2227
epoch 2228
epoch 2229
epoch 2230
epoch 2231
epoch 2232
epoch 2233
epoch 2234
epoch 2235
epoch 2236
epoch 2237
epoch 2238
epoch 2239
epoch 2240
epoch 2241
epoch 2242
epoch 2243
epoch 2244
epoch 2245
epoch 2246
epoch 2247
epoch 2248
epoch 2249
epoch 2250
Epoch: 47, Training Loss:  0.055704791098833084
epoch 2251
epoch 2252
epoch 2253
epoch 2254
epoch 2255
epoch 2256
epoch 2257
epoch 2258
epoch 2259
epoch 2260
epoch 2261
epoch 2262
epoch 2263
epoch 2264
epoch 2265
epoch 2266
epoch 2267
epoch 2268
epoch 2269
epoch 2270
epoch 2271
epoch 2272
epoch 2273
epoch 2274
epoch 2275
epoch 2276
epoch 2277
epoch 2278
epoch 2279
epoc

epoch 2883
epoch 2884
epoch 2885
epoch 2886
epoch 2887
epoch 2888
epoch 2889
epoch 2890
epoch 2891
epoch 2892
epoch 2893
epoch 2894
epoch 2895
epoch 2896
epoch 2897
epoch 2898
epoch 2899
epoch 2900
Epoch: 60, Training Loss:  0.053233154118061066
epoch 2901
epoch 2902
epoch 2903
epoch 2904
epoch 2905
epoch 2906
epoch 2907
epoch 2908
epoch 2909
epoch 2910
epoch 2911
epoch 2912
epoch 2913
epoch 2914
epoch 2915
epoch 2916
epoch 2917
epoch 2918
epoch 2919
epoch 2920
epoch 2921
epoch 2922
epoch 2923
epoch 2924
epoch 2925
epoch 2926
epoch 2927
epoch 2928
epoch 2929
epoch 2930
epoch 2931
epoch 2932
epoch 2933
epoch 2934
epoch 2935
epoch 2936
epoch 2937
epoch 2938
epoch 2939
epoch 2940
epoch 2941
epoch 2942
epoch 2943
epoch 2944
epoch 2945
epoch 2946
epoch 2947
epoch 2948
epoch 2949
epoch 2950
Epoch: 61, Training Loss:  0.05384096875786781
epoch 2951
epoch 2952
epoch 2953
epoch 2954
epoch 2955
epoch 2956
epoch 2957
epoch 2958
epoch 2959
epoch 2960
epoch 2961
epoch 2962
epoch 2963
epoch 2964
epo

epoch 3568
epoch 3569
epoch 3570
epoch 3571
epoch 3572
epoch 3573
epoch 3574
epoch 3575
epoch 3576
epoch 3577
epoch 3578
epoch 3579
epoch 3580
epoch 3581
epoch 3582
epoch 3583
epoch 3584
epoch 3585
epoch 3586
epoch 3587
epoch 3588
epoch 3589
epoch 3590
epoch 3591
epoch 3592
epoch 3593
epoch 3594
epoch 3595
epoch 3596
epoch 3597
epoch 3598
epoch 3599
epoch 3600
Epoch: 74, Training Loss:  0.036237187683582306
epoch 3601
epoch 3602
epoch 3603
epoch 3604
epoch 3605
epoch 3606
epoch 3607
epoch 3608
epoch 3609
epoch 3610
epoch 3611
epoch 3612
epoch 3613
epoch 3614
epoch 3615
epoch 3616
epoch 3617
epoch 3618
epoch 3619
epoch 3620
epoch 3621
epoch 3622
epoch 3623
epoch 3624
epoch 3625
epoch 3626
epoch 3627
epoch 3628
epoch 3629
epoch 3630
epoch 3631
epoch 3632
epoch 3633
epoch 3634
epoch 3635
epoch 3636
epoch 3637
epoch 3638
epoch 3639
epoch 3640
epoch 3641
epoch 3642
epoch 3643
epoch 3644
epoch 3645
epoch 3646
epoch 3647
epoch 3648
epoch 3649
epoch 3650
Epoch: 75, Training Loss:  0.0444621108

In [27]:
test_df = X_test

In [28]:
train_size = 0.1
test_df = X_test
test_df = test_df.sample(frac=train_size, random_state=200).reset_index(drop = True)
#test_df = test_df.drop(['index'], axis = 1)

In [29]:
test_df.loc[1,test_df.loc[1,:]==1]

ios    1
Name: 1, dtype: object

In [30]:
test_df

Unnamed: 0,sentence_title_bow,python,java,javascript,c#,ios,android,c++,.net,iphone,...,facebook,colors,apache-spark-sql,cookies,loops,core-data,mvvm,dom,architecture,ssh
0,noclassdeffounderror eclipse android problem t...,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,inputaccessoryview docked bottom trying achiev...,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,getting saved instruction pointer address sign...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,create dropdown list mvc3 using entity framewo...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,check string valid windows directory folder pa...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,how store date time timestamps utc time zone j...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
409,how properly create composite primary keys mys...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
410,zend framework doctrine would like start devel...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
411,the problem installing pil using virtualenv bu...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
for i in range(50):
    # testing
    example = test_df['sentence_title_bow'][i]
    encodings = tokenizer.encode_plus(
        example,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    trained_model.eval()
    with torch.no_grad():
        input_ids = encodings['input_ids'].to(device, dtype=torch.long)
        attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
        output = trained_model(input_ids, attention_mask, token_type_ids)
        
        final_output = torch.sigmoid(output).cpu().detach().numpy().ravel().tolist()#.tolist()
        #print(train_df.columns[0:].to_list()[int(np.argmax(final_output, axis=1))])
        #print(final_output)
        dict_from_list = dict(zip(labels,final_output))
        y_pred = list(sorted(dict_from_list.items(),key = lambda t : t[1]))[-4:]
        y_pred2 = [y_pred[i][0] for i in range(len(y_pred))]
        print("Texte test : ")
        print(test_df['sentence_title_bow'][i])
        print("Tags réels")
        print(test_df.loc[i,test_df.loc[i,:]==1].index.tolist())
        print("Tags Prédit : ") 
        print(y_pred2)
        print() 

Texte test : 
noclassdeffounderror eclipse android problem trying run android app adding second external library build path working fine since added scoreninja jar get strong noclassdeffounderror strong try run app. here message pre code 21:45:26.154 error androidruntime 3654 java.lang.noclassdeffounderror com.scoreninja.adapter.scoreninjaadapter code pre build scripts generated android tools sure else cleaning rebuilding restarting eclipse already tried three does anyone know amend
Tags réels
['java', 'android', 'eclipse', 'build']
Tags Prédit : 
['ios', 'javascript', 'java', 'android']

Texte test : 
inputaccessoryview docked bottom trying achieve similar positioning behavior bottom text input bar apple messages app. tried many approaches searched high low many similar questions none satisfactory. specify there code uitoolbar code bottom view the toolbar follow keyboard keyboard appears disappears the toolbar stay top keyboard keyboard visible when keyboard hidden toolbar stays docke

Texte test : 
unexpected results working big integers interpreted languages trying get sum code ... 1000000000 code getting funny results php href= en.wikipedia.org wiki node.js rel= noreferrer node.js strong php strong pre code sum 1000000000 sum printf number_format sum 500000000067108992 code pre strong node.js strong pre code var sum 1000000000 sum console.log sum 500000000067109000 code pre the correct answer calculated using pre code ... code pre correct answer strong 500000000500000000 strong decided try another language. strong strong pre code var sum int64 1000000000 sum fmt.println sum 500000000500000000 code pre but works fine wrong php node.js code perhaps problem interpreted languages works compiled language like would interpreted languages python perl problem
Tags réels
['php', 'node.js']
Tags Prédit : 
['css', 'jquery', 'html', 'javascript']

Texte test : 
free alternative atlassian greenhopper pivotaltracker working brother website idea like use tool plan sprints assign

Texte test : 
what difference mockito.mock someclass mock annotation what difference code mockito.mock class classtomock code method code mock code annotation are for example pre code private testclass test mockito.mock testclass.class code pre pre code mock private testclass test code pre
Tags réels
['java', 'unit-testing']
Tags Prédit : 
['javascript', '.net', 'c#', 'java']

Texte test : 
what purpose static method interface java why static methods supported java what difference two lines main method code pre code package sample public class public static void dosomething system.out.println make something public interface public static void dosomething system.out.println make something public class public static void main string args a.dosomething difference i.dosomething code pre see even implemented what purpose would serve static method interface write static method another class call was introduced purpose modularity and modularity mean following pre code public interface singabl

Texte test : 
hide status bar ios how hide status bar ios this deprecated pre code uiapplication sharedapplication setstatusbarhidden yes code pre
Tags réels
['ios', 'objective-c', 'swift']
Tags Prédit : 
['swift', 'iphone', 'objective-c', 'ios']

Texte test : 
jpa difference joincolumn primarykeyjoincolumn what exact difference code joincolumn code code primarykeyjoincolumn code you use code joincolumn code columns part foreign key typical column could look like e.g join table additional attributes pre code manytoone joincolumn name ... private otherclass code pre what happens promote column a.k.a identifying relationship column must tag code code pre code manytoone joincolumn name ... private otherclass code pre now question are code code code joincolumn code code primarykeyjoincolumn code pre code manytoone primarykeyjoincolumn name ... private otherclass code pre code primarykeyjoincolumn code
Tags réels
['java', 'hibernate', 'jpa', 'jakarta-ee']
Tags Prédit : 
['javascript', 'c#',

## Obtention des valeurs moyenne d'accuracy et F1

In [87]:
from sklearn.metrics import f1_score, accuracy_score
f1_micro_average = 0
accuracy = 0
for i in range(len(test_df)):
    a = test_df.loc[i,test_df.loc[i,:]==1]
    y_true = a.index.tolist()
    example = test_df['sentence_title_bow'][i]
    encodings = tokenizer.encode_plus(
        example,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    trained_model.eval()
    with torch.no_grad():
        input_ids = encodings['input_ids'].to(device, dtype=torch.long)
        attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
        output = trained_model(input_ids, attention_mask, token_type_ids)
        
        final_output = torch.sigmoid(output).cpu().detach().numpy().ravel().tolist()#.tolist()
        #print(train_df.columns[0:].to_list()[int(np.argmax(final_output, axis=1))])
        #print(final_output)
        dict_from_list = dict(zip(labels,final_output))
        if(len(y_true)!=0):
            y_pred = list(sorted(dict_from_list.items(),key = lambda t : t[1]))[-len(y_true):]
            y_pred2 = [y_pred[i][0] for i in range(len(y_pred))]
        else:
            y_pred2= []
    f1_micro_average = f1_micro_average + f1_score(y_true=y_true, y_pred=y_pred2, average='micro')
    accuracy = accuracy + accuracy_score(y_true, y_pred2)
accuracy = accuracy/len(test_df)
f1_micro_average = f1_micro_average/len(test_df)
print(f1_micro_average)
print(accuracy)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type

0.1209443099273608
nan


# BERT - implémentation n°2

In [71]:
import pandas as pd
import torch
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [72]:
X = pd.read_csv("/home/fayz/Documents/OpenClassrooms/Projet7/X.csv")
X = X.rename(columns={"sentence_title_bow" : "text"})
labels = list(X.columns[1:])
X_train_val, X_test = train_test_split(X, test_size=0.15, random_state=42)
X_train, X_val = train_test_split(X_train_val, test_size=0.3, random_state=42)

In [73]:
dataset = Dataset.from_pandas(X_train)

In [74]:
dataset = dataset.train_test_split(0.2)

In [75]:
labels = [label for label in dataset['train'].features.keys() if label not in ['text', '__index_level_0__']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [76]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

loading configuration file config.json from cache at /home/fayz/.cache/huggingface/hub/models--bert-base-uncased/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/fayz/.cache/huggingface/hub/models--bert-base-uncased/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/vocab.txt
loadi

In [77]:
def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=256)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [78]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [79]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [80]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [81]:
tokenizer.decode(example['input_ids'])

'[CLS] objective assign copy retain new objective basic knowledge including concept pointers two basic questions can someone explain difference assign copy retain analogy how handle function returns pointer variable perform messaging return pointer [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [82]:
example['labels']

[0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [83]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['ios', 'objective-c', 'macos', 'memory-management']

In [84]:
encoded_dataset.set_format("torch")

In [85]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

loading configuration file config.json from cache at /home/fayz/.cache/huggingface/hub/models--bert-base-uncased/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "python",
    "1": "java",
    "2": "javascript",
    "3": "c#",
    "4": "ios",
    "5": "android",
    "6": "c++",
    "7": ".net",
    "8": "iphone",
    "9": "objective-c",
    "10": "html",
    "11": "jquery",
    "12": "php",
    "13": "c",
    "14": "linux",
    "15": "css",
    "16": "node.js",
    "17": "sql",
    "18": "performance",
    "19": "swift",
    "20": "xcode",
    "21": "asp.net",
    "22": "spring",
    "23": "windows",
    "24": "asp.net-mvc",
    "25": "mysql",
    "26": "coco

loading weights file pytorch_model.bin from cache at /home/fayz/.cache/huggingface/hub/models--bert-base-uncased/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/pytorch_model.bin
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializ

In [86]:
batch_size = 8
metric_name = "f1"

In [87]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name
    #push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [88]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    print("test2")
    print(predictions)
    print(labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    print("test3")
    print(probs)
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    print("test4")
    print(y_pred)
    print(y_pred.shape)
    # finally, compute metrics
    y_true = labels
    print("test5")
    print(y_true)
    
    f1_micro_average =0
    for i in range(y_pred.shape[0]):
        f1_micro_average = f1_micro_average + f1_score(y_true=y_true[i], y_pred=y_pred[i], average='micro')
    f1_micro_average = f1_micro_average/y_pred.shape[0]
    print("test6")
    print(f1_micro_average)
    #roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = 0
    for i in range(y_pred.shape[0]):
        accuracy = accuracy + accuracy_score(y_true[i], y_pred[i])
    accuracy = accuracy/y_pred.shape[0]
    
    print(accuracy)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               #'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    print("test1")
    print(preds)
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [89]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [90]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  7863, 23911,  6100,  9279,  2047,  7863,  3937,  3716,  2164,
         4145, 20884,  2015,  2048,  3937,  3980,  2064,  2619,  4863,  4489,
        23911,  6100,  9279, 23323,  2129,  5047,  3853,  5651, 20884,  8023,
         4685, 24732,  2709, 20884,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [91]:
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.6959, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.1609, -0.1499, -0.1886,  0.6041,  0.2103,  0.0764,  0.1223,  0.0759,
         -0.0988,  0.1416, -0.1282,  0.0439, -0.4655,  0.1741,  0.4702, -0.1765,
          0.0239,  0.1541, -0.3121, -0.5239,  0.2310,  0.4924,  0.1190, -0.0612,
          0.0958,  0.0170, -0.3113, -0.0164,  0.0423, -0.1785,  0.3607, -0.6185,
          0.4126, -0.1016, -0.1021, -0.1539, -0.1108, -0.8156, -0.0167, -0.6098,
         -0.0199, -0.6714,  0.2848,  0.0660, -0.3356,  0.1391, -0.3312, -0.4969,
          0.1209,  0.0153, -0.1157, -0.0938,  0.0861,  0.1531, -0.0093, -0.1374,
          0.1585, -0.1725, -0.3653,  0.2436, -0.0814, -0.6348,  0.4081, -0.1530,
         -0.0124, -0.0065, -0.4403,  0.4356, -0.2743,  0.2739, -0.2390,  0.0465,
          0.1222, -0.5803, -0.1041,  0.1065, -0.0692, -0.0240, -0.1275,  0.0380,
         -0.3943,  0.1945,  0.1100,  0.3091, -0.0474,  0.0904, -0.3332, -0.5087,
       

In [92]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer
    ,compute_metrics=compute_metrics
)

In [93]:
trainer.train()

***** Running training *****
  Num examples = 13102
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16380
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0587,0.058443,0.987984,0.987984
2,0.0566,0.053911,0.987984,0.987984
3,0.0485,0.045682,0.989527,0.989527
4,0.0414,0.041008,0.989686,0.989686
5,0.0375,0.038274,0.990061,0.990061
6,0.0349,0.036295,0.99052,0.99052
7,0.0328,0.034998,0.990574,0.990574
8,0.0307,0.034176,0.990696,0.990696
9,0.03,0.033643,0.990853,0.990853
10,0.0299,0.033464,0.990867,0.990867


***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-2.0042481 -2.0633707 -2.284382  ... -5.251593  -5.0752726 -5.3852463]
 [-2.014184  -2.0590508 -2.2803688 ... -5.24678   -5.065469  -5.380563 ]
 [-2.0004578 -2.0661583 -2.2856336 ... -5.254985  -5.083724  -5.383062 ]
 ...
 [-2.0052216 -2.0644143 -2.283593  ... -5.2520475 -5.0788527 -5.381599 ]
 [-2.004861  -2.0638552 -2.2831068 ... -5.254961  -5.076467  -5.38079  ]
 [-2.0027997 -2.0635386 -2.2856123 ... -5.2509675 -5.0809727 -5.384317 ]]
test2
[[-2.0042481 -2.0633707 -2.284382  ... -5.251593  -5.0752726 -5.3852463]
 [-2.014184  -2.0590508 -2.2803688 ... -5.24678   -5.065469  -5.380563 ]
 [-2.0004578 -2.0661583 -2.2856336 ... -5.254985  -5.083724  -5.383062 ]
 ...
 [-2.0052216 -2.0644143 -2.283593  ... -5.2520475 -5.0788527 -5.381599 ]
 [-2.004861  -2.0638552 -2.2831068 ... -5.254961  -5.076467  -5.38079  ]
 [-2.0027997 -2.0635386 -2.2856123 ... -5.2509675 -5.0809727 -5.384317 ]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0.

Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-1638
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-1638/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-1638/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-1638/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-1638/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-2.9380383  -2.1178188  -2.5357969  ... -5.137516   -5.126982
  -5.4957356 ]
 [-2.6584258  -2.1486015  -0.99219304 ... -5.2211103  -5.1327095
  -5.242314  ]
 [-3.058804   -2.2475433  -1.8253374  ... -5.5765643  -5.501362
  -5.712489  ]
 ...
 [-1.7541625  -2.7230887  -1.9879395  ... -5.260019   -5.2201147
  -5.2240667 ]
 [-0.6285759  -2.859528   -2.4573364  ... -5.1360826  -5.42455
  -5.341634  ]
 [-3.634196   -0.7385677  -2.23263    ... -5.269      -5.050451
  -5.3396497 ]]
test2
[[-2.9380383  -2.1178188  -2.5357969  ... -5.137516   -5.126982
  -5.4957356 ]
 [-2.6584258  -2.1486015  -0.99219304 ... -5.2211103  -5.1327095
  -5.242314  ]
 [-3.058804   -2.2475433  -1.8253374  ... -5.5765643  -5.501362
  -5.712489  ]
 ...
 [-1.7541625  -2.7230887  -1.9879395  ... -5.260019   -5.2201147
  -5.2240667 ]
 [-0.6285759  -2.859528   -2.4573364  ... -5.1360826  -5.42455
  -5.341634  ]
 [-3.634196   -0.7385677  -2.23263    ... -5.269      -5.050451
  -5.3396497 ]]
[[0. 0. 0. ... 0. 0. 0.]
 

Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-3276
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-3276/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-3276/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-3276/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-3276/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-3.2861223  -2.6053164  -3.9338238  ... -4.8581324  -4.9736824
  -5.230724  ]
 [-3.9507089  -2.8469894   0.51232046 ... -4.866737   -5.4913273
  -5.231877  ]
 [-3.6686506  -3.262334   -0.21147199 ... -5.118901   -5.9083843
  -5.2827573 ]
 ...
 [-2.7606936  -3.4059703  -3.5640595  ... -5.1361074  -5.384358
  -5.0031376 ]
 [ 0.7941739  -3.7657337  -3.7406642  ... -5.5367184  -6.2311044
  -5.706171  ]
 [-4.9005933   1.0103682  -3.3927054  ... -5.1407485  -4.7001085
  -5.2324276 ]]
test2
[[-3.2861223  -2.6053164  -3.9338238  ... -4.8581324  -4.9736824
  -5.230724  ]
 [-3.9507089  -2.8469894   0.51232046 ... -4.866737   -5.4913273
  -5.231877  ]
 [-3.6686506  -3.262334   -0.21147199 ... -5.118901   -5.9083843
  -5.2827573 ]
 ...
 [-2.7606936  -3.4059703  -3.5640595  ... -5.1361074  -5.384358
  -5.0031376 ]
 [ 0.7941739  -3.7657337  -3.7406642  ... -5.5367184  -6.2311044
  -5.706171  ]
 [-4.9005933   1.0103682  -3.3927054  ... -5.1407485  -4.7001085
  -5.2324276 ]]
[[0. 0. 0. ... 0. 

Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-4914
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-4914/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-4914/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-4914/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-4914/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-3.7992167  -3.4781637  -4.2414384  ... -5.187949   -4.7670507
  -5.1191926 ]
 [-4.8274984  -3.2958949   0.9184362  ... -4.478097   -5.6175027
  -5.2022424 ]
 [-4.314894   -3.9979599  -0.63736415 ... -4.9737043  -6.220214
  -5.191996  ]
 ...
 [-3.3271124  -4.101198   -4.261372   ... -5.7210426  -5.736252
  -4.7022777 ]
 [ 1.4680263  -4.1724133  -4.123177   ... -5.9385343  -6.582646
  -5.796789  ]
 [-5.4588423   1.2887492  -3.544028   ... -5.4251475  -4.900267
  -5.3950152 ]]
test2
[[-3.7992167  -3.4781637  -4.2414384  ... -5.187949   -4.7670507
  -5.1191926 ]
 [-4.8274984  -3.2958949   0.9184362  ... -4.478097   -5.6175027
  -5.2022424 ]
 [-4.314894   -3.9979599  -0.63736415 ... -4.9737043  -6.220214
  -5.191996  ]
 ...
 [-3.3271124  -4.101198   -4.261372   ... -5.7210426  -5.736252
  -4.7022777 ]
 [ 1.4680263  -4.1724133  -4.123177   ... -5.9385343  -6.582646
  -5.796789  ]
 [-5.4588423   1.2887492  -3.544028   ... -5.4251475  -4.900267
  -5.3950152 ]]
[[0. 0. 0. ... 0. 0. 0.]

Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-6552
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-6552/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-6552/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-6552/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-6552/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-4.0251217 -3.8972807 -4.625097  ... -5.546585  -4.910903  -5.4583235]
 [-5.1278334 -3.4576097  1.0564224 ... -4.1490836 -5.5280266 -5.436822 ]
 [-4.8155694 -4.426889  -1.0106769 ... -4.843619  -5.7239428 -5.761312 ]
 ...
 [-3.7179744 -4.419923  -4.6763344 ... -5.765224  -5.4688663 -4.375586 ]
 [ 2.023515  -4.3452005 -4.4152317 ... -6.105035  -6.7135935 -6.0922318]
 [-5.678031   1.2759732 -3.2031024 ... -5.07055   -4.7352624 -5.759255 ]]
test2
[[-4.0251217 -3.8972807 -4.625097  ... -5.546585  -4.910903  -5.4583235]
 [-5.1278334 -3.4576097  1.0564224 ... -4.1490836 -5.5280266 -5.436822 ]
 [-4.8155694 -4.426889  -1.0106769 ... -4.843619  -5.7239428 -5.761312 ]
 ...
 [-3.7179744 -4.419923  -4.6763344 ... -5.765224  -5.4688663 -4.375586 ]
 [ 2.023515  -4.3452005 -4.4152317 ... -6.105035  -6.7135935 -6.0922318]
 [-5.678031   1.2759732 -3.2031024 ... -5.07055   -4.7352624 -5.759255 ]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0.

Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-8190
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-8190/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-8190/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-8190/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-8190/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-3.9844658  -4.324794   -4.7929115  ... -5.6895947  -4.5967593
  -5.4405985 ]
 [-4.999656   -3.3227973   1.2260418  ... -3.9181762  -5.162171
  -5.1305857 ]
 [-5.155782   -4.493022   -0.78378516 ... -4.4209538  -5.22129
  -5.640427  ]
 ...
 [-4.3262873  -4.7291656  -4.963439   ... -6.144407   -5.6283703
  -4.367629  ]
 [ 2.4639602  -4.184226   -4.3253255  ... -6.0817285  -6.616683
  -6.083292  ]
 [-5.806903    0.95580465 -3.7671134  ... -5.5877047  -4.9965334
  -6.1684504 ]]
test2
[[-3.9844658  -4.324794   -4.7929115  ... -5.6895947  -4.5967593
  -5.4405985 ]
 [-4.999656   -3.3227973   1.2260418  ... -3.9181762  -5.162171
  -5.1305857 ]
 [-5.155782   -4.493022   -0.78378516 ... -4.4209538  -5.22129
  -5.640427  ]
 ...
 [-4.3262873  -4.7291656  -4.963439   ... -6.144407   -5.6283703
  -4.367629  ]
 [ 2.4639602  -4.184226   -4.3253255  ... -6.0817285  -6.616683
  -6.083292  ]
 [-5.806903    0.95580465 -3.7671134  ... -5.5877047  -4.9965334
  -6.1684504 ]]
[[0. 0. 0. ... 0. 0. 0.]

Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-9828
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-9828/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-9828/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-9828/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-9828/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-3.9533124 -4.192071  -5.2507653 ... -5.876297  -5.0450096 -5.64026  ]
 [-4.953708  -3.6082432  1.5744668 ... -3.8501709 -5.246896  -5.314312 ]
 [-5.1173935 -4.8967104 -1.6416193 ... -4.7639112 -5.5373526 -5.884583 ]
 ...
 [-4.444526  -4.8113813 -5.2271924 ... -6.3782988 -5.764687  -4.4870667]
 [ 2.3695626 -4.5108995 -4.4184346 ... -6.3922405 -6.9531093 -6.468816 ]
 [-5.942346   1.0177653 -3.459211  ... -5.482094  -4.9694877 -6.2555737]]
test2
[[-3.9533124 -4.192071  -5.2507653 ... -5.876297  -5.0450096 -5.64026  ]
 [-4.953708  -3.6082432  1.5744668 ... -3.8501709 -5.246896  -5.314312 ]
 [-5.1173935 -4.8967104 -1.6416193 ... -4.7639112 -5.5373526 -5.884583 ]
 ...
 [-4.444526  -4.8113813 -5.2271924 ... -6.3782988 -5.764687  -4.4870667]
 [ 2.3695626 -4.5108995 -4.4184346 ... -6.3922405 -6.9531093 -6.468816 ]
 [-5.942346   1.0177653 -3.459211  ... -5.482094  -4.9694877 -6.2555737]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0.

Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-11466
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-11466/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-11466/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-11466/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-11466/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-3.9716027 -4.4381332 -5.452963  ... -6.2071686 -5.220328  -5.9036393]
 [-4.9646506 -3.585062   1.6813344 ... -3.730832  -5.0952263 -5.3174505]
 [-5.2893906 -5.1631474 -1.2233869 ... -4.577202  -5.49709   -5.6373906]
 ...
 [-4.620589  -4.9545    -5.373372  ... -6.447201  -5.7985635 -4.422313 ]
 [ 2.9450674 -4.6219926 -4.295802  ... -6.288571  -6.8825297 -6.3590455]
 [-6.085259   0.9947542 -3.9019115 ... -5.7933793 -5.09754   -6.459538 ]]
test2
[[-3.9716027 -4.4381332 -5.452963  ... -6.2071686 -5.220328  -5.9036393]
 [-4.9646506 -3.585062   1.6813344 ... -3.730832  -5.0952263 -5.3174505]
 [-5.2893906 -5.1631474 -1.2233869 ... -4.577202  -5.49709   -5.6373906]
 ...
 [-4.620589  -4.9545    -5.373372  ... -6.447201  -5.7985635 -4.422313 ]
 [ 2.9450674 -4.6219926 -4.295802  ... -6.288571  -6.8825297 -6.3590455]
 [-6.085259   0.9947542 -3.9019115 ... -5.7933793 -5.09754   -6.459538 ]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0.

Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-13104
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-13104/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-13104/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-13104/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-13104/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-3.898176   -4.451516   -5.5049872  ... -6.1410346  -5.0408754
  -5.876754  ]
 [-4.9246774  -3.4578602   1.4864594  ... -3.6848786  -4.838425
  -5.291048  ]
 [-5.2630773  -5.1354113  -1.3144058  ... -4.5789766  -5.312361
  -5.871128  ]
 ...
 [-4.671137   -4.959435   -5.350483   ... -6.480127   -5.731479
  -4.391764  ]
 [ 2.895159   -4.6058354  -4.484947   ... -6.4689455  -7.05696
  -6.580425  ]
 [-6.0596294   0.84713435 -3.6356294  ... -5.628479   -5.0078874
  -6.504194  ]]
test2
[[-3.898176   -4.451516   -5.5049872  ... -6.1410346  -5.0408754
  -5.876754  ]
 [-4.9246774  -3.4578602   1.4864594  ... -3.6848786  -4.838425
  -5.291048  ]
 [-5.2630773  -5.1354113  -1.3144058  ... -4.5789766  -5.312361
  -5.871128  ]
 ...
 [-4.671137   -4.959435   -5.350483   ... -6.480127   -5.731479
  -4.391764  ]
 [ 2.895159   -4.6058354  -4.484947   ... -6.4689455  -7.05696
  -6.580425  ]
 [-6.0596294   0.84713435 -3.6356294  ... -5.628479   -5.0078874
  -6.504194  ]]
[[0. 0. 0. ... 0. 0. 0.]
 

Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-14742
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-14742/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-14742/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-14742/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-14742/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-3.9192286  -4.5322714  -5.5712457  ... -6.250638   -5.126588
  -5.972098  ]
 [-5.0160904  -3.5691764   1.6119667  ... -3.6809626  -4.858189
  -5.298685  ]
 [-5.3636746  -5.2649803  -1.3187525  ... -4.704716   -5.3910055
  -5.922079  ]
 ...
 [-4.773623   -5.029008   -5.391867   ... -6.5292363  -5.7620664
  -4.4125304 ]
 [ 2.9374511  -4.6535344  -4.619891   ... -6.540115   -7.0713544
  -6.57932   ]
 [-6.0569196   0.85379964 -3.6553578  ... -5.6259074  -4.9810567
  -6.496258  ]]
test2
[[-3.9192286  -4.5322714  -5.5712457  ... -6.250638   -5.126588
  -5.972098  ]
 [-5.0160904  -3.5691764   1.6119667  ... -3.6809626  -4.858189
  -5.298685  ]
 [-5.3636746  -5.2649803  -1.3187525  ... -4.704716   -5.3910055
  -5.922079  ]
 ...
 [-4.773623   -5.029008   -5.391867   ... -6.5292363  -5.7620664
  -4.4125304 ]
 [ 2.9374511  -4.6535344  -4.619891   ... -6.540115   -7.0713544
  -6.57932   ]
 [-6.0569196   0.85379964 -3.6553578  ... -5.6259074  -4.9810567
  -6.496258  ]]
[[0. 0. 0. ... 0. 0.

Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-16380
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-16380/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-16380/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-16380/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-16380/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-finetuned-sem_eval-english/checkpoint-16380 (score: 0.990866910866869).


TrainOutput(global_step=16380, training_loss=0.04389947226341536, metrics={'train_runtime': 6031.1037, 'train_samples_per_second': 21.724, 'train_steps_per_second': 2.716, 'total_flos': 1.726704746471424e+16, 'train_loss': 0.04389947226341536, 'epoch': 10.0})

In [94]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-3.9192286  -4.5322714  -5.5712457  ... -6.250638   -5.126588
  -5.972098  ]
 [-5.0160904  -3.5691764   1.6119667  ... -3.6809626  -4.858189
  -5.298685  ]
 [-5.3636746  -5.2649803  -1.3187525  ... -4.704716   -5.3910055
  -5.922079  ]
 ...
 [-4.773623   -5.029008   -5.391867   ... -6.5292363  -5.7620664
  -4.4125304 ]
 [ 2.9374511  -4.6535344  -4.619891   ... -6.540115   -7.0713544
  -6.57932   ]
 [-6.0569196   0.85379964 -3.6553578  ... -5.6259074  -4.9810567
  -6.496258  ]]
test2
[[-3.9192286  -4.5322714  -5.5712457  ... -6.250638   -5.126588
  -5.972098  ]
 [-5.0160904  -3.5691764   1.6119667  ... -3.6809626  -4.858189
  -5.298685  ]
 [-5.3636746  -5.2649803  -1.3187525  ... -4.704716   -5.3910055
  -5.922079  ]
 ...
 [-4.773623   -5.029008   -5.391867   ... -6.5292363  -5.7620664
  -4.4125304 ]
 [ 2.9374511  -4.6535344  -4.619891   ... -6.540115   -7.0713544
  -6.57932   ]
 [-6.0569196   0.85379964 -3.6553578  ... -5.6259074  -4.9810567
  -6.496258  ]]
[[0. 0. 0. ... 0. 0.

{'eval_loss': 0.033464327454566956,
 'eval_f1': 0.990866910866869,
 'eval_accuracy': 0.990866910866869,
 'eval_runtime': 42.7954,
 'eval_samples_per_second': 76.55,
 'eval_steps_per_second': 9.58,
 'epoch': 10.0}

In [95]:
pd.DataFrame(X_test.text).reset_index().iloc[0].text

'node wait async function continue node application use async functions. how waiting asynchronous function complete proceeding rest application flow below simple example. pre code var var async function requires least sec myasyncfunction function data err todo wait async function console.log must return code pre example element code code return must equal application wait async function. thanks'

In [101]:
X_test = X_test.reset_index(drop=True)

In [107]:
for i in range(50):
    text = pd.DataFrame(X_test.text).reset_index().iloc[i].text

    encoding = tokenizer(text, return_tensors="pt", max_length=512)
    encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

    outputs = trainer.model(**encoding)
    logits = outputs.logits

    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())

    predictions = np.zeros(probs.shape)
    predictions[np.where(probs >= 0.5)] = 1

    # turn predicted id's into actual label names
    #print(predictions)
    print(X_test.loc[i, X_test.loc[i,:]==1])
    predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
    print(predicted_labels)
    print()
    print()
    #print(X_test.loc[i, X_test.loc[i,:]==1])

javascript      1
node.js         1
asynchronous    1
async-await     1
Name: 0, dtype: object
[]


swift    1
Name: 1, dtype: object
['ios', 'swift']


ios            1
uitableview    1
Name: 2, dtype: object
['ios']


flutter    1
Name: 3, dtype: object
[]


java          1
hibernate     1
jpa           1
jakarta-ee    1
Name: 4, dtype: object
['java', 'hibernate', 'jpa']


c#           1
.net         1
debugging    1
Name: 5, dtype: object
['c#', '.net']


c++       1
syntax    1
Name: 6, dtype: object
[]


django    1
Name: 7, dtype: object
[]


java      1
json      1
object    1
Name: 8, dtype: object
['java']


haskell                   1
functional-programming    1
Name: 9, dtype: object
[]


android    1
mvvm       1
Name: 10, dtype: object
['android']


Series([], Name: 11, dtype: object)
[]


Series([], Name: 12, dtype: object)
['c++', 'c']


python                 1
django                 1
authentication         1
amazon-web-services    1
Name: 13, dtype: object
['python',

# BERT - Implémentation n°3

In [3]:
import pandas as pd
import torch
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [4]:
X = pd.read_csv("/home/fayz/Documents/OpenClassrooms/Projet7/X2.csv")
X = X.rename(columns={"sentence_title_bow" : "text"})
labels = list(X.columns[1:])
X_train_val, X_test = train_test_split(X, test_size=0.15, random_state=42)
X_train, X_val = train_test_split(X_train_val, test_size=0.3, random_state=42)
dataset = Dataset.from_pandas(X_train)
dataset = dataset.train_test_split(0.2)
labels = [label for label in dataset['train'].features.keys() if label not in ['text', '__index_level_0__']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=256)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
example = encoded_dataset['train'][0]
print(example.keys())
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
tokenizer.decode(example['input_ids'])
encoded_dataset.set_format("torch")
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)
batch_size = 8
metric_name = "f1"
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-second",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name
    #push_to_hub=True,
)
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    print("test2")
    print(predictions)
    print(labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    print("test3")
    print(probs)
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    print("test4")
    print(y_pred)
    print(y_pred.shape)
    # finally, compute metrics
    y_true = labels
    print("test5")
    print(y_true)
    
    f1_micro_average =0
    for i in range(y_pred.shape[0]):
        f1_micro_average = f1_micro_average + f1_score(y_true=y_true[i], y_pred=y_pred[i], average='micro')
    f1_micro_average = f1_micro_average/y_pred.shape[0]
    print("test6")
    print(f1_micro_average)
    #roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = 0
    for i in range(y_pred.shape[0]):
        accuracy = accuracy + accuracy_score(y_true[i], y_pred[i])
    accuracy = accuracy/y_pred.shape[0]
    
    print(accuracy)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               #'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    print("test1")
    print(preds)
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer
    ,compute_metrics=compute_metrics
)
trainer.train()
trainer.evaluate()
X_test = X_test.reset_index(drop=True)

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
cuda


2022-10-13 07:33:23.542153: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-13 07:33:23.625863: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-10-13 07:33:23.644534: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-13 07:33:23.973878: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0868,0.084122,0.980559,0.980559
2,0.0689,0.064037,0.983532,0.983532
3,0.0567,0.054515,0.9846,0.9846
4,0.0479,0.049649,0.985281,0.985281
5,0.0427,0.046157,0.98591,0.98591
6,0.0389,0.044185,0.986386,0.986386
7,0.0354,0.042519,0.986755,0.986755
8,0.033,0.041603,0.987167,0.987167
9,0.0316,0.041138,0.987179,0.987179
10,0.0303,0.040985,0.987253,0.987253


***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-2.188304  -2.0759563 -2.2107937 ... -5.1232347 -4.9441013 -4.885079 ]
 [-2.3051517 -2.0128582 -2.2025166 ... -5.1331406 -4.9496613 -4.8767414]
 [-1.8119067 -2.2853878 -2.2957344 ... -5.095075  -4.8335056 -4.831067 ]
 ...
 [-2.2243586 -1.7400179 -2.0640466 ... -4.9700055 -4.7311535 -4.6921954]
 [-2.0305595 -2.0817332 -2.204231  ... -5.1024585 -4.9035916 -4.847089 ]
 [-2.3443718 -1.896009  -2.1567764 ... -5.080053  -4.889118  -4.820449 ]]
test2
[[-2.188304  -2.0759563 -2.2107937 ... -5.1232347 -4.9441013 -4.885079 ]
 [-2.3051517 -2.0128582 -2.2025166 ... -5.1331406 -4.9496613 -4.8767414]
 [-1.8119067 -2.2853878 -2.2957344 ... -5.095075  -4.8335056 -4.831067 ]
 ...
 [-2.2243586 -1.7400179 -2.0640466 ... -4.9700055 -4.7311535 -4.6921954]
 [-2.0305595 -2.0817332 -2.204231  ... -5.1024585 -4.9035916 -4.847089 ]
 [-2.3443718 -1.896009  -2.1567764 ... -5.080053  -4.889118  -4.820449 ]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0.

Saving model checkpoint to bert-finetuned-second/checkpoint-1638
Configuration saved in bert-finetuned-second/checkpoint-1638/config.json
Model weights saved in bert-finetuned-second/checkpoint-1638/pytorch_model.bin
tokenizer config file saved in bert-finetuned-second/checkpoint-1638/tokenizer_config.json
Special tokens file saved in bert-finetuned-second/checkpoint-1638/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-2.6600022  -3.5825517  -3.4607353  ... -5.27483    -4.834356
  -4.4288473 ]
 [-2.1383498  -3.7265806  -3.7818577  ... -5.511592   -5.0031815
  -4.6210713 ]
 [-0.09245821 -3.664372   -3.7794797  ... -5.1994705  -4.8370585
  -4.292342  ]
 ...
 [-4.2668815  -3.6613119  -0.5060163  ... -5.3736277  -4.9112043
  -5.1201415 ]
 [-2.22265    -2.3950741  -2.2617705  ... -4.4902105  -4.5129232
  -4.5821853 ]
 [-3.4575393  -3.1147616   0.50049824 ... -4.3013144  -4.303338
  -4.6156507 ]]
test2
[[-2.6600022  -3.5825517  -3.4607353  ... -5.27483    -4.834356
  -4.4288473 ]
 [-2.1383498  -3.7265806  -3.7818577  ... -5.511592   -5.0031815
  -4.6210713 ]
 [-0.09245821 -3.664372   -3.7794797  ... -5.1994705  -4.8370585
  -4.292342  ]
 ...
 [-4.2668815  -3.6613119  -0.5060163  ... -5.3736277  -4.9112043
  -5.1201415 ]
 [-2.22265    -2.3950741  -2.2617705  ... -4.4902105  -4.5129232
  -4.5821853 ]
 [-3.4575393  -3.1147616   0.50049824 ... -4.3013144  -4.303338
  -4.6156507 ]]
[[0. 0. 0. ... 0. 0.

Saving model checkpoint to bert-finetuned-second/checkpoint-3276
Configuration saved in bert-finetuned-second/checkpoint-3276/config.json
Model weights saved in bert-finetuned-second/checkpoint-3276/pytorch_model.bin
tokenizer config file saved in bert-finetuned-second/checkpoint-3276/tokenizer_config.json
Special tokens file saved in bert-finetuned-second/checkpoint-3276/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-3.7326825  -3.5638325  -4.20688    ... -5.6573086  -5.0323834
  -4.690791  ]
 [-4.1695385  -3.96164    -4.4995623  ... -6.1983047  -5.6837626
  -5.4406567 ]
 [-1.560123   -3.642217   -4.1247     ... -5.212864   -4.4320354
  -3.6899977 ]
 ...
 [-4.5331535  -3.6287842  -0.11195707 ... -5.1003156  -4.4101105
  -5.2111    ]
 [-3.2296224  -3.053524   -2.8926275  ... -4.9670753  -4.6011467
  -4.847928  ]
 [-3.7056592  -3.4702783  -0.01600251 ... -4.656677   -4.57586
  -4.8078475 ]]
test2
[[-3.7326825  -3.5638325  -4.20688    ... -5.6573086  -5.0323834
  -4.690791  ]
 [-4.1695385  -3.96164    -4.4995623  ... -6.1983047  -5.6837626
  -5.4406567 ]
 [-1.560123   -3.642217   -4.1247     ... -5.212864   -4.4320354
  -3.6899977 ]
 ...
 [-4.5331535  -3.6287842  -0.11195707 ... -5.1003156  -4.4101105
  -5.2111    ]
 [-3.2296224  -3.053524   -2.8926275  ... -4.9670753  -4.6011467
  -4.847928  ]
 [-3.7056592  -3.4702783  -0.01600251 ... -4.656677   -4.57586
  -4.8078475 ]]
[[0. 0. 0. ... 0. 0.

Saving model checkpoint to bert-finetuned-second/checkpoint-4914
Configuration saved in bert-finetuned-second/checkpoint-4914/config.json
Model weights saved in bert-finetuned-second/checkpoint-4914/pytorch_model.bin
tokenizer config file saved in bert-finetuned-second/checkpoint-4914/tokenizer_config.json
Special tokens file saved in bert-finetuned-second/checkpoint-4914/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-3.9532826  -3.7106323  -4.697054   ... -6.446871   -5.877095
  -5.8158727 ]
 [-4.2007732  -3.9348013  -4.196821   ... -6.443411   -5.636328
  -5.678164  ]
 [-2.2228284  -3.3706405  -3.8392563  ... -5.254287   -4.449086
  -4.106633  ]
 ...
 [-4.8614984  -4.042458   -0.10935406 ... -5.1759663  -4.7295947
  -5.772315  ]
 [-3.8572776  -3.7833815  -3.6079516  ... -5.770257   -5.319052
  -5.676725  ]
 [-4.2430086  -3.2035666   0.15232973 ... -4.724476   -4.533152
  -5.1393332 ]]
test2
[[-3.9532826  -3.7106323  -4.697054   ... -6.446871   -5.877095
  -5.8158727 ]
 [-4.2007732  -3.9348013  -4.196821   ... -6.443411   -5.636328
  -5.678164  ]
 [-2.2228284  -3.3706405  -3.8392563  ... -5.254287   -4.449086
  -4.106633  ]
 ...
 [-4.8614984  -4.042458   -0.10935406 ... -5.1759663  -4.7295947
  -5.772315  ]
 [-3.8572776  -3.7833815  -3.6079516  ... -5.770257   -5.319052
  -5.676725  ]
 [-4.2430086  -3.2035666   0.15232973 ... -4.724476   -4.533152
  -5.1393332 ]]
[[0. 0. 0. ... 0. 0. 0.]
 

Saving model checkpoint to bert-finetuned-second/checkpoint-6552
Configuration saved in bert-finetuned-second/checkpoint-6552/config.json
Model weights saved in bert-finetuned-second/checkpoint-6552/pytorch_model.bin
tokenizer config file saved in bert-finetuned-second/checkpoint-6552/tokenizer_config.json
Special tokens file saved in bert-finetuned-second/checkpoint-6552/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-5.133205   -2.81755    -5.0924478  ... -5.872308   -5.2421513
  -5.6613464 ]
 [-4.596967   -3.3024879  -4.522616   ... -6.5559807  -5.5796547
  -6.1374693 ]
 [-1.8829858  -2.1895034  -4.2977138  ... -4.843555   -3.5924325
  -3.6834145 ]
 ...
 [-5.302846   -3.8596916  -0.70869446 ... -5.533879   -5.129046
  -6.308764  ]
 [-4.3509827  -4.0221424  -4.093562   ... -6.467214   -5.6496105
  -6.238952  ]
 [-4.4770613  -2.3262954   0.33765718 ... -5.007326   -4.704985
  -5.45106   ]]
test2
[[-5.133205   -2.81755    -5.0924478  ... -5.872308   -5.2421513
  -5.6613464 ]
 [-4.596967   -3.3024879  -4.522616   ... -6.5559807  -5.5796547
  -6.1374693 ]
 [-1.8829858  -2.1895034  -4.2977138  ... -4.843555   -3.5924325
  -3.6834145 ]
 ...
 [-5.302846   -3.8596916  -0.70869446 ... -5.533879   -5.129046
  -6.308764  ]
 [-4.3509827  -4.0221424  -4.093562   ... -6.467214   -5.6496105
  -6.238952  ]
 [-4.4770613  -2.3262954   0.33765718 ... -5.007326   -4.704985
  -5.45106   ]]
[[0. 0. 0. ... 0. 0.

Saving model checkpoint to bert-finetuned-second/checkpoint-8190
Configuration saved in bert-finetuned-second/checkpoint-8190/config.json
Model weights saved in bert-finetuned-second/checkpoint-8190/pytorch_model.bin
tokenizer config file saved in bert-finetuned-second/checkpoint-8190/tokenizer_config.json
Special tokens file saved in bert-finetuned-second/checkpoint-8190/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-4.8111615  -3.8128097  -5.5574584  ... -6.3094025  -5.6092324
  -5.9964623 ]
 [-4.749241   -3.8066552  -4.376748   ... -6.6654224  -5.5548644
  -6.2444096 ]
 [-2.4761596  -2.8368177  -3.5435348  ... -5.495094   -3.6853333
  -4.2212787 ]
 ...
 [-5.4365125  -4.2101583  -0.86519104 ... -5.699424   -5.327427
  -6.5273404 ]
 [-4.4247417  -4.2686877  -3.9982722  ... -6.7972026  -5.715139
  -6.454821  ]
 [-4.030051   -2.8826056  -0.28023744 ... -5.3810606  -4.8390365
  -5.384718  ]]
test2
[[-4.8111615  -3.8128097  -5.5574584  ... -6.3094025  -5.6092324
  -5.9964623 ]
 [-4.749241   -3.8066552  -4.376748   ... -6.6654224  -5.5548644
  -6.2444096 ]
 [-2.4761596  -2.8368177  -3.5435348  ... -5.495094   -3.6853333
  -4.2212787 ]
 ...
 [-5.4365125  -4.2101583  -0.86519104 ... -5.699424   -5.327427
  -6.5273404 ]
 [-4.4247417  -4.2686877  -3.9982722  ... -6.7972026  -5.715139
  -6.454821  ]
 [-4.030051   -2.8826056  -0.28023744 ... -5.3810606  -4.8390365
  -5.384718  ]]
[[0. 0. 0. ... 0. 0.

Saving model checkpoint to bert-finetuned-second/checkpoint-9828
Configuration saved in bert-finetuned-second/checkpoint-9828/config.json
Model weights saved in bert-finetuned-second/checkpoint-9828/pytorch_model.bin
tokenizer config file saved in bert-finetuned-second/checkpoint-9828/tokenizer_config.json
Special tokens file saved in bert-finetuned-second/checkpoint-9828/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-5.908071  -4.1025715 -5.071086  ... -6.597125  -5.888297  -6.4872456]
 [-5.563974  -4.0993986 -4.367524  ... -7.0730715 -6.0291142 -6.8637486]
 [-2.7808833 -2.796422  -3.7248805 ... -5.6583247 -3.6116557 -4.4050956]
 ...
 [-5.5336986 -4.7855883  0.5899701 ... -5.8053484 -5.367823  -6.518194 ]
 [-4.7393937 -4.7982163 -4.355467  ... -7.3084946 -6.1300316 -6.696937 ]
 [-4.5026283 -2.3659434 -0.5865086 ... -5.4501705 -4.9576283 -5.516382 ]]
test2
[[-5.908071  -4.1025715 -5.071086  ... -6.597125  -5.888297  -6.4872456]
 [-5.563974  -4.0993986 -4.367524  ... -7.0730715 -6.0291142 -6.8637486]
 [-2.7808833 -2.796422  -3.7248805 ... -5.6583247 -3.6116557 -4.4050956]
 ...
 [-5.5336986 -4.7855883  0.5899701 ... -5.8053484 -5.367823  -6.518194 ]
 [-4.7393937 -4.7982163 -4.355467  ... -7.3084946 -6.1300316 -6.696937 ]
 [-4.5026283 -2.3659434 -0.5865086 ... -5.4501705 -4.9576283 -5.516382 ]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0.

Saving model checkpoint to bert-finetuned-second/checkpoint-11466
Configuration saved in bert-finetuned-second/checkpoint-11466/config.json
Model weights saved in bert-finetuned-second/checkpoint-11466/pytorch_model.bin
tokenizer config file saved in bert-finetuned-second/checkpoint-11466/tokenizer_config.json
Special tokens file saved in bert-finetuned-second/checkpoint-11466/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-6.1570983  -4.3564024  -5.654873   ... -6.770038   -6.2068477
  -6.763908  ]
 [-5.6313562  -4.138797   -4.334113   ... -7.0967684  -6.138
  -6.943718  ]
 [-2.6231425  -3.0323727  -3.8323212  ... -6.0179415  -3.6500635
  -4.533693  ]
 ...
 [-5.75302    -4.7564635  -0.33810982 ... -6.220651   -5.8570557
  -7.0301228 ]
 [-4.7693653  -4.6439667  -4.855344   ... -7.4189286  -6.1501527
  -6.9127364 ]
 [-4.522553   -2.316187   -0.47754866 ... -5.4174523  -4.9046426
  -5.539687  ]]
test2
[[-6.1570983  -4.3564024  -5.654873   ... -6.770038   -6.2068477
  -6.763908  ]
 [-5.6313562  -4.138797   -4.334113   ... -7.0967684  -6.138
  -6.943718  ]
 [-2.6231425  -3.0323727  -3.8323212  ... -6.0179415  -3.6500635
  -4.533693  ]
 ...
 [-5.75302    -4.7564635  -0.33810982 ... -6.220651   -5.8570557
  -7.0301228 ]
 [-4.7693653  -4.6439667  -4.855344   ... -7.4189286  -6.1501527
  -6.9127364 ]
 [-4.522553   -2.316187   -0.47754866 ... -5.4174523  -4.9046426
  -5.539687  ]]
[[0. 0. 0. ... 0. 0. 0.]

Saving model checkpoint to bert-finetuned-second/checkpoint-13104
Configuration saved in bert-finetuned-second/checkpoint-13104/config.json
Model weights saved in bert-finetuned-second/checkpoint-13104/pytorch_model.bin
tokenizer config file saved in bert-finetuned-second/checkpoint-13104/tokenizer_config.json
Special tokens file saved in bert-finetuned-second/checkpoint-13104/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-6.31243    -3.7825077  -5.2972913  ... -6.686558   -5.8837523
  -6.695596  ]
 [-5.6456313  -3.6578755  -4.3503585  ... -7.131322   -6.01491
  -7.0055156 ]
 [-2.4897864  -2.6140158  -3.5987754  ... -5.824583   -3.2783368
  -4.3792377 ]
 ...
 [-5.8309727  -4.922526   -0.12283342 ... -6.27171    -5.773381
  -7.000118  ]
 [-4.8595123  -4.747023   -4.9404793  ... -7.5786157  -6.255064
  -7.0619855 ]
 [-4.397893   -2.1538043  -0.6808553  ... -5.679964   -5.0364795
  -5.475764  ]]
test2
[[-6.31243    -3.7825077  -5.2972913  ... -6.686558   -5.8837523
  -6.695596  ]
 [-5.6456313  -3.6578755  -4.3503585  ... -7.131322   -6.01491
  -7.0055156 ]
 [-2.4897864  -2.6140158  -3.5987754  ... -5.824583   -3.2783368
  -4.3792377 ]
 ...
 [-5.8309727  -4.922526   -0.12283342 ... -6.27171    -5.773381
  -7.000118  ]
 [-4.8595123  -4.747023   -4.9404793  ... -7.5786157  -6.255064
  -7.0619855 ]
 [-4.397893   -2.1538043  -0.6808553  ... -5.679964   -5.0364795
  -5.475764  ]]
[[0. 0. 0. ... 0. 0. 0.]

Saving model checkpoint to bert-finetuned-second/checkpoint-14742
Configuration saved in bert-finetuned-second/checkpoint-14742/config.json
Model weights saved in bert-finetuned-second/checkpoint-14742/pytorch_model.bin
tokenizer config file saved in bert-finetuned-second/checkpoint-14742/tokenizer_config.json
Special tokens file saved in bert-finetuned-second/checkpoint-14742/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-6.3556743  -4.0085826  -5.4161654  ... -6.7050195  -5.991133
  -6.727547  ]
 [-5.6983194  -3.8773212  -4.226888   ... -7.195786   -6.134214
  -7.054529  ]
 [-2.648206   -2.800273   -3.6320305  ... -5.9218383  -3.4048064
  -4.47765   ]
 ...
 [-5.766661   -4.9889884   0.08357094 ... -6.259904   -5.8228054
  -6.9923306 ]
 [-4.906903   -4.848342   -4.9257865  ... -7.6521077  -6.3684545
  -7.1244416 ]
 [-4.277033   -2.1440275  -0.4982259  ... -5.542861   -4.947903
  -5.3426075 ]]
test2
[[-6.3556743  -4.0085826  -5.4161654  ... -6.7050195  -5.991133
  -6.727547  ]
 [-5.6983194  -3.8773212  -4.226888   ... -7.195786   -6.134214
  -7.054529  ]
 [-2.648206   -2.800273   -3.6320305  ... -5.9218383  -3.4048064
  -4.47765   ]
 ...
 [-5.766661   -4.9889884   0.08357094 ... -6.259904   -5.8228054
  -6.9923306 ]
 [-4.906903   -4.848342   -4.9257865  ... -7.6521077  -6.3684545
  -7.1244416 ]
 [-4.277033   -2.1440275  -0.4982259  ... -5.542861   -4.947903
  -5.3426075 ]]
[[0. 0. 0. ... 0. 0. 0

Saving model checkpoint to bert-finetuned-second/checkpoint-16380
Configuration saved in bert-finetuned-second/checkpoint-16380/config.json
Model weights saved in bert-finetuned-second/checkpoint-16380/pytorch_model.bin
tokenizer config file saved in bert-finetuned-second/checkpoint-16380/tokenizer_config.json
Special tokens file saved in bert-finetuned-second/checkpoint-16380/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-finetuned-second/checkpoint-16380 (score: 0.9872527472527194).
***** Running Evaluation *****
  Num examples = 3276
  Batch size = 8


test1
[[-6.3556743  -4.0085826  -5.4161654  ... -6.7050195  -5.991133
  -6.727547  ]
 [-5.6983194  -3.8773212  -4.226888   ... -7.195786   -6.134214
  -7.054529  ]
 [-2.648206   -2.800273   -3.6320305  ... -5.9218383  -3.4048064
  -4.47765   ]
 ...
 [-5.766661   -4.9889884   0.08357094 ... -6.259904   -5.8228054
  -6.9923306 ]
 [-4.906903   -4.848342   -4.9257865  ... -7.6521077  -6.3684545
  -7.1244416 ]
 [-4.277033   -2.1440275  -0.4982259  ... -5.542861   -4.947903
  -5.3426075 ]]
test2
[[-6.3556743  -4.0085826  -5.4161654  ... -6.7050195  -5.991133
  -6.727547  ]
 [-5.6983194  -3.8773212  -4.226888   ... -7.195786   -6.134214
  -7.054529  ]
 [-2.648206   -2.800273   -3.6320305  ... -5.9218383  -3.4048064
  -4.47765   ]
 ...
 [-5.766661   -4.9889884   0.08357094 ... -6.259904   -5.8228054
  -6.9923306 ]
 [-4.906903   -4.848342   -4.9257865  ... -7.6521077  -6.3684545
  -7.1244416 ]
 [-4.277033   -2.1440275  -0.4982259  ... -5.542861   -4.947903
  -5.3426075 ]]
[[0. 0. 0. ... 0. 0. 0

In [23]:
for i in range(50):
    text = pd.DataFrame(X_test.text).reset_index().iloc[i].text

    encoding = tokenizer(text, return_tensors="pt", max_length=512)
    encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

    outputs = trainer.model(**encoding)
    logits = outputs.logits

    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())

    predictions = np.zeros(probs.shape)
    predictions[np.where(probs >= 0.1)] = 1

    # turn predicted id's into actual label names
    #print(predictions)
    print(X_test.text[i])
    print(X_test.loc[i, X_test.loc[i,:]==1])
    predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
    print(predicted_labels)
    print()
    print()
    #print(X_test.loc[i, X_test.loc[i,:]==1])

NameError: name 'X_test' is not defined

# Code utile si besoin de charger le modèle dans un fichier externe

In [79]:
tokenizer = AutoTokenizer.from_pretrained("/home/fayz/Documents/OpenClassrooms/Projet7/bert-finetuned-second/checkpoint-16380/")

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [80]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=100)
                                                           #id2label=id2label,
                                                           #label2id=label2id)

loading configuration file config.json from cache at /home/fayz/.cache/huggingface/hub/models--bert-base-uncased/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "2

In [81]:
model.load_state_dict(torch.load("/home/fayz/Documents/OpenClassrooms/Projet7/bert-finetuned-second/checkpoint-16380/pytorch_model.bin"))

<All keys matched successfully>

In [85]:
X = pd.read_csv("/home/fayz/Documents/OpenClassrooms/Projet7/X2.csv")
X = X.rename(columns={"sentence_title_bow" : "text"})
labels = list(X.columns[1:])
X_train_val, X_test = train_test_split(X, test_size=0.15, random_state=42)
X_train, X_val = train_test_split(X_train_val, test_size=0.3, random_state=42)
dataset = Dataset.from_pandas(X_train)
dataset = dataset.train_test_split(0.2)
labels = [label for label in dataset['train'].features.keys() if label not in ['text', '__index_level_0__']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [84]:
text = X.text[123]

encoding = tokenizer(text, return_tensors="pt", max_length=512)
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)
logits = outputs.logits

sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())

predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
    
print(X.loc[123, X.loc[123,:]==1])
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

java    1
Name: 123, dtype: object
['java']
