In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm import tqdm

import torch
import torch.nn.functional as F
import torch.nn as nn

from transformers import AutoTokenizer, DistilBertModel
from transformers import AdamW


In [2]:
# Select the compute device once: the GPU when CUDA is usable, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  # List the name of every CUDA device visible to this process.
  for gpu_index in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(gpu_index))
else:
  print("You are running on CPU")

NVIDIA GeForce GTX 1050 Ti


In [3]:
# Upper bound on the number of rows kept for this experiment.
SAMPLE_SIZE = 100000
# Fixed seed so that Restart & Run All draws the same subset every time.
SAMPLE_SEED = 42

# Load the review data and drop every row that has a missing value.
# NOTE(review): the file name suggests punctuation and stop words were already
# removed upstream — confirm against the preprocessing step.
df = pd.read_csv("../Datasets/Cuvinte-Eliminate/train-punct-stop-1000.csv")
df = df.dropna()

# DataFrame.sample (without replacement) raises when n exceeds the number of
# rows, so cap the request at what is actually available after dropna().
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

In [4]:
# Preview the first five rows of the sampled frame.
df.head(n=5)

Unnamed: 0,sentiment,text
332584,2,great inspiring book authors show wonderfully ...
2347926,0,sorry pretty messed release theatrical edition...
1723728,0,big blues fan since 8 yrs old excited first he...
857228,2,straight forward install use no surprises tigh...
116169,2,hands no better live act rock today cannot get...


In [5]:
# Remap the label value 2 to 1; every other label value is left unchanged.
# NOTE(review): the preview above only shows labels 0 and 2, so this presumably
# produces a binary 0/1 target — confirm that no other label values exist.
df['sentiment'] = df['sentiment'].replace({2: 1})

In [6]:
# Preview the first five rows again to check the remapped 'sentiment' values.
df.head(n=5)

Unnamed: 0,sentiment,text
332584,1,great inspiring book authors show wonderfully ...
2347926,0,sorry pretty messed release theatrical edition...
1723728,0,big blues fan since 8 yrs old excited first he...
857228,1,straight forward install use no surprises tigh...
116169,1,hands no better live act rock today cannot get...


In [7]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(example):
    """Tokenize one text, or a list of texts, with the module-level tokenizer.

    Args:
        example: A string or a list of strings; it is forwarded unchanged to
            the tokenizer.

    Returns:
        The tuple ``(input_ids, attention_mask)`` taken from the tokenizer
        output. Every sequence is padded to the tokenizer's maximum length and
        truncated when longer.
    """
    tok = tokenizer(example, padding="max_length", truncation=True)
    return tok['input_ids'], tok['attention_mask']

# Encode the whole column in a single batched call rather than once per row;
# the per-example result is the same because padding is to a fixed max length.
all_input_ids, all_attention_masks = tokenize_function(df['text'].tolist())
# Wrap in Series aligned on df's index so each cell holds one list of token ids.
df['input_ids'] = pd.Series(all_input_ids, index=df.index)
df['attention_mask'] = pd.Series(all_attention_masks, index=df.index)

# Fixed seed so the train/test partition is the same on every run.
SPLIT_SEED = 42
df_train, df_test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

In [8]:
# Preview the first five rows, now including the token id and attention mask columns.
df.head(n=5)

Unnamed: 0,sentiment,text,input_ids,attention_mask
332584,1,great inspiring book authors show wonderfully ...,"[101, 2307, 18988, 2338, 6048, 2265, 6919, 213...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2347926,0,sorry pretty messed release theatrical edition...,"[101, 3374, 3492, 18358, 2713, 8900, 3179, 202...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1723728,0,big blues fan since 8 yrs old excited first he...,"[101, 2502, 5132, 5470, 2144, 1022, 1061, 2869...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
857228,1,straight forward install use no surprises tigh...,"[101, 3442, 2830, 16500, 2224, 2053, 20096, 43...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
116169,1,hands no better live act rock today cannot get...,"[101, 2398, 2053, 2488, 2444, 2552, 2600, 2651...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [9]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that serves the rows of a tokenized DataFrame as tensors.

    Every row read by ``__getitem__`` must provide the keys 'input_ids',
    'attention_mask' and 'sentiment'.
    """

    def __init__(self, dataframe):
        # Only a reference is stored; rows are converted to tensors on access.
        self.df = dataframe

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the row at position ``idx`` as a dict of tensors on ``device``."""
        row = self.df.iloc[idx]
        sample = {}
        # (key in the returned dict, column read from the row); the label column
        # 'sentiment' is exposed under the key 'labels'.
        for key, column in (
            ('input_ids', 'input_ids'),
            ('attention_mask', 'attention_mask'),
            ('labels', 'sentiment'),
        ):
            sample[key] = torch.tensor(row[column]).to(device)
        return sample

# Wrap the train and test frames so that DataLoaders can index them.
train_set, test_set = Dataset(df_train), Dataset(df_test)

In [None]:
class Classifier(nn.Module):
    """DistilBERT encoder followed by a single linear classification head.

    Args:
        out_feat: Number of output classes (output width of the linear head).
    """

    def __init__(self, out_feat=2):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # 768 is the input width of the head.
        # NOTE(review): presumably the hidden size of distilbert-base-uncased — confirm
        # if the checkpoint is ever changed.
        self.cls = nn.Linear(768, out_feat)

    def forward(self, input_ids, attention_mask):
        """Return the raw class logits for a batch.

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Attention mask, forwarded to the encoder.

        Returns:
            Tensor with one row per example and ``out_feat`` unnormalized scores.
            The scores are deliberately NOT passed through softmax:
            ``nn.CrossEntropyLoss`` applies log-softmax itself, so normalizing
            here would apply softmax twice and flatten the gradients. The argmax
            of the logits equals the argmax of the probabilities; call
            ``F.softmax(logits, dim=1)`` at the call site when probabilities
            are needed.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
        )

        # Hidden state of the first token of every sequence: [batch, token, hidden] -> [batch, hidden].
        pooled_output = outputs.last_hidden_state[:, 0, :]
        return self.cls(pooled_output)

    def freeze_until_layer(self, n):
        """Freeze every parameter listed before layer ``n`` in ``named_parameters()``.

        Args:
            n: When an integer, parameters are frozen until the first name that
                contains ``.layer.<n>.``. Matching the full path component keeps
                names such as ``lin1`` / ``lin2`` from being mistaken for layer
                1 / 2. Any other value keeps the plain substring match on
                ``str(n)``, so a module name can still be passed.

        If no parameter name matches, every parameter ends up frozen.
        """
        if isinstance(n, (int, np.integer)):
            marker = '.layer.{}.'.format(n)
        else:
            marker = str(n)

        for name, param in self.named_parameters():
            if marker in name:
                break
            param.requires_grad = False

    def print_layers(self):
        """Print one line per parameter (trainable flag, size, name) and a summary."""
        total_nr_w = 0
        trainable_nr_w = 0
        for name, param in self.named_parameters():
            # numel() is an exact Python int, also for 0-dim parameters.
            nr_w = param.numel()
            total_nr_w += nr_w
            if param.requires_grad:
                trainable_nr_w += nr_w
            print('{}\t{}\t\t\t{}'.format(param.requires_grad, nr_w, name))
        # Guard the percentage against a model without parameters.
        trainable_pct = trainable_nr_w / total_nr_w * 100 if total_nr_w else 0.0
        print('The network has {} parameters, out of which {} ({:.1f}%) are trainable.'.format(total_nr_w, trainable_nr_w, trainable_pct))

# Create the two-class model instance and move its parameters to the selected device.
model = Classifier(out_feat=2)
model = model.to(device)
print(model)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Freeze the parameters that come before layer 5, then list which ones stay trainable.
model.freeze_until_layer(n=5)
model.print_layers()

In [None]:
def evaluate(model, loader=None):
  """Run `model` over a data loader and print the F1 score of its predictions.

  Args:
      model: Module called as ``model(input_ids=..., attention_mask=...)`` that
          returns one score per class for every example in the batch.
      loader: Sized iterable of batches, each a dict with the keys 'input_ids',
          'attention_mask' and 'labels'. Defaults to the module-level
          ``test_loader``.

  The model is switched to eval mode for the pass and its previous mode is
  restored afterwards, so calling this between training epochs does not leave
  the model in eval mode for the rest of training.
  """
  if loader is None:
    loader = test_loader

  # Accumulators for the predicted and the true class of every example.
  eval_outputs = []
  true_labels = []

  # Remember the current mode, then switch to evaluation mode.
  was_training = model.training
  model.eval()

  ########### Evaluation Loop #############
  try:
    with torch.no_grad():
      for batch in tqdm(loader, total=len(loader)):
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

        # Predicted class = index of the highest score; move to the CPU before listing.
        predictions = outputs.argmax(dim=1).cpu()

        eval_outputs += predictions.tolist()
        true_labels += batch['labels'].tolist()
  finally:
    # Restore whichever mode the caller had, even if the loop raised.
    model.train(was_training)

  f1 = metrics.f1_score(true_labels, eval_outputs)
  print("F1: {}".format(f1) , end =" ")

In [None]:
LEARNING_RATE = 5e-5 # Learning rate
NR_EPOCHS = 8 # Maximum number of epochs
BATCH_SIZE = 32 # Number of samples per batch
MIN_LOSS_IMPROVEMENT = 0.01 # Stop once the mean train loss improves by less than this between epochs

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# (epoch, mean training loss) pairs collected during training
log_info = []

# DataLoader for training
train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

# DataLoader for validation (same batch size as training, fixed order)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)


########### Training Loop #############


# np.inf instead of np.Inf: the capitalised alias was removed in NumPy 2.0.
min_epoch_loss = np.inf
last_epoch_loss = np.inf
# One epoch = one pass over the whole training set
for epoch in range(NR_EPOCHS):
    print('Running epoch {}'.format(epoch), end =" ")

    # evaluate() calls model.eval(); re-enter training mode at the start of every
    # epoch so that later epochs do not train with the model still in eval mode.
    model.train()

    epoch_losses = []
    # One optimisation step per batch of BATCH_SIZE examples
    for batch in tqdm(train_loader, total=len(train_loader)):

        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        # FORWARD PASS: run the inputs through the network
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

        # LOSS between the model outputs and the true labels
        loss = criterion(outputs, batch['labels'])

        # BACKPROPAGATION: compute the gradients of the loss
        loss.backward()

        # Let the optimizer update the parameters from the gradients
        optimizer.step()

        # Keep the loss value of this batch for the epoch average
        epoch_losses.append(loss.item())

    this_epoch_loss = np.mean(epoch_losses)
    log_info.append((epoch, this_epoch_loss))

    # Checkpoint whenever the mean train loss is the lowest seen so far.
    # NOTE(review): this pickles the whole module; the file name is kept exactly
    # as it was because other code may load the checkpoint by this name.
    if this_epoch_loss <= min_epoch_loss:
        min_epoch_loss = this_epoch_loss
        print("Saving model with train loss: {}".format(this_epoch_loss), end =" ")
        torch.save(model, "classfication_model.pt")

    evaluate(model)

    # Early stopping on the train loss. In the first epoch the previous loss is
    # inf, so the difference is inf and the check cannot trigger.
    if last_epoch_loss - this_epoch_loss < MIN_LOSS_IMPROVEMENT:
        print("Early Stopping!")
        break

    last_epoch_loss = this_epoch_loss

In [None]:
# Split the (epoch, mean training loss) pairs logged during training into two series.
X = [entry[0] for entry in log_info]
Y = [entry[1] for entry in log_info]

# Draw the loss curve on the current axes (created on demand).
ax = plt.gca()
ax.plot(X, Y)
ax.set_xlabel("Epoch")
ax.set_ylabel("LOSS")
plt.show()

In [None]:
# Print the F1 score of the trained model on the test loader.
evaluate(model=model)