In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm import tqdm

import torch
import torch.nn.functional as F
import torch.nn as nn

from transformers import AutoTokenizer, DistilBertModel
from transformers import AdamW


In [2]:
# Fixed seed so the evaluation subset (and therefore the reported F1) is the
# same on every Restart & Run All.
SEED = 42
# Upper bound on the number of rows evaluated.
N_SAMPLES = 10000

df = pd.read_csv("../Datasets/Cuvinte-Eliminate/test-punct-stop-1000.csv")
# Drop rows with any missing value before sampling.
df = df.dropna()
# Cap the sample size at the number of available rows: DataFrame.sample raises
# when asked for more rows than exist (sampling without replacement).
df = df.sample(n=min(N_SAMPLES, len(df)), random_state=SEED)

In [3]:
# Pick the compute device once and report what was found: every visible GPU
# name when CUDA is available, otherwise a CPU notice.
if torch.cuda.is_available():
    device = torch.device("cuda")
    for gpu_index in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name(gpu_index))
else:
    device = torch.device("cpu")
    print("You are running on CPU")

NVIDIA GeForce GTX 1050 Ti


In [4]:
# Rewrite sentiment label 2 as label 1; all other values are left untouched.
# NOTE(review): presumably this collapses the labels to a binary task (the
# F1 computed later uses the default binary averaging) - confirm against the
# dataset's label scheme.
df['sentiment'] = df['sentiment'].replace(to_replace=2, value=1)

In [5]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(example):
    """Tokenize one text and return ``(input_ids, attention_mask)``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``, so every returned sequence has the same length.
    """
    encoding = tokenizer(example, padding="max_length", truncation=True)
    return encoding['input_ids'], encoding['attention_mask']


# Tokenize every row, then split the (ids, mask) pairs into two columns.
input_ids_col, attention_mask_col = zip(*df['text'].map(tokenize_function))
df['input_ids'] = input_ids_col
df['attention_mask'] = attention_mask_col


In [6]:
# Preview the first rows to check the new input_ids / attention_mask columns.
df.head()

Unnamed: 0,sentiment,text,input_ids,attention_mask
269747,0,purchased router reading reviews well first 3 ...,"[101, 4156, 2799, 2099, 3752, 4391, 2092, 2034...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
143099,1,stories interesting study human condition iron...,"[101, 3441, 5875, 2817, 2529, 4650, 19728, 308...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
295835,1,good video get started good exercise routine d...,"[101, 2204, 2678, 2131, 2318, 2204, 6912, 9410...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
176493,1,like game lot better nba get last year thats m...,"[101, 2066, 2208, 2843, 2488, 6452, 2131, 2197...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
256705,1,well written easy understand book subjects pro...,"[101, 2092, 2517, 3733, 3305, 2338, 5739, 6011...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [7]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that serves rows of a tokenized DataFrame as tensors.

    Each item is a dict with the keys ``'input_ids'``, ``'attention_mask'``
    and ``'labels'`` (taken from the ``'sentiment'`` column). Every tensor is
    moved to the notebook-level ``device`` before it is returned.
    """

    def __init__(self, dataframe):
        # Keep a reference only; rows are converted lazily in __getitem__.
        self.df = dataframe

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # Output key -> source column, in the order the keys are emitted.
        column_for_key = {
            'input_ids': 'input_ids',
            'attention_mask': 'attention_mask',
            'labels': 'sentiment',
        }
        return {
            key: torch.tensor(row[column]).to(device)
            for key, column in column_for_key.items()
        }
# Wrap the tokenized DataFrame so it can be fed to a DataLoader.
test_set = Dataset(df)

In [8]:
def evaluate(model, loader=None):
    """Run ``model`` over a loader of batches and report the F1 score.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            that returns one score per class for each row of the batch.
        loader: iterable of batches, each a dict holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors. When omitted, the
            notebook-level ``test_loader`` is used.

    Returns:
        The value computed by ``metrics.f1_score`` over all batches. The same
        value is also printed as ``F1: <value>``.
    """
    if loader is None:
        loader = test_loader

    # Storage for the predictions and the ground-truth labels.
    eval_outputs = []
    true_labels = []

    # Switch the model to evaluation mode.
    model.eval()

    # Feed batches to whichever device holds the model weights, so the loop
    # also works when the loader yields CPU tensors (.to() is a no-op when
    # the tensor is already there).
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    ########### Evaluation Loop #############
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader)):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            if target_device is not None:
                input_ids = input_ids.to(target_device)
                attention_mask = attention_mask.to(target_device)

            scores = model(input_ids=input_ids, attention_mask=attention_mask)

            # Predicted class = index of the highest score in each row.
            predictions = torch.argmax(scores, dim=1)

            # Copy to CPU before converting to plain Python lists.
            eval_outputs.extend(predictions.cpu().tolist())
            true_labels.extend(batch['labels'].cpu().tolist())

    f1 = metrics.f1_score(true_labels, eval_outputs)
    print("F1: {}".format(f1), end=" ")
    return f1

In [9]:
class Classifier(nn.Module):
    """DistilBERT encoder followed by a single linear classification head.

    Args:
        out_feat: number of classes produced by the head.
    """

    def __init__(self, out_feat=2):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # Size the head from the encoder's hidden width (768 for
        # distilbert-base-uncased) rather than hard-coding it.
        self.cls = nn.Linear(self.bert.config.dim, out_feat)

    def forward(self, input_ids, attention_mask):
        """Return per-class probabilities for each sequence in the batch.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: mask passed to the encoder alongside the ids.

        Returns:
            Softmax (over dim 1) of the linear head applied to the hidden
            state of the first token of every sequence.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
        )

        # [batch_dim, token_dim, ...] -> keep only the first token ([CLS]).
        pooled_output = outputs.last_hidden_state[:, 0, :]
        logits = self.cls(pooled_output)
        # NOTE(review): the output is already a probability distribution; if
        # training uses nn.CrossEntropyLoss the softmax is applied twice -
        # confirm against the training notebook.
        return F.softmax(logits, dim=1)

    def freeze_until_layer(self, n):
        """Freeze parameters, in registration order, up to layer ``n``.

        Walks ``named_parameters()`` and sets ``requires_grad = False`` on
        every parameter until the first one whose dotted name contains ``n``
        as a complete path component (e.g. ``n=1`` stops at
        ``...layer.1...``). That parameter and all later ones are left
        unchanged. ``n`` may also be a dotted fragment such as ``'layer.3'``.
        If nothing matches, every parameter is frozen.

        Matching on whole components matters: a plain substring test makes
        ``n=1`` stop at ``...layer.0.ffn.lin1...`` and leaves part of layer 0
        trainable.
        """
        marker = '.{}.'.format(n)
        for name, param in self.named_parameters():
            # Surround the name with dots so the first and last components
            # can match as well.
            if marker in '.{}.'.format(name):
                break

            param.requires_grad = False

    def print_layers(self):
        """Print one line per parameter, then a trainable-weights summary."""
        total_nr_w = 0
        trainable_nr_w = 0
        for name, param in self.named_parameters():
            nr_w = param.numel()
            total_nr_w += nr_w
            if param.requires_grad:
                trainable_nr_w += nr_w
            print('{}\t{}\t\t\t{}'.format(param.requires_grad, nr_w, name))
        # Guard against a module without parameters (division by zero).
        trainable_pct = trainable_nr_w / total_nr_w * 100 if total_nr_w else 0.0
        print('The network has {} parameters, out of which {} ({:.1f}%) are trainable.'.format(total_nr_w, trainable_nr_w, trainable_pct))

# Build the classifier with two output classes; this loads the pretrained
# 'distilbert-base-uncased' encoder.
model = Classifier(2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Replace the freshly built model with the object saved in the checkpoint.
# map_location places the weights on the device selected earlier, so a
# checkpoint saved on a GPU also loads on a CPU-only machine and the model
# ends up on the same device as the batches.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model = torch.load("classfication_model.pt", map_location=device)

In [11]:
# Batch the test set in its stored order (no shuffling), 32 items per batch.
test_loader = torch.utils.data.DataLoader(
    dataset=test_set,
    batch_size=32,
    shuffle=False,
)


In [12]:
# Score the loaded model on the test loader; prints the F1 value.
evaluate(model)

100%|█████████████████████████████████████████| 313/313 [06:29<00:00,  1.24s/it]

F1: 0.8159777424483307 


