In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm import tqdm

import torch
import torch.nn.functional as F
import torch.nn as nn
import tensorflow as tf
from transformers import AutoTokenizer, DistilBertModel
from transformers import AdamW
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification




In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if torch.cuda.is_available():
  for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(i))
else:
  print("You are running on CPU")

NVIDIA GeForce GTX 1050 Ti


In [3]:
device=torch.device("cpu")

In [4]:
df = pd.read_csv("../Datasets/Stem-Cuvinte-Eliminate/train-punct-stop-stem-200.csv")
df=df.dropna()

In [5]:
df.head()

Unnamed: 0,sentiment,text
0,2,sound track beauti paint mind well would recom...
1,2,im read lot review say best game soundtrack fi...
2,2,soundtrack favorit music time hand intens sad ...
3,2,truli like soundtrack enjoy video game music p...
4,2,youv play game know divin music everi singl so...


In [6]:
df['sentiment'] = df['sentiment'].replace(2,1)

In [7]:
df_poz=df[df['sentiment'] == 1]
df_neg=df[df['sentiment'] == 0]

In [8]:
df_poz=df_poz.sample(10000)
df_neg=df_neg.sample(12000)
df = pd.concat([df_poz,df_neg])

In [9]:
df.head()

Unnamed: 0,sentiment,text
1554501,1,give credit rabbi tri answer question god good...
2293,1,would good time america wake embrac tremend ta...
559246,1,book provid exampl graph help demonstr lot det...
861796,1,ye im not exagger show ultim diva put lot effo...
1004966,1,love fresh butter caraf great hand would recom...


In [10]:
BATCH_SIZE = 16
N_EPOCHS = 2 

In [11]:
X_train =df[2000:].text
X_test =df[:2000].text
y_train = df[2000:].sentiment
y_test = df[:2000].sentiment 

In [12]:
X_test

1554501    give credit rabbi tri answer question god good...
2293       would good time america wake embrac tremend ta...
559246     book provid exampl graph help demonstr lot det...
861796     ye im not exagger show ultim diva put lot effo...
1004966    love fresh butter caraf great hand would recom...
                                 ...                        
21203      ive rush fan sinc 12 im 42 watch bluray last n...
1137211    game excel graphic great game difficulti level...
308097     found book interest elev though sometim langua...
88403      moistur great light absorb quickli not greasi ...
663149     excel book must read women import recommend na...
Name: text, Length: 2000, dtype: object

In [13]:
MAX_LEN = X_train.apply(lambda s: len([x for x in s.split()])).max()
MAX_LEN

105

In [14]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

#tokenize the text (padding to max sequence in batch)
train_encodings = tokenizer(list(X_train.values), truncation=True, padding="max_length", max_length=128)
test_encodings = tokenizer(list(X_test.values), truncation=True, padding="max_length", max_length=128)

#print the first paragraph and it transformation
print(f'First paragraph: \'{X_train[:1]}\'')
print(f'Input ids: {train_encodings["input_ids"][0]}')
print(f'Attention mask: {train_encodings["attention_mask"][0]}')


First paragraph: '579674    great product keep motorcycl batteri warm juic...
Name: text, dtype: object'
Input ids: [101, 2307, 4031, 2562, 5013, 5666, 20464, 23801, 2072, 4010, 18414, 2594, 3191, 2072, 2175, 3147, 3467, 3204, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [15]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
                                                    list(y_train.values)))

test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                                    list(y_test.values)))

2022-02-22 17:41:47.642512: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-22 17:41:47.649884: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-22 17:41:47.650401: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-22 17:41:47.651474: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [None]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5)
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # Computes the crossentropy loss between the labels and predictions. 
model.compile(optimizer=optimizerr,                                     
              loss=losss,
              metrics=['accuracy'])



In [16]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.load_weights('./checkpoints-7000/my_checkpoint')
optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5)
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # Computes the crossentropy loss between the labels and predictions. 
model.compile(optimizer=optimizerr,                                     
              loss=losss,
              metrics=['accuracy'])


2022-02-22 17:41:58.447112: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint 

In [17]:
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE), 
          epochs=N_EPOCHS,
          batch_size=BATCH_SIZE,
          validation_data=(test_dataset.shuffle(len(X_test)).batch(BATCH_SIZE))
          )

Epoch 1/2
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Epoch 2/2


<keras.callbacks.History at 0x7f2285eec2e0>

In [None]:
model.evaluate(test_dataset.shuffle(len(X_test)).batch(BATCH_SIZE), return_dict=True, batch_size=BATCH_SIZE)

In [None]:
def predict_proba(text_list, model, tokenizer):
  """
  To get array with predicted probabilities for 0 - instructions, 1- ingredients classes 
  for each paragraph in the list of strings
  :param text_list: list[str]
  :param model: transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification
  :param tokenizer: transformers.models.distilbert.tokenization_distilbert.DistilBertTokenizer
  :return res: numpy.ndarray
  """
     
  encodings = tokenizer(text_list, max_length=MAX_LEN, truncation=True, padding=True)
  dataset = tf.data.Dataset.from_tensor_slices((dict(encodings))) 
  preds = model.predict(dataset.batch(1)).logits
  res = tf.nn.softmax(preds, axis=1).numpy()
    
  return res

In [None]:
string1 = ["this is good"]
predict_proba(string1, model, tokenizer)

In [None]:
model.save_weights('./checkpoints-150000/my_checkpoint')

In [None]:
model.freeze_until_layer(5)