In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm import tqdm

import torch
import torch.nn.functional as F
import torch.nn as nn
import tensorflow as tf
from transformers import AutoTokenizer, DistilBertModel
from transformers import AdamW
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification




In [None]:
# Select the PyTorch device: CUDA when a GPU is visible, CPU otherwise.
# On GPU machines the name of every visible CUDA device is printed.
if not torch.cuda.is_available():
  device = torch.device("cpu")
  print("You are running on CPU")
else:
  device = torch.device("cuda")
  for gpu_index in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(gpu_index))

In [None]:
# Override whatever device was detected in the previous cell and force the CPU.
# NOTE(review): none of the later cells visible in this notebook read `device`
# (training goes through TensorFlow/Keras) — confirm whether this is still needed.
device=torch.device("cpu")

In [None]:
# Load the pre-processed training CSV and discard every row that contains a missing value.
df = pd.read_csv(
    "../Datasets/Stem-Cuvinte-Eliminate/train-punct-stop-stem-200.csv"
).dropna()

In [None]:
# Preview the first rows of the loaded frame (after dropping rows with missing values).
df.head()

In [None]:
# Collapse label 2 into label 1 so that only the values 0 and 1 remain for the
# binary task set up below.
df['sentiment'] = df['sentiment'].replace({2: 1})

In [None]:
# One sub-frame per class: rows labelled 1 and rows labelled 0.
df_poz = df.loc[df['sentiment'].eq(1)]
df_neg = df.loc[df['sentiment'].eq(0)]

In [None]:
# Draw a balanced subset: 1000 rows per class, concatenated as label-1 rows
# followed by label-0 rows (the result is therefore ordered by class).
# A fixed random_state makes the draw reproducible after a kernel restart.
df_poz = df_poz.sample(1000, random_state=42)
df_neg = df_neg.sample(1000, random_state=42)
df = pd.concat([df_poz, df_neg])

In [None]:
# Preview the first rows of the balanced sample.
df.head()

In [None]:
# Training hyper-parameters.
# BATCH_SIZE: number of examples per batch when the tf.data datasets are batched.
BATCH_SIZE = 16
# N_EPOCHS: number of epochs passed to model.fit.
N_EPOCHS = 10 

In [None]:
# Hold out 20% of the balanced sample for evaluation.
# A shuffled, stratified split is required here: `df` is ordered by class
# (all label-1 rows first) and has only 2000 rows, so positional slicing such as
# df[2000:] yields an empty training set and single-class partitions.
# random_state pins the split so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.text,
    df.sentiment,
    test_size=0.2,
    stratify=df.sentiment,
    random_state=42,
)

In [None]:
# Display the held-out texts (pandas abbreviates long Series in the output).
X_test

In [None]:
# Length of the longest training text, counted in whitespace-separated words
# (this is a word count, not a tokenizer token count).
MAX_LEN = X_train.map(lambda text: len(text.split())).max()
MAX_LEN

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the texts. padding="max_length" together with max_length=128 means every
# sequence is truncated or padded to exactly 128 tokens, so all rows share one length.
train_encodings = tokenizer(list(X_train.values), truncation=True, padding="max_length", max_length=128)
test_encodings = tokenizer(list(X_test.values), truncation=True, padding="max_length", max_length=128)

# Print the first training paragraph and its encoded form.
# .iloc[0] gives the raw string; slicing (X_train[:1]) would print the Series repr
# (index, name and dtype) instead of the text.
print(f'First paragraph: \'{X_train.iloc[0]}\'')
print(f'Input ids: {train_encodings["input_ids"][0]}')
print(f'Attention mask: {train_encodings["attention_mask"][0]}')


In [None]:
def _as_tf_dataset(encodings, labels):
  """Pair tokenizer output with its labels as a tf.data.Dataset of (features, label) elements."""
  features = dict(encodings)
  targets = list(labels.values)
  return tf.data.Dataset.from_tensor_slices((features, targets))


# Unbatched datasets; batching (and shuffling) is applied where they are consumed.
train_dataset = _as_tf_dataset(train_encodings, y_train)
test_dataset = _as_tf_dataset(test_encodings, y_test)

In [None]:
# Loss: sparse categorical cross-entropy on integer labels; from_logits=True tells Keras
# the model outputs are raw logits rather than probabilities.
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Optimizer: Adam with a learning rate of 5e-5.
optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Load the pretrained DistilBERT sequence classifier and attach optimizer, loss and
# an accuracy metric.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.compile(
    loss=losss,
    optimizer=optimizerr,
    metrics=['accuracy'],
)



In [None]:
# Train on shuffled mini-batches and validate on the held-out set after every epoch.
# The training input is an already-batched tf.data.Dataset, so no `batch_size` is
# passed to fit() (Keras expects it to be omitted for dataset inputs), and validation
# uses the prepared `test_dataset` batched the same way instead of raw tokenizer output.
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE),
          epochs=N_EPOCHS,
          validation_data=test_dataset.batch(BATCH_SIZE)
          )

In [None]:
# Evaluate on the held-out set and return the metrics as a dict.
# The dataset is batched here, so `batch_size` is not passed to evaluate();
# shuffling is unnecessary for evaluation and is omitted.
model.evaluate(test_dataset.batch(BATCH_SIZE), return_dict=True)

In [None]:
def predict_proba(text_list, model, tokenizer, max_length=128, batch_size=16):
  """
  Return the predicted class probabilities for each text in the list.

  Column i of the result is the probability of label id i as used during training
  (in this notebook the `sentiment` values 0 and 1).

  :param text_list: list[str], texts to score
  :param model: transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification
  :param tokenizer: transformers.models.distilbert.tokenization_distilbert.DistilBertTokenizer
  :param max_length: int, token limit applied when truncating; the default of 128
      matches the tokenization used for the training data in this notebook
  :param batch_size: int, number of texts per prediction batch
  :return res: numpy.ndarray of shape (len(text_list), n_labels); each row is a softmax
      over the model logits
  """
  # Nothing to score: return an empty, correctly shaped array instead of calling
  # the tokenizer/model with no input.
  if len(text_list) == 0:
    return np.empty((0, model.config.num_labels))

  # padding=True pads to the longest text in this call, so all rows can be batched together.
  encodings = tokenizer(text_list, max_length=max_length, truncation=True, padding=True)
  dataset = tf.data.Dataset.from_tensor_slices(dict(encodings))
  preds = model.predict(dataset.batch(batch_size)).logits
  # Convert logits to probabilities along the class axis.
  res = tf.nn.softmax(preds, axis=1).numpy()

  return res

In [None]:
# Smoke test: score a single short sentence and display the returned probability array.
string1 = ["this is good"]
predict_proba(string1, model, tokenizer)

In [None]:
# Persist the current model weights under ./checkpoints-13000/ with the prefix `my_checkpoint`.
model.save_weights('./checkpoints-13000/my_checkpoint')

In [None]:
def freeze_until_layer(model, n_layers):
  """
  Freeze the embeddings and the first `n_layers` transformer blocks of a
  TFDistilBertForSequenceClassification model.

  The Hugging Face model class does not provide a `freeze_until_layer` method, so the
  freezing is done by switching off the Keras `trainable` flag on the sub-layers.

  :param model: TFDistilBertForSequenceClassification
  :param n_layers: int, number of leading transformer blocks to freeze
  """
  model.distilbert.embeddings.trainable = False
  for block in model.distilbert.transformer.layer[:n_layers]:
    block.trainable = False


# Freeze the embeddings and the first 5 transformer blocks.
# NOTE: call model.compile(...) again before further training so that Keras picks up
# the changed `trainable` flags.
freeze_until_layer(model, 5)