In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

In [None]:
# Load the labelled reviews and reduce them to the two columns used below:
#   review - the raw text (copied from the 'text' column)
#   label  - 1 where the original label equals 1, otherwise 0
df = (
    pd.read_excel('datasetlabelled.xlsx')
    .assign(
        label=lambda frame: frame["label"].apply(lambda score: 1 if score == 1 else 0),
        review=lambda frame: frame["text"],
    )
    [["review", "label"]]
)
# Pull out the underlying numpy arrays for splitting and training.
reviews, labels = df['review'].values, df['label'].values

In [None]:
# Hold out 30% of the samples for validation. The seed is fixed so that the
# split (and therefore every metric reported below) is reproducible after a
# kernel restart; without it each run trains and evaluates on different rows.
SPLIT_SEED = 42
train_reviews, val_reviews, train_labels, val_labels = train_test_split(
    reviews,
    labels,
    test_size=.3,
    random_state=SPLIT_SEED,
)

In [None]:
# Name of the pretrained checkpoint; the same name is reused further down to
# load the classification model so tokenizer and model stay in sync.
checkpoint = "distilbert-base-uncased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
)

In [None]:
def tokenize_dataset(reviews):
    """Tokenize a collection of review texts with the module-level ``tokenizer``.

    Args:
        reviews: The texts to encode. A Python list of strings is passed
            through unchanged; any object exposing ``tolist()`` (NumPy array,
            pandas Series) is converted to a plain list first, because the
            tokenizer expects a list rather than an array.

    Returns:
        The ``data`` attribute of the tokenizer output: a dict of NumPy arrays
        (``return_tensors='np'``). Presumably it holds ``input_ids`` and
        ``attention_mask`` - the exact keys depend on the tokenizer.
    """
    # Accept array-likes directly so callers do not have to remember .tolist().
    if hasattr(reviews, "tolist"):
        reviews = reviews.tolist()
    encoded = tokenizer(
        reviews,
        padding=True,      # pad every sequence to the longest one in this call
        truncation=True,   # cut sequences that exceed the model's maximum length
        return_tensors='np',
    )
    return encoded.data
# The splits are numpy arrays, but the tokenizer expects a list of strings,
# so each split is converted with .tolist() before it is tokenized.
tokenized_datasets = {
    split_name: tokenize_dataset(split_reviews.tolist())
    for split_name, split_reviews in (
        ("train", train_reviews),
        ("validation", val_reviews),
    )
}

In [None]:
batch_size = 8
num_epochs = 5
# Keras runs ceil(n / batch_size) optimizer steps per epoch because the final,
# partially filled batch is trained on as well. Floor division undercounts the
# steps, so the schedule would reach 0 early and the last updates of training
# would run with a learning rate of 0.
steps_per_epoch = (len(train_reviews) + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * num_epochs

# Decay the learning rate from 1e-5 to 0 over the course of training.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=1e-5,
    end_learning_rate=0.,
    decay_steps=num_train_steps
)
# The optimizer is Adam driven by the learning-rate schedule above.
opt = Adam(learning_rate=lr_scheduler)

# Use the pretrained model from the same checkpoint as the tokenizer.
# num_labels is 2 because this is a binary classification problem
# (positive vs. negative).
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# The model returns logits (not probabilities), so the loss must compute the
# cross entropy from logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Compile the model and monitor the accuracy.
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

In [None]:
# Fine-tune the model on the training split and evaluate on the validation
# split after every epoch; the returned object is kept in `history`.
history = model.fit(
    x=tokenized_datasets['train'],
    y=train_labels,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(tokenized_datasets['validation'], val_labels),
)
# End of the training part for the algorithm in charge of detecting the
# sentiment towards the product.

In [None]:
# Smoke test: classify a few hand-written reviews with the fine-tuned model.
Liste = ['It is bad',' I  like this laptop','I REGRET BUYING THIS LAPTOP']
tokenized_inputs2 = tokenize_dataset(Liste)
tf_output2 = model.predict(tokenized_inputs2)

# The model outputs logits; softmax converts them to per-class probabilities.
tf_prediction2 = tf.nn.softmax(tf_output2.logits, axis=1)
# Display names for the classes. Note the order: class id 0 maps to
# labels2[1] ('negative'), any other class id maps to labels2[0].
labels2 = ['Positive','negative']
# Most probable class id for each input review.
label2 = tf.argmax(tf_prediction2, axis=1)
# Convert the tensor to numpy once instead of comparing tensor elements
# one by one inside a Python loop.
liste2 = [
    labels2[1] if class_id == 0 else labels2[0]
    for class_id in label2.numpy()
]

print(liste2)